Commit 463f7c3ef5f37c2b51c225bd5971d4a168457548

Authored by 谢茂盛
1 parent 0c4251f2

feat: nature article爬取

src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
59 59 */
60 60 private void doArticleList4ReSearch(Page page) {
61 61 String url = page.getUrl().get();
62   - String[] split = url.split("=");
63   - Integer pageIndex = Integer.parseInt(split[split.length - 1]);
64 62 /**
65 63 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
66 64 * 1、通过$或css()方法获取到该page html下某元素dom
... ... @@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor {
77 75 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
78 76 String link = node.$("a", "href").get();
79 77 page.addTargetRequest(link);
80   - String link1 = node.links().get();
81 78 String title = node.$("a", "text").get();
82   - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
  79 + log.info("research文章列表链接:{},标题:{},文章链接:{}", url, title, link);
83 80 }
84 81 }
85 82  
... ... @@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
89 86 }
90 87 //解析页面
91 88 Html html = page.getHtml();
92   - String[] urlArr = page.getUrl().get().split("/");
93   - String articleCode = urlArr[urlArr.length - 1];
  89 + String articleCode = page.getUrl().get();
94 90 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
95 91 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
96 92 Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
... ... @@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor {
104 100 String publishTime;
105 101 try {
106 102 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
107   - }catch (Exception e) {
  103 + } catch (Exception e) {
108 104 try {
109 105 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
110   - }catch (Exception e1) {
  106 + } catch (Exception e1) {
111 107 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
112 108 }
113 109 }
... ... @@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
161 157 jsonObject.put("email", email);
162 158 authorEmail.add(jsonObject);
163 159 }
164   - System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString());
165   -
  160 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString());
166 161  
167 162 page.putField("article", ArticleDO.builder()
168 163 .articleType(ArticleTypeEnum.NATURE.getType())
... ... @@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
178 173  
179 174 private void doArticleList(Page page) {
180 175 String url = page.getUrl().get();
181   - String[] split = url.split("=");
182   - Integer pageIndex = Integer.parseInt(split[split.length - 1]);
183 176 /**
184 177 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
185 178 * 1、通过$或css()方法获取到该page html下某元素dom
... ... @@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor {
196 189 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
197 190 String link = node.$("a", "href").get();
198 191 page.addTargetRequest(link);
199   - String link1 = node.links().get();
200 192 String title = node.$("a", "text").get();
201   - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
  193 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
202 194 }
203 195 }
204 196  
... ...
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
... ... @@ -3,7 +3,9 @@ package com.canrd.webmagic.utils;
3 3 import com.alibaba.fastjson.JSON;
4 4 import com.alibaba.fastjson.JSONArray;
5 5 import com.alibaba.fastjson.JSONObject;
  6 +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
6 7 import com.canrd.webmagic.BaseTest;
  8 +import com.canrd.webmagic.common.utils.StringUtils;
7 9 import com.canrd.webmagic.domain.dto.ArticleDO;
8 10 import com.canrd.webmagic.service.ArticleService;
9 11 import org.junit.Test;
... ... @@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest {
24 26  
25 27 @Test
26 28 public void export() {
27   - List<ArticleDO> articleDOList = articleService.list();
  29 + List<ArticleDO> articleDOList = articleService.list(new LambdaQueryWrapper<ArticleDO>().select(ArticleDO::getEmailInfo));
28 30 JSONArray array = new JSONArray();
29 31 for (ArticleDO articleDO : articleDOList) {
  32 + if (StringUtils.isBlank(articleDO.getEmailInfo())) {
  33 + continue;
  34 + }
30 35 JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo());
31 36 array.addAll(jsonArray);
32 37 }
... ...