Commit 463f7c3ef5f37c2b51c225bd5971d4a168457548

Authored by 谢茂盛
1 parent 0c4251f2

feat: nature article爬取

src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
59 */ 59 */
60 private void doArticleList4ReSearch(Page page) { 60 private void doArticleList4ReSearch(Page page) {
61 String url = page.getUrl().get(); 61 String url = page.getUrl().get();
62 - String[] split = url.split("=");  
63 - Integer pageIndex = Integer.parseInt(split[split.length - 1]);  
64 /** 62 /**
65 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 63 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
66 * 1、通过$或css()方法获取到该page html下某元素dom 64 * 1、通过$或css()方法获取到该page html下某元素dom
@@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor {
77 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); 75 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
78 String link = node.$("a", "href").get(); 76 String link = node.$("a", "href").get();
79 page.addTargetRequest(link); 77 page.addTargetRequest(link);
80 - String link1 = node.links().get();  
81 String title = node.$("a", "text").get(); 78 String title = node.$("a", "text").get();
82 - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); 79 + log.info("research文章列表链接:{},标题:{},文章链接:{}", url, title, link);
83 } 80 }
84 } 81 }
85 82
@@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
89 } 86 }
90 //解析页面 87 //解析页面
91 Html html = page.getHtml(); 88 Html html = page.getHtml();
92 - String[] urlArr = page.getUrl().get().split("/");  
93 - String articleCode = urlArr[urlArr.length - 1]; 89 + String articleCode = page.getUrl().get();
94 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); 90 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
95 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); 91 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
96 Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); 92 Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
@@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor {
104 String publishTime; 100 String publishTime;
105 try { 101 try {
106 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); 102 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
107 - }catch (Exception e) { 103 + } catch (Exception e) {
108 try { 104 try {
109 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); 105 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
110 - }catch (Exception e1) { 106 + } catch (Exception e1) {
111 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); 107 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
112 } 108 }
113 } 109 }
@@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
161 jsonObject.put("email", email); 157 jsonObject.put("email", email);
162 authorEmail.add(jsonObject); 158 authorEmail.add(jsonObject);
163 } 159 }
164 - System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString());  
165 - 160 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString());
166 161
167 page.putField("article", ArticleDO.builder() 162 page.putField("article", ArticleDO.builder()
168 .articleType(ArticleTypeEnum.NATURE.getType()) 163 .articleType(ArticleTypeEnum.NATURE.getType())
@@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
178 173
179 private void doArticleList(Page page) { 174 private void doArticleList(Page page) {
180 String url = page.getUrl().get(); 175 String url = page.getUrl().get();
181 - String[] split = url.split("=");  
182 - Integer pageIndex = Integer.parseInt(split[split.length - 1]);  
183 /** 176 /**
184 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 177 * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
185 * 1、通过$或css()方法获取到该page html下某元素dom 178 * 1、通过$或css()方法获取到该page html下某元素dom
@@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor {
196 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); 189 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
197 String link = node.$("a", "href").get(); 190 String link = node.$("a", "href").get();
198 page.addTargetRequest(link); 191 page.addTargetRequest(link);
199 - String link1 = node.links().get();  
200 String title = node.$("a", "text").get(); 192 String title = node.$("a", "text").get();
201 - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); 193 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
202 } 194 }
203 } 195 }
204 196
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
@@ -3,7 +3,9 @@ package com.canrd.webmagic.utils; @@ -3,7 +3,9 @@ package com.canrd.webmagic.utils;
3 import com.alibaba.fastjson.JSON; 3 import com.alibaba.fastjson.JSON;
4 import com.alibaba.fastjson.JSONArray; 4 import com.alibaba.fastjson.JSONArray;
5 import com.alibaba.fastjson.JSONObject; 5 import com.alibaba.fastjson.JSONObject;
  6 +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
6 import com.canrd.webmagic.BaseTest; 7 import com.canrd.webmagic.BaseTest;
  8 +import com.canrd.webmagic.common.utils.StringUtils;
7 import com.canrd.webmagic.domain.dto.ArticleDO; 9 import com.canrd.webmagic.domain.dto.ArticleDO;
8 import com.canrd.webmagic.service.ArticleService; 10 import com.canrd.webmagic.service.ArticleService;
9 import org.junit.Test; 11 import org.junit.Test;
@@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest { @@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest {
24 26
25 @Test 27 @Test
26 public void export() { 28 public void export() {
27 - List<ArticleDO> articleDOList = articleService.list(); 29 + List<ArticleDO> articleDOList = articleService.list(new LambdaQueryWrapper<ArticleDO>().select(ArticleDO::getEmailInfo));
28 JSONArray array = new JSONArray(); 30 JSONArray array = new JSONArray();
29 for (ArticleDO articleDO : articleDOList) { 31 for (ArticleDO articleDO : articleDOList) {
  32 + if (StringUtils.isBlank(articleDO.getEmailInfo())) {
  33 + continue;
  34 + }
30 JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); 35 JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo());
31 array.addAll(jsonArray); 36 array.addAll(jsonArray);
32 } 37 }