Commit 463f7c3ef5f37c2b51c225bd5971d4a168457548
1 parent
0c4251f2
feat: nature article爬取
Showing
2 changed files
with
12 additions
and
15 deletions
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
59 | */ | 59 | */ |
60 | private void doArticleList4ReSearch(Page page) { | 60 | private void doArticleList4ReSearch(Page page) { |
61 | String url = page.getUrl().get(); | 61 | String url = page.getUrl().get(); |
62 | - String[] split = url.split("="); | ||
63 | - Integer pageIndex = Integer.parseInt(split[split.length - 1]); | ||
64 | /** | 62 | /** |
65 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 | 63 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 |
66 | * 1、通过$或css()方法获取到该page html下某元素dom | 64 | * 1、通过$或css()方法获取到该page html下某元素dom |
@@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
77 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); | 75 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
78 | String link = node.$("a", "href").get(); | 76 | String link = node.$("a", "href").get(); |
79 | page.addTargetRequest(link); | 77 | page.addTargetRequest(link); |
80 | - String link1 = node.links().get(); | ||
81 | String title = node.$("a", "text").get(); | 78 | String title = node.$("a", "text").get(); |
82 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | 79 | + log.info("research文章列表链接:{},标题:{},文章链接:{}", url, title, link); |
83 | } | 80 | } |
84 | } | 81 | } |
85 | 82 | ||
@@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
89 | } | 86 | } |
90 | //解析页面 | 87 | //解析页面 |
91 | Html html = page.getHtml(); | 88 | Html html = page.getHtml(); |
92 | - String[] urlArr = page.getUrl().get().split("/"); | ||
93 | - String articleCode = urlArr[urlArr.length - 1]; | 89 | + String articleCode = page.getUrl().get(); |
94 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | 90 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); |
95 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | 91 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); |
96 | Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | 92 | Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); |
@@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
104 | String publishTime; | 100 | String publishTime; |
105 | try { | 101 | try { |
106 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | 102 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); |
107 | - }catch (Exception e) { | 103 | + } catch (Exception e) { |
108 | try { | 104 | try { |
109 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | 105 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); |
110 | - }catch (Exception e1) { | 106 | + } catch (Exception e1) { |
111 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | 107 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); |
112 | } | 108 | } |
113 | } | 109 | } |
@@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
161 | jsonObject.put("email", email); | 157 | jsonObject.put("email", email); |
162 | authorEmail.add(jsonObject); | 158 | authorEmail.add(jsonObject); |
163 | } | 159 | } |
164 | - System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString()); | ||
165 | - | 160 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString()); |
166 | 161 | ||
167 | page.putField("article", ArticleDO.builder() | 162 | page.putField("article", ArticleDO.builder() |
168 | .articleType(ArticleTypeEnum.NATURE.getType()) | 163 | .articleType(ArticleTypeEnum.NATURE.getType()) |
@@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
178 | 173 | ||
179 | private void doArticleList(Page page) { | 174 | private void doArticleList(Page page) { |
180 | String url = page.getUrl().get(); | 175 | String url = page.getUrl().get(); |
181 | - String[] split = url.split("="); | ||
182 | - Integer pageIndex = Integer.parseInt(split[split.length - 1]); | ||
183 | /** | 176 | /** |
184 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 | 177 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 |
185 | * 1、通过$或css()方法获取到该page html下某元素dom | 178 | * 1、通过$或css()方法获取到该page html下某元素dom |
@@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
196 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); | 189 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
197 | String link = node.$("a", "href").get(); | 190 | String link = node.$("a", "href").get(); |
198 | page.addTargetRequest(link); | 191 | page.addTargetRequest(link); |
199 | - String link1 = node.links().get(); | ||
200 | String title = node.$("a", "text").get(); | 192 | String title = node.$("a", "text").get(); |
201 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | 193 | + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); |
202 | } | 194 | } |
203 | } | 195 | } |
204 | 196 |
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
@@ -3,7 +3,9 @@ package com.canrd.webmagic.utils; | @@ -3,7 +3,9 @@ package com.canrd.webmagic.utils; | ||
3 | import com.alibaba.fastjson.JSON; | 3 | import com.alibaba.fastjson.JSON; |
4 | import com.alibaba.fastjson.JSONArray; | 4 | import com.alibaba.fastjson.JSONArray; |
5 | import com.alibaba.fastjson.JSONObject; | 5 | import com.alibaba.fastjson.JSONObject; |
6 | +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; | ||
6 | import com.canrd.webmagic.BaseTest; | 7 | import com.canrd.webmagic.BaseTest; |
8 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
7 | import com.canrd.webmagic.domain.dto.ArticleDO; | 9 | import com.canrd.webmagic.domain.dto.ArticleDO; |
8 | import com.canrd.webmagic.service.ArticleService; | 10 | import com.canrd.webmagic.service.ArticleService; |
9 | import org.junit.Test; | 11 | import org.junit.Test; |
@@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest { | @@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest { | ||
24 | 26 | ||
25 | @Test | 27 | @Test |
26 | public void export() { | 28 | public void export() { |
27 | - List<ArticleDO> articleDOList = articleService.list(); | 29 | + List<ArticleDO> articleDOList = articleService.list(new LambdaQueryWrapper<ArticleDO>().select(ArticleDO::getEmailInfo)); |
28 | JSONArray array = new JSONArray(); | 30 | JSONArray array = new JSONArray(); |
29 | for (ArticleDO articleDO : articleDOList) { | 31 | for (ArticleDO articleDO : articleDOList) { |
32 | + if (StringUtils.isBlank(articleDO.getEmailInfo())) { | ||
33 | + continue; | ||
34 | + } | ||
30 | JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); | 35 | JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); |
31 | array.addAll(jsonArray); | 36 | array.addAll(jsonArray); |
32 | } | 37 | } |