Commit 463f7c3ef5f37c2b51c225bd5971d4a168457548
1 parent
0c4251f2
feat: nature article爬取
Showing
2 changed files
with
12 additions
and
15 deletions
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -59,8 +59,6 @@ public class NatureSearchPageProcessor implements PageProcessor { |
59 | 59 | */ |
60 | 60 | private void doArticleList4ReSearch(Page page) { |
61 | 61 | String url = page.getUrl().get(); |
62 | - String[] split = url.split("="); | |
63 | - Integer pageIndex = Integer.parseInt(split[split.length - 1]); | |
64 | 62 | /** |
65 | 63 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 |
66 | 64 | * 1、通过$或css()方法获取到该page html下某元素dom |
... | ... | @@ -77,9 +75,8 @@ public class NatureSearchPageProcessor implements PageProcessor { |
77 | 75 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
78 | 76 | String link = node.$("a", "href").get(); |
79 | 77 | page.addTargetRequest(link); |
80 | - String link1 = node.links().get(); | |
81 | 78 | String title = node.$("a", "text").get(); |
82 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | |
79 | + log.info("research文章列表链接:{},标题:{},文章链接:{}", url, title, link); | |
83 | 80 | } |
84 | 81 | } |
85 | 82 | |
... | ... | @@ -89,8 +86,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
89 | 86 | } |
90 | 87 | //解析页面 |
91 | 88 | Html html = page.getHtml(); |
92 | - String[] urlArr = page.getUrl().get().split("/"); | |
93 | - String articleCode = urlArr[urlArr.length - 1]; | |
89 | + String articleCode = page.getUrl().get(); | |
94 | 90 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); |
95 | 91 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); |
96 | 92 | Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); |
... | ... | @@ -104,10 +100,10 @@ public class NatureSearchPageProcessor implements PageProcessor { |
104 | 100 | String publishTime; |
105 | 101 | try { |
106 | 102 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); |
107 | - }catch (Exception e) { | |
103 | + } catch (Exception e) { | |
108 | 104 | try { |
109 | 105 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); |
110 | - }catch (Exception e1) { | |
106 | + } catch (Exception e1) { | |
111 | 107 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); |
112 | 108 | } |
113 | 109 | } |
... | ... | @@ -161,8 +157,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
161 | 157 | jsonObject.put("email", email); |
162 | 158 | authorEmail.add(jsonObject); |
163 | 159 | } |
164 | - System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString()); | |
165 | - | |
160 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
166 | 161 | |
167 | 162 | page.putField("article", ArticleDO.builder() |
168 | 163 | .articleType(ArticleTypeEnum.NATURE.getType()) |
... | ... | @@ -178,8 +173,6 @@ public class NatureSearchPageProcessor implements PageProcessor { |
178 | 173 | |
179 | 174 | private void doArticleList(Page page) { |
180 | 175 | String url = page.getUrl().get(); |
181 | - String[] split = url.split("="); | |
182 | - Integer pageIndex = Integer.parseInt(split[split.length - 1]); | |
183 | 176 | /** |
184 | 177 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 |
185 | 178 | * 1、通过$或css()方法获取到该page html下某元素dom |
... | ... | @@ -196,9 +189,8 @@ public class NatureSearchPageProcessor implements PageProcessor { |
196 | 189 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
197 | 190 | String link = node.$("a", "href").get(); |
198 | 191 | page.addTargetRequest(link); |
199 | - String link1 = node.links().get(); | |
200 | 192 | String title = node.$("a", "text").get(); |
201 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | |
193 | + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | |
202 | 194 | } |
203 | 195 | } |
204 | 196 | ... | ... |
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
... | ... | @@ -3,7 +3,9 @@ package com.canrd.webmagic.utils; |
3 | 3 | import com.alibaba.fastjson.JSON; |
4 | 4 | import com.alibaba.fastjson.JSONArray; |
5 | 5 | import com.alibaba.fastjson.JSONObject; |
6 | +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; | |
6 | 7 | import com.canrd.webmagic.BaseTest; |
8 | +import com.canrd.webmagic.common.utils.StringUtils; | |
7 | 9 | import com.canrd.webmagic.domain.dto.ArticleDO; |
8 | 10 | import com.canrd.webmagic.service.ArticleService; |
9 | 11 | import org.junit.Test; |
... | ... | @@ -24,9 +26,12 @@ public class DateTimeUtilTest extends BaseTest { |
24 | 26 | |
25 | 27 | @Test |
26 | 28 | public void export() { |
27 | - List<ArticleDO> articleDOList = articleService.list(); | |
29 | + List<ArticleDO> articleDOList = articleService.list(new LambdaQueryWrapper<ArticleDO>().select(ArticleDO::getEmailInfo)); | |
28 | 30 | JSONArray array = new JSONArray(); |
29 | 31 | for (ArticleDO articleDO : articleDOList) { |
32 | + if (articleDO == null || StringUtils.isBlank(articleDO.getEmailInfo())) { // a row whose only selected column is NULL may be mapped to a null element | |
33 | + continue; | |
34 | + } | |
30 | 35 | JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); |
31 | 36 | array.addAll(jsonArray); |
32 | 37 | } | ... | ... |