Commit b4636e361ed44bbf7eab45421002655fac557de4 (1 parent: df1e30a9)
feat: init项目
Showing 1 changed file with 29 additions and 7 deletions
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 | 1 | package com.canrd.webmagic.processor; |
2 | 2 | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.canrd.webmagic.common.utils.StringUtils; | |
3 | 6 | import us.codecraft.webmagic.Page; |
4 | 7 | import us.codecraft.webmagic.Site; |
5 | 8 | import us.codecraft.webmagic.Spider; |
... | ... | @@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable; |
9 | 12 | import us.codecraft.webmagic.selector.XpathSelector; |
10 | 13 | |
11 | 14 | import java.util.List; |
15 | +import java.util.Objects; | |
12 | 16 | |
13 | 17 | /** |
14 | 18 | * @author: xms |
... | ... | @@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
30 | 34 | public void process(Page page) { |
31 | 35 | if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { |
32 | 36 | doArticleList(page); |
33 | - } else { | |
37 | + } else { | |
34 | 38 | doArticleContent(page); |
35 | 39 | } |
36 | 40 | |
... | ... | @@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor { |
39 | 43 | private void doArticleContent(Page page) { |
40 | 44 | //解析页面 |
41 | 45 | Html html = page.getHtml(); |
42 | - String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get(); | |
43 | - Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select( | |
44 | - new XpathSelector("li[@class='c-article-author-list__item']")); | |
45 | - List<Selectable> nodes = selectable.nodes(); | |
46 | + String[] urlArr = page.getUrl().get().split("/"); | |
47 | + String articleId = urlArr[urlArr.length - 1]; | |
48 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
49 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
50 | + | |
51 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
52 | + if (StringUtils.isBlank(title)) { | |
53 | + title = headSelectable.xpath("//h1/text()").get(); | |
54 | + } | |
55 | + String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
56 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
57 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
46 | 58 | StringBuffer authorName = new StringBuffer(); |
47 | - for (Selectable node : nodes) { | |
59 | + for (Selectable node : authorNodes) { | |
48 | 60 | authorName.append(node.xpath("//a/text()")); |
49 | 61 | } |
50 | - System.out.println("标题:" + title); | |
62 | + JSONArray array = new JSONArray(); | |
63 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
64 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
65 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
66 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
67 | + JSONObject jsonObject = new JSONObject(); | |
68 | + jsonObject.put("authorEmailName", authorEmailName); | |
69 | + jsonObject.put("email", email); | |
70 | + array.add(jsonObject); | |
71 | + } | |
72 | + System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); | |
51 | 73 | } |
52 | 74 | |
53 | 75 | private void doArticleList(Page page) { | ... | ... |