Commit b4636e361ed44bbf7eab45421002655fac557de4
1 parent
df1e30a9
feat: init项目
Showing
1 changed file
with
29 additions
and
7 deletions
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 | package com.canrd.webmagic.processor; | 1 | package com.canrd.webmagic.processor; |
2 | 2 | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
3 | import us.codecraft.webmagic.Page; | 6 | import us.codecraft.webmagic.Page; |
4 | import us.codecraft.webmagic.Site; | 7 | import us.codecraft.webmagic.Site; |
5 | import us.codecraft.webmagic.Spider; | 8 | import us.codecraft.webmagic.Spider; |
@@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable; | @@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable; | ||
9 | import us.codecraft.webmagic.selector.XpathSelector; | 12 | import us.codecraft.webmagic.selector.XpathSelector; |
10 | 13 | ||
11 | import java.util.List; | 14 | import java.util.List; |
15 | +import java.util.Objects; | ||
12 | 16 | ||
13 | /** | 17 | /** |
14 | * @author: xms | 18 | * @author: xms |
@@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
30 | public void process(Page page) { | 34 | public void process(Page page) { |
31 | if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { | 35 | if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { |
32 | doArticleList(page); | 36 | doArticleList(page); |
33 | - } else { | 37 | + } else { |
34 | doArticleContent(page); | 38 | doArticleContent(page); |
35 | } | 39 | } |
36 | 40 | ||
@@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
39 | private void doArticleContent(Page page) { | 43 | private void doArticleContent(Page page) { |
40 | //解析页面 | 44 | //解析页面 |
41 | Html html = page.getHtml(); | 45 | Html html = page.getHtml(); |
42 | - String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get(); | ||
43 | - Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select( | ||
44 | - new XpathSelector("li[@class='c-article-author-list__item']")); | ||
45 | - List<Selectable> nodes = selectable.nodes(); | 46 | + String[] urlArr = page.getUrl().get().split("/"); |
47 | + String articleId = urlArr[urlArr.length - 1]; | ||
48 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
49 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
50 | + | ||
51 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
52 | + if (StringUtils.isBlank(title)) { | ||
53 | + title = headSelectable.xpath("//h1/text()").get(); | ||
54 | + } | ||
55 | + String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
56 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
57 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
46 | StringBuffer authorName = new StringBuffer(); | 58 | StringBuffer authorName = new StringBuffer(); |
47 | - for (Selectable node : nodes) { | 59 | + for (Selectable node : authorNodes) { |
48 | authorName.append(node.xpath("//a/text()")); | 60 | authorName.append(node.xpath("//a/text()")); |
49 | } | 61 | } |
50 | - System.out.println("标题:" + title); | 62 | + JSONArray array = new JSONArray(); |
63 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
64 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
65 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
66 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
67 | + JSONObject jsonObject = new JSONObject(); | ||
68 | + jsonObject.put("authorEmailName", authorEmailName); | ||
69 | + jsonObject.put("email", email); | ||
70 | + array.add(jsonObject); | ||
71 | + } | ||
72 | + System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); | ||
51 | } | 73 | } |
52 | 74 | ||
53 | private void doArticleList(Page page) { | 75 | private void doArticleList(Page page) { |