Commit b4636e361ed44bbf7eab45421002655fac557de4

Authored by 谢茂盛
1 parent df1e30a9

feat: init项目

src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 package com.canrd.webmagic.processor; 1 package com.canrd.webmagic.processor;
2 2
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.canrd.webmagic.common.utils.StringUtils;
3 import us.codecraft.webmagic.Page; 6 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site; 7 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.Spider; 8 import us.codecraft.webmagic.Spider;
@@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable; @@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable;
9 import us.codecraft.webmagic.selector.XpathSelector; 12 import us.codecraft.webmagic.selector.XpathSelector;
10 13
11 import java.util.List; 14 import java.util.List;
  15 +import java.util.Objects;
12 16
13 /** 17 /**
14 * @author: xms 18 * @author: xms
@@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
30 public void process(Page page) { 34 public void process(Page page) {
31 if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { 35 if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) {
32 doArticleList(page); 36 doArticleList(page);
33 - } else { 37 + } else {
34 doArticleContent(page); 38 doArticleContent(page);
35 } 39 }
36 40
@@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor {
39 private void doArticleContent(Page page) { 43 private void doArticleContent(Page page) {
40 //解析页面 44 //解析页面
41 Html html = page.getHtml(); 45 Html html = page.getHtml();
42 - String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get();  
43 - Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select(  
44 - new XpathSelector("li[@class='c-article-author-list__item']"));  
45 - List<Selectable> nodes = selectable.nodes(); 46 + String[] urlArr = page.getUrl().get().split("/");
  47 + String articleId = urlArr[urlArr.length - 1];
  48 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  49 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  50 +
  51 + String title = headSelectable.xpath("//div/h1/text()").get();
  52 + if (StringUtils.isBlank(title)) {
  53 + title = headSelectable.xpath("//h1/text()").get();
  54 + }
  55 + String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  56 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  57 + List<Selectable> authorNodes = authorSelectable.nodes();
46 StringBuffer authorName = new StringBuffer(); 58 StringBuffer authorName = new StringBuffer();
47 - for (Selectable node : nodes) { 59 + for (Selectable node : authorNodes) {
48 authorName.append(node.xpath("//a/text()")); 60 authorName.append(node.xpath("//a/text()"));
49 } 61 }
50 - System.out.println("标题:" + title); 62 + JSONArray array = new JSONArray();
  63 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  64 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  65 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  66 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  67 + JSONObject jsonObject = new JSONObject();
  68 + jsonObject.put("authorEmailName", authorEmailName);
  69 + jsonObject.put("email", email);
  70 + array.add(jsonObject);
  71 + }
  72 + System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString());
51 } 73 }
52 74
53 private void doArticleList(Page page) { 75 private void doArticleList(Page page) {