Commit b4636e361ed44bbf7eab45421002655fac557de4

Authored by 谢茂盛
1 parent df1e30a9

feat: init项目

src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 1 package com.canrd.webmagic.processor;
2 2  
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.canrd.webmagic.common.utils.StringUtils;
3 6 import us.codecraft.webmagic.Page;
4 7 import us.codecraft.webmagic.Site;
5 8 import us.codecraft.webmagic.Spider;
... ... @@ -9,6 +12,7 @@ import us.codecraft.webmagic.selector.Selectable;
9 12 import us.codecraft.webmagic.selector.XpathSelector;
10 13  
11 14 import java.util.List;
  15 +import java.util.Objects;
12 16  
13 17 /**
14 18 * @author: xms
... ... @@ -30,7 +34,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
30 34 public void process(Page page) {
31 35 if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) {
32 36 doArticleList(page);
33   - } else {
  37 + } else {
34 38 doArticleContent(page);
35 39 }
36 40  
... ... @@ -39,15 +43,33 @@ public class NatureSearchPageProcessor implements PageProcessor {
39 43 private void doArticleContent(Page page) {
40 44 //解析页面
41 45 Html html = page.getHtml();
42   - String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get();
43   - Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select(
44   - new XpathSelector("li[@class='c-article-author-list__item']"));
45   - List<Selectable> nodes = selectable.nodes();
  46 + String[] urlArr = page.getUrl().get().split("/");
  47 + String articleId = urlArr[urlArr.length - 1];
  48 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  49 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  50 +
  51 + String title = headSelectable.xpath("//div/h1/text()").get();
  52 + if (StringUtils.isBlank(title)) {
  53 + title = headSelectable.xpath("//h1/text()").get();
  54 + }
  55 + String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  56 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  57 + List<Selectable> authorNodes = authorSelectable.nodes();
46 58 StringBuffer authorName = new StringBuffer();
47   - for (Selectable node : nodes) {
  59 + for (Selectable node : authorNodes) {
48 60 authorName.append(node.xpath("//a/text()"));
49 61 }
50   - System.out.println("标题:" + title);
  62 + JSONArray array = new JSONArray();
  63 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  64 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  65 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  66 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  67 + JSONObject jsonObject = new JSONObject();
  68 + jsonObject.put("authorEmailName", authorEmailName);
  69 + jsonObject.put("email", email);
  70 + array.add(jsonObject);
  71 + }
  72 + System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString());
51 73 }
52 74  
53 75 private void doArticleList(Page page) {
... ...