Commit 7b1216da57e8b31fa31810df465dd777a4594fbb

Authored by 谢茂盛
1 parent f85930c5

feat: science-spj 爬取
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
... ... @@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
55 55 //解析页面
56 56 Html html = page.getHtml();
57 57 String articleCode = page.getUrl().get();
58   - Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header");
  58 + Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']");
59 59  
60   - String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get();
  60 + Selectable headSelectable = articleSelectable.xpath("//header/div");
  61 +
  62 + String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get();
61 63 if (StringUtils.isBlank(title)) {
62   - title = html.xpath("//div[@class='article-container']/article/header").xpath("//h1[@property='name']/text()").get();
  64 + title = headSelectable.xpath("//h1[@property='name']/text()").get();
63 65 }
64 66  
65   - String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get();
  67 + String articleDesc = articleSelectable.xpath("//section[@id='bodymatter']/div/div/text()").get();
66 68 if (StringUtils.isBlank(articleDesc)) {
67   - articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get();
  69 + articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get();
68 70 }
69 71  
70   - String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get();
  72 + String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get();
71 73 Date publishTimeDateTime = null;
72 74 SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
73 75  
... ... @@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
76 78 } catch (ParseException e) {
77 79 e.printStackTrace();
78 80 }
79   - List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes();
  81 + List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes();
80 82 StringBuffer authorName = new StringBuffer();
81 83 for (Selectable node : authorNodes) {
82   - String name = node.xpath("//a/span/text()").get();
83   - if (StringUtils.isBlank(name)) {
  84 + String giveName = node.xpath("//span[@property='givenName']/text()").get();
  85 + String familyName = node.xpath("//span[@property='familyName']/text()").get();
  86 + if (StringUtils.isBlank(giveName) && StringUtils.isBlank(familyName)) {
84 87 continue;
85 88 }
86   - authorName.append(name).append(" ");
  89 + authorName.append(giveName).append(" ").append(familyName).append(",");
87 90 }
88 91  
89 92  
90 93 JSONArray authorEmail = new JSONArray();
91   - List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes();
  94 + List<Selectable> authorEmailSelectables = headSelectable.xpath("//span[@property='author']").nodes();
92 95 for (Selectable authorEmailSelectable : authorEmailSelectables) {
93 96 String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get();
94 97 String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get();
... ... @@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
98 101 }
99 102  
100 103 JSONObject jsonObject = new JSONObject();
101   - jsonObject.put("authorEmailName", givenName + "" + familyName);
  104 + jsonObject.put("authorEmailName", givenName + " " + familyName);
102 105 jsonObject.put("email", email);
103 106 authorEmail.add(jsonObject);
104 107 }
... ...