Commit 7b1216da57e8b31fa31810df465dd777a4594fbb

Authored by 谢茂盛
1 parent f85930c5

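feat:

1、science-spj 爬取 (crawl science-spj article pages: update the XPath selectors for title, description, publish date, and authors)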
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
@@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { @@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
55 //解析页面 55 //解析页面
56 Html html = page.getHtml(); 56 Html html = page.getHtml();
57 String articleCode = page.getUrl().get(); 57 String articleCode = page.getUrl().get();
58 - Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header"); 58 + Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']");
59 59
60 - String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get(); 60 + Selectable headSelectable = articleSelectable.xpath("//header/div");
  61 +
  62 + String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get();
61 if (StringUtils.isBlank(title)) { 63 if (StringUtils.isBlank(title)) {
62 - title = html.xpath("//div[@class='article-container']/article/header").xpath("//h1[@property='name']/text()").get(); 64 + title = headSelectable.xpath("//h1[@property='name']/text()").get();
63 } 65 }
64 66
65 - String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get(); 67 + String articleDesc = articleSelectable.xpath("//section[@id='bodymatter']/div/div/text()").get();
66 if (StringUtils.isBlank(articleDesc)) { 68 if (StringUtils.isBlank(articleDesc)) {
67 - articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get(); 69 + articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get();
68 } 70 }
69 71
70 - String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get(); 72 + String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get();
71 Date publishTimeDateTime = null; 73 Date publishTimeDateTime = null;
72 SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); 74 SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
73 75
@@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { @@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
76 } catch (ParseException e) { 78 } catch (ParseException e) {
77 e.printStackTrace(); 79 e.printStackTrace();
78 } 80 }
79 - List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes(); 81 + List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes();
80 StringBuffer authorName = new StringBuffer(); 82 StringBuffer authorName = new StringBuffer();
81 for (Selectable node : authorNodes) { 83 for (Selectable node : authorNodes) {
82 - String name = node.xpath("//a/span/text()").get();  
83 - if (StringUtils.isBlank(name)) { 84 + String giveName = node.xpath("//span[@property='givenName']/text()").get();
  85 + String familyName = node.xpath("//span[@property='familyName']/text()").get();
  86 + if (StringUtils.isBlank(giveName) && StringUtils.isBlank(familyName)) {
84 continue; 87 continue;
85 } 88 }
86 - authorName.append(name).append(" "); 89 + authorName.append(giveName).append(" ").append(familyName).append(",");
87 } 90 }
88 91
89 92
90 JSONArray authorEmail = new JSONArray(); 93 JSONArray authorEmail = new JSONArray();
91 - List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes(); 94 + List<Selectable> authorEmailSelectables = headSelectable.xpath("//span[@property='author']").nodes();
92 for (Selectable authorEmailSelectable : authorEmailSelectables) { 95 for (Selectable authorEmailSelectable : authorEmailSelectables) {
93 String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); 96 String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get();
94 String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); 97 String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get();
@@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { @@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
98 } 101 }
99 102
100 JSONObject jsonObject = new JSONObject(); 103 JSONObject jsonObject = new JSONObject();
101 - jsonObject.put("authorEmailName", givenName + "" + familyName); 104 + jsonObject.put("authorEmailName", givenName + " " + familyName);
102 jsonObject.put("email", email); 105 jsonObject.put("email", email);
103 authorEmail.add(jsonObject); 106 authorEmail.add(jsonObject);
104 } 107 }