Commit 7b1216da57e8b31fa31810df465dd777a4594fbb
1 parent: f85930c5
feat:
1、science-spj 爬取
Showing 1 changed file with 15 additions and 12 deletions
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
@@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | @@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | ||
55 | //解析页面 | 55 | //解析页面 |
56 | Html html = page.getHtml(); | 56 | Html html = page.getHtml(); |
57 | String articleCode = page.getUrl().get(); | 57 | String articleCode = page.getUrl().get(); |
58 | - Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header"); | 58 | + Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']"); |
59 | 59 | ||
60 | - String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get(); | 60 | + Selectable headSelectable = articleSelectable.xpath("//header/div"); |
61 | + | ||
62 | + String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get(); | ||
61 | if (StringUtils.isBlank(title)) { | 63 | if (StringUtils.isBlank(title)) { |
62 | - title = html.xpath("//div[@class='article-container']/article/header").xpath("//h1[@property='name']/text()").get(); | 64 | + title = headSelectable.xpath("//h1[@property='name']/text()").get(); |
63 | } | 65 | } |
64 | 66 | ||
65 | - String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get(); | 67 | + String articleDesc = articleSelectable.xpath("//section[@id='bodymatter']/div/div/text()").get(); |
66 | if (StringUtils.isBlank(articleDesc)) { | 68 | if (StringUtils.isBlank(articleDesc)) { |
67 | - articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get(); | 69 | + articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get(); |
68 | } | 70 | } |
69 | 71 | ||
70 | - String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get(); | 72 | + String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get(); |
71 | Date publishTimeDateTime = null; | 73 | Date publishTimeDateTime = null; |
72 | SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | 74 | SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); |
73 | 75 | ||
@@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | @@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | ||
76 | } catch (ParseException e) { | 78 | } catch (ParseException e) { |
77 | e.printStackTrace(); | 79 | e.printStackTrace(); |
78 | } | 80 | } |
79 | - List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes(); | 81 | + List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes(); |
80 | StringBuffer authorName = new StringBuffer(); | 82 | StringBuffer authorName = new StringBuffer(); |
81 | for (Selectable node : authorNodes) { | 83 | for (Selectable node : authorNodes) { |
82 | - String name = node.xpath("//a/span/text()").get(); | ||
83 | - if (StringUtils.isBlank(name)) { | 84 | + String giveName = node.xpath("//span[@property='givenName']/text()").get(); |
85 | + String familyName = node.xpath("//span[@property='familyName']/text()").get(); | ||
86 | + if (StringUtils.isBlank(giveName) && StringUtils.isBlank(familyName)) { | ||
84 | continue; | 87 | continue; |
85 | } | 88 | } |
86 | - authorName.append(name).append(" "); | 89 | + authorName.append(giveName).append(" ").append(familyName).append(","); |
87 | } | 90 | } |
88 | 91 | ||
89 | 92 | ||
90 | JSONArray authorEmail = new JSONArray(); | 93 | JSONArray authorEmail = new JSONArray(); |
91 | - List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes(); | 94 | + List<Selectable> authorEmailSelectables = headSelectable.xpath("//span[@property='author']").nodes(); |
92 | for (Selectable authorEmailSelectable : authorEmailSelectables) { | 95 | for (Selectable authorEmailSelectable : authorEmailSelectables) { |
93 | String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); | 96 | String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); |
94 | String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); | 97 | String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); |
@@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | @@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | ||
98 | } | 101 | } |
99 | 102 | ||
100 | JSONObject jsonObject = new JSONObject(); | 103 | JSONObject jsonObject = new JSONObject(); |
101 | - jsonObject.put("authorEmailName", givenName + "" + familyName); | 104 | + jsonObject.put("authorEmailName", givenName + " " + familyName); |
102 | jsonObject.put("email", email); | 105 | jsonObject.put("email", email); |
103 | authorEmail.add(jsonObject); | 106 | authorEmail.add(jsonObject); |
104 | } | 107 | } |