Commit 7b1216da57e8b31fa31810df465dd777a4594fbb
1 parent f85930c5
feat:
1、science-spj 爬取
Showing 1 changed file with 15 additions and 12 deletions
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
... | ... | @@ -55,19 +55,21 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { |
55 | 55 | //解析页面 |
56 | 56 | Html html = page.getHtml(); |
57 | 57 | String articleCode = page.getUrl().get(); |
58 | - Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header"); | |
58 | + Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']"); | |
59 | 59 | |
60 | - String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get(); | |
60 | + Selectable headSelectable = articleSelectable.xpath("//header/div"); | |
61 | + | |
62 | + String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get(); | |
61 | 63 | if (StringUtils.isBlank(title)) { |
62 | - title = html.xpath("//div[@class='article-container']/article/header").xpath("//h1[@property='name']/text()").get(); | |
64 | + title = headSelectable.xpath("//h1[@property='name']/text()").get(); | |
63 | 65 | } |
64 | 66 | |
65 | - String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get(); | |
67 | + String articleDesc = articleSelectable.xpath("//section[@id='bodymatter']/div/div/text()").get(); | |
66 | 68 | if (StringUtils.isBlank(articleDesc)) { |
67 | - articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get(); | |
69 | + articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get(); | |
68 | 70 | } |
69 | 71 | |
70 | - String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get(); | |
72 | + String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get(); | |
71 | 73 | Date publishTimeDateTime = null; |
72 | 74 | SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); |
73 | 75 | |
... | ... | @@ -76,19 +78,20 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { |
76 | 78 | } catch (ParseException e) { |
77 | 79 | e.printStackTrace(); |
78 | 80 | } |
79 | - List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes(); | |
81 | + List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes(); | |
80 | 82 | StringBuffer authorName = new StringBuffer(); |
81 | 83 | for (Selectable node : authorNodes) { |
82 | - String name = node.xpath("//a/span/text()").get(); | |
83 | - if (StringUtils.isBlank(name)) { | |
84 | + String giveName = node.xpath("//span[@property='givenName']/text()").get(); | |
85 | + String familyName = node.xpath("//span[@property='familyName']/text()").get(); | |
86 | + if (StringUtils.isBlank(giveName) && StringUtils.isBlank(familyName)) { | |
84 | 87 | continue; |
85 | 88 | } |
86 | - authorName.append(name).append(" "); | |
89 | + authorName.append(giveName).append(" ").append(familyName).append(","); | |
87 | 90 | } |
88 | 91 | |
89 | 92 | |
90 | 93 | JSONArray authorEmail = new JSONArray(); |
91 | - List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes(); | |
94 | + List<Selectable> authorEmailSelectables = headSelectable.xpath("//span[@property='author']").nodes(); | |
92 | 95 | for (Selectable authorEmailSelectable : authorEmailSelectables) { |
93 | 96 | String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); |
94 | 97 | String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); |
... | ... | @@ -98,7 +101,7 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { |
98 | 101 | } |
99 | 102 | |
100 | 103 | JSONObject jsonObject = new JSONObject(); |
101 | - jsonObject.put("authorEmailName", givenName + "" + familyName); | |
104 | + jsonObject.put("authorEmailName", givenName + " " + familyName); | |
102 | 105 | jsonObject.put("email", email); |
103 | 106 | authorEmail.add(jsonObject); |
104 | 107 | } | ... | ... |