NaturePhysicsProcessor.java
7.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Component
public class NaturePhysicsProcessor implements PageProcessor{
@Override
public void process(Page page) {
String url = page.getUrl().get();
if (url.equals("https://www.nature.com/nphys/")) {
everyPage(page);
}else if (url.contains("https://www.nature.com/")){
doArticleContent(page);
}
}
@Override
public Site getSite() {
return PageProcessor.super.getSite();
}
public void everyPage(Page page){
List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
for (int i = 0; i < all.size(); i++) {
page.addTargetRequest("https://www.nature.com/"+all.get(i));
}
}
private void doArticleContent(Page page) {
Html html = page.getHtml();
String articleCode = page.getUrl().get();
Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
String title = headSelectable.xpath("//div/h1/text()").get();
if (StringUtils.isBlank(title)) {
title = headSelectable.xpath("//h1/text()").get();
}
String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
String publishTime;
Date publishTimeDateTime = null;
try {
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
} catch (Exception e) {
try {
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
} catch (Exception e1) {
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
}
}
SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
try {
publishTimeDateTime = formatter.parse(publishTime);
} catch (ParseException e) {
e.printStackTrace();
}
Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
List<Selectable> authorNodes = authorSelectable.nodes();
StringBuffer authorName = new StringBuffer();
for (Selectable node : authorNodes) {
authorName.append(node.xpath("//a/text()"));
}
JSONArray authorAddress = new JSONArray();
List<Selectable> authorAddressList = authorAddressSelectable.nodes();
if (CollectionUtils.isNotEmpty(authorAddressList)) {
for (Selectable selectable : authorAddressList) {
String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
JSONObject object = new JSONObject();
object.put("address", address);
object.put("authorNames", authorNames);
authorAddress.add(object);
}
}
JSONArray references = new JSONArray();
List<Selectable> referenceList = referencesSelectable.nodes();
if (CollectionUtils.isNotEmpty(referenceList)) {
for (Selectable reference : referenceList) {
String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
List<String> links = new ArrayList<>();
if (CollectionUtils.isNotEmpty(referenceLinks)) {
links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
}
JSONObject object = new JSONObject();
object.put("referenceTitle", referenceTitle);
object.put("links", links);
// if (CollectionUtils.isNotEmpty(links)) {
// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
// }
references.add(object);
}
}
JSONArray authorEmail = new JSONArray();
for (Selectable authorEmailSelectable : authorEmailSelectables) {
String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
String email = Objects.isNull(split) ? "" : split[split.length - 1];
String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
JSONObject jsonObject = new JSONObject();
jsonObject.put("authorEmailName", authorEmailName);
jsonObject.put("email", email);
authorEmail.add(jsonObject);
}
log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
page.putField("article", ArticleDO.builder()
.articleType(ArticleTypeEnum.NATURE_PHYSICS.getType())
.articleCode(articleCode)
.authorName(authorName.toString())
.title(title)
.publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
.emailInfo(authorEmail.toJSONString())
.articleDesc(articleDesc)
.authorAddress(authorAddress.toJSONString())
.referenceInfo(references.toJSONString()).build());
}
public static void main(String[] args) {
Spider.create(new MatterPagePcoessor())
.addUrl("https://www.nature.com/nenergy/research-articles")
.addPipeline(new ArticlePipeline())
.thread(1).run();
}
}