谢茂盛
authored
|
1
2
|
package com.canrd.webmagic.processor;
|
谢茂盛
authored
|
3
4
|
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
|
谢茂盛
authored
|
5
|
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
|
谢茂盛
authored
|
6
|
import com.canrd.webmagic.common.utils.DateUtil;
|
谢茂盛
authored
|
7
|
import com.canrd.webmagic.common.utils.KeywordUtil;
|
谢茂盛
authored
|
8
|
import com.canrd.webmagic.common.utils.StringUtils;
|
谢茂盛
authored
|
9
10
|
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
|
谢茂盛
authored
|
11
|
import com.canrd.webmagic.processor.config.Agent;
|
谢茂盛
authored
|
12
|
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
|
谢茂盛
authored
|
13
|
import lombok.extern.slf4j.Slf4j;
|
|
14
|
import org.springframework.stereotype.Component;
|
谢茂盛
authored
|
15
16
17
18
|
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
|
谢茂盛
authored
|
19
|
import us.codecraft.webmagic.selector.Html;
|
谢茂盛
authored
|
20
21
22
|
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;
|
谢茂盛
authored
|
23
24
25
|
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
|
谢茂盛
authored
|
26
|
import java.util.stream.Collectors;
|
谢茂盛
authored
|
27
28
29
30
31
32
33
|
/**
* @author: xms
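* @description: WebMagic page processor for nature.com: walks article listing pages, queues matching article links and extracts article details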
* @date: 2024/4/1 14:19
* @version: 1.0
*/
|
谢茂盛
authored
|
34
|
@Slf4j
|
|
35
|
@Component
|
谢茂盛
authored
|
36
|
public class NatureSearchPageProcessor implements PageProcessor {
|
谢茂盛
authored
|
37
|
// User-agent string obtained from Agent.getRandom() when this processor is instantiated.
// NOTE(review): this field is not read anywhere in the visible code of this class — confirm
// whether it was meant to be applied to `site`.
private String agent = Agent.getRandom();
|
谢茂盛
authored
|
38
39
|
// Crawl-site configuration (such as encoding, crawl interval and retry count);
// this instance sets the retry count to 3 and the sleep time to 100.
|
谢茂盛
authored
|
40
|
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
|
谢茂盛
authored
|
41
42
43
44
45
46
47
48
|
/**
* 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
*
* @param page
*/
@Override
public void process(Page page) {
|
|
49
|
if (page.getUrl().get().contains("search")) {
|
谢茂盛
authored
|
50
|
doArticleList(page);
|
谢茂盛
authored
|
51
52
|
} else if (page.getUrl().get().contains("research-articles")) {
doArticleList4ReSearch(page);
|
谢茂盛
authored
|
53
54
55
|
} else if (page.getUrl().get().contains("/articles?searchType=journalSearch")) {
doArticleList4ReSearch(page);
}else {
|
谢茂盛
authored
|
56
57
58
59
60
|
doArticleContent(page);
}
}
|
谢茂盛
authored
|
61
62
63
|
/**
* @param page
*/
|
谢茂盛
authored
|
64
|
private void doArticleList4ReSearch(Page page) {
|
谢茂盛
authored
|
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
String url = page.getUrl().get();
/**
* 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
* 1、通过$或css()方法获取到该page html下某元素dom
*/
Selectable selectable = page.getHtml().$(".app-article-list-row").select(
new XpathSelector("li[@class='app-article-list-row__item']")
);
List<Selectable> nodes = selectable.nodes();
/**
* 获取到指定的dom后,从这些dom中提取元素内容。
*/
for (int i = 1; i <= nodes.size() - 1; i++) {
Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
String link = node.$("a", "href").get();
String title = node.$("a", "text").get();
|
谢茂盛
authored
|
82
83
84
85
|
if (KeywordUtil.containKeywordsInTitle(title)) {
page.addTargetRequest(link);
log.info("research文章列表链接:{},标题:{},文章链接:{}", url, title, link);
}
|
谢茂盛
authored
|
86
87
88
|
}
}
|
谢茂盛
authored
|
89
90
91
|
/**
* @param page
*/
|
谢茂盛
authored
|
92
|
private void doArticleContent(Page page) {
|
谢茂盛
authored
|
93
94
95
|
if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) {
return;
}
|
谢茂盛
authored
|
96
97
|
//解析页面
Html html = page.getHtml();
|
谢茂盛
authored
|
98
|
String articleCode = page.getUrl().get();
|
谢茂盛
authored
|
99
100
|
Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
|
谢茂盛
authored
|
101
102
|
Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
|
谢茂盛
authored
|
103
104
105
106
107
|
String title = headSelectable.xpath("//div/h1/text()").get();
if (StringUtils.isBlank(title)) {
title = headSelectable.xpath("//h1/text()").get();
}
|
谢茂盛
authored
|
108
|
String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
|
|
109
|
String publishTime;
|
谢茂盛
authored
|
110
|
Date publishTimeDateTime = null;
|
|
111
112
|
try {
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
|
谢茂盛
authored
|
113
|
} catch (Exception e) {
|
|
114
115
|
try {
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
|
谢茂盛
authored
|
116
|
} catch (Exception e1) {
|
|
117
118
119
|
publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
}
}
|
谢茂盛
authored
|
120
121
122
123
124
125
126
|
SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
try {
publishTimeDateTime = formatter.parse(publishTime);
} catch (ParseException e) {
e.printStackTrace();
}
|
谢茂盛
authored
|
127
128
|
Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
List<Selectable> authorNodes = authorSelectable.nodes();
|
谢茂盛
authored
|
129
|
StringBuffer authorName = new StringBuffer();
|
谢茂盛
authored
|
130
|
for (Selectable node : authorNodes) {
|
谢茂盛
authored
|
131
132
|
authorName.append(node.xpath("//a/text()"));
}
|
谢茂盛
authored
|
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
JSONArray authorAddress = new JSONArray();
List<Selectable> authorAddressList = authorAddressSelectable.nodes();
if (CollectionUtils.isNotEmpty(authorAddressList)) {
for (Selectable selectable : authorAddressList) {
String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
JSONObject object = new JSONObject();
object.put("address", address);
object.put("authorNames", authorNames);
authorAddress.add(object);
}
}
JSONArray references = new JSONArray();
List<Selectable> referenceList = referencesSelectable.nodes();
if (CollectionUtils.isNotEmpty(referenceList)) {
for (Selectable reference : referenceList) {
String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
List<String> links = new ArrayList<>();
if (CollectionUtils.isNotEmpty(referenceLinks)) {
links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
}
JSONObject object = new JSONObject();
object.put("referenceTitle", referenceTitle);
object.put("links", links);
|
谢茂盛
authored
|
160
161
162
|
// if (CollectionUtils.isNotEmpty(links)) {
// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
// }
|
谢茂盛
authored
|
163
164
165
166
167
|
references.add(object);
}
}
JSONArray authorEmail = new JSONArray();
|
谢茂盛
authored
|
168
169
170
171
172
173
174
|
for (Selectable authorEmailSelectable : authorEmailSelectables) {
String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
String email = Objects.isNull(split) ? "" : split[split.length - 1];
String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
JSONObject jsonObject = new JSONObject();
jsonObject.put("authorEmailName", authorEmailName);
jsonObject.put("email", email);
|
谢茂盛
authored
|
175
|
authorEmail.add(jsonObject);
|
谢茂盛
authored
|
176
|
}
|
谢茂盛
authored
|
177
|
log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
|
|
178
|
|
谢茂盛
authored
|
179
180
|
page.putField("article", ArticleDO.builder()
.articleType(ArticleTypeEnum.NATURE.getType())
|
|
181
182
183
|
.articleCode(articleCode)
.authorName(authorName.toString())
.title(title)
|
谢茂盛
authored
|
184
|
.publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
|
谢茂盛
authored
|
185
186
187
188
|
.emailInfo(authorEmail.toJSONString())
.articleDesc(articleDesc)
.authorAddress(authorAddress.toJSONString())
.referenceInfo(references.toJSONString()).build());
|
谢茂盛
authored
|
189
|
}
|
谢茂盛
authored
|
190
|
|
谢茂盛
authored
|
191
192
|
private void doArticleList(Page page) {
String url = page.getUrl().get();
|
谢茂盛
authored
|
193
194
195
196
197
198
199
200
201
202
203
204
|
/**
* 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
* 1、通过$或css()方法获取到该page html下某元素dom
*/
Selectable selectable = page.getHtml().$(".app-article-list-row").select(
new XpathSelector("li[@class='app-article-list-row__item']")
);
List<Selectable> nodes = selectable.nodes();
/**
* 获取到指定的dom后,从这些dom中提取元素内容。
*/
|
谢茂盛
authored
|
205
|
for (int i = 0; i <= nodes.size() - 1; i++) {
|
谢茂盛
authored
|
206
|
Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
|
谢茂盛
authored
|
207
|
String link = node.links().get();
|
谢茂盛
authored
|
208
|
String title = node.$("a", "text").get();
|
谢茂盛
authored
|
209
|
if (KeywordUtil.containKeywordsInTitle(title)) {
|
谢茂盛
authored
|
210
211
212
213
214
215
216
217
218
219
220
221
|
String publishTime = nodes.get(i).xpath("//div[@class='c-card__section c-meta']/time/text()").get();
SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
try {
Date publishTimeDateTime = formatter.parse(publishTime);
if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
page.addTargetRequest(link);
log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
}
} catch (ParseException e) {
e.printStackTrace();
}
|
谢茂盛
authored
|
222
|
}
|
谢茂盛
authored
|
223
|
|
谢茂盛
authored
|
224
225
226
227
228
229
230
231
232
233
234
235
|
}
}
/**
 * Supplies the site configuration held by this processor.
 *
 * @return the {@link Site} instance stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}
public static void main(String[] args) {
// 创建一个Spider,并把我们的处理器放进去
Spider.create(new NatureSearchPageProcessor())
// 添加这个Spider要爬取的网页地址
|
谢茂盛
authored
|
236
|
.addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1")
|
谢茂盛
authored
|
237
|
.addPipeline(new ArticlePipeline())
|
谢茂盛
authored
|
238
239
240
241
|
// 开启5个线程执行,并开始爬取
.thread(5).run();
}
}
|