// Science4SpjSearchPageProcessor.java (6.67 KB)
package com.canrd.webmagic.processor;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import javax.annotation.Resource;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Date;
import java.util.List;
import java.util.Locale;
/**
* https://spj.science.org/action/doSearch?AllField=Nickel+foam&pageSize=20&startPage=0
*
* @author: xms
* @description: TODO
* @date: 2024/4/1 14:19
* @version: 1.0
*/
@Slf4j
@Component
public class Science4SpjSearchPageProcessor implements PageProcessor {
@Resource
private Science4SpjArticlePageProcessor science4SpjArticlePageProcessor;
@Resource
private SeleniumDownloader seleniumDownloader;
@Resource
private ArticlePipeline articlePipeline;
/**
* 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
*/
private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
/**
* 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
*
* @param page
*/
@Override
public void process(Page page) {
if (page.getUrl().get().contains("doSearch")) {
doSearch(page);
} else {
doArticleList(page);
}
}
/**
* @param page
*/
private void doSearch(Page page) {
String url = page.getUrl().get();
/**
* 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
* 1、通过$或css()方法获取到该page html下某元素dom
*/
Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
List<Selectable> nodes = selectable.nodes();
/**
* 获取到指定的dom后,从这些dom中提取元素内容。
*/
for (int i = 0; i <= nodes.size() - 1; i++) {
String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").xpath("//time/text()").get();
String link = nodes.get(i).links().get();
SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
try {
Date publishTimeDateTime = formatter.parse(time);
if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
Spider.create(science4SpjArticlePageProcessor)
.addUrl(link)
.addPipeline(articlePipeline)
.setDownloader(seleniumDownloader)
.setUUID(UuidUtil.getTimeBasedUuid().toString())
// 开启5个线程执行,并开始爬取
.thread(1).run();
log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
}
} catch (ParseException e) {
e.printStackTrace();
}
}
}
/**
* @param page
*/
private void doArticleList(Page page) {
String url = page.getUrl().get();
/**
* 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
* 1、通过$或css()方法获取到该page html下某元素dom
*/
Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
List<Selectable> nodes = selectable.nodes();
/**
* 获取到指定的dom后,从这些dom中提取元素内容。
*/
for (int i = 0; i <= nodes.size() - 1; i++) {
String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").xpath("//time/text()").get();
String link = nodes.get(i).links().get();
if (KeywordUtil.containKeywordsInTitle(title)) {
SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
try {
Date publishTimeDateTime = formatter.parse(time);
if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
Spider.create(science4SpjArticlePageProcessor)
.addUrl(link)
.addPipeline(articlePipeline)
.setDownloader(seleniumDownloader)
.setUUID(UuidUtil.getTimeBasedUuid().toString())
// 开启5个线程执行,并开始爬取
.thread(1).run();
log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
}
} catch (ParseException e) {
e.printStackTrace();
}
}
}
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
// 创建一个Spider,并把我们的处理器放进去
Spider.create(new Science4SpjSearchPageProcessor())
// 添加这个Spider要爬取的网页地址
.addUrl("https://www.science.org/journal/science/insights?startPage=0")
.addPipeline(new ArticlePipeline())
// 开启5个线程执行,并开始爬取
.thread(5).run();
}
}