NatureJob.java
package com.canrd.webmagic.job;

import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.NatureSearchPageProcessor;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

import javax.annotation.Resource;

/**
 * @author: xms
 * @description: Scheduled job that crawls Nature search-result pages for the configured keywords
 * @date: 2024/4/26 10:06
 * @version: 1.0
 */
@Component
public class NatureJob {

    @Resource
    private NatureSearchPageProcessor natureSearchPageProcessor;

    @Resource
    private ArticlePipeline articlePipeline;

    /**
     * Runs once a day at midnight (00:00).
     */
    // @Scheduled(cron = "*/20 * * * * ?")
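    // (the commented-out cron above fires every 20 seconds, presumably for local testing)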
    @Scheduled(cron = "0 0 0 * * ?")
    public void executeByDay() {
        for (String keyword : KeywordUtil.getKeyWordList()) {
            Spider.create(natureSearchPageProcessor)
                // Add the search-result pages this Spider should crawl
                .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 1)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 2)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 3)
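                // journal=nnano and journal=nphys restrict the search to Nature Nanotechnology and Nature Physics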
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 1)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 2)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 3)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2)
                .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3)
                .addPipeline(articlePipeline)
                // Run the crawl with 20 threads
                .thread(20).run();
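            // Spider.run() blocks until this keyword's crawl finishes, so keywords are processed one after another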
        }
    }
}
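
KeywordUtil.java (hypothetical sketch, not from the repository)

// A minimal sketch of the KeywordUtil referenced above, assuming it only exposes a static
// getKeyWordList() that returns the search keywords. The real
// com.canrd.webmagic.common.utils.KeywordUtil is not shown in this listing and may load its
// keywords from configuration instead; the keyword values below are placeholders.
package com.canrd.webmagic.common.utils;

import java.util.Arrays;
import java.util.List;

public class KeywordUtil {

    // Placeholder keywords; the real list is project-specific
    private static final List<String> KEYWORD_LIST = Arrays.asList("keyword1", "keyword2");

    public static List<String> getKeyWordList() {
        return KEYWORD_LIST;
    }
}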