Science4JournalController.java
2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
package com.canrd.webmagic.controller;
import com.canrd.webmagic.common.constant.ServerResult;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import javax.annotation.Resource;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
/**
 * REST controller that launches WebMagic crawls against science.org.
 *
 * <p>Endpoints are mounted under {@code /science/journal}. Every crawl uses the injected
 * {@link Science4JournalSearchPageProcessor} as page processor and the injected
 * {@link SeleniumDownloader} as downloader.
 *
 * @author makejava
 * @since 2024-04-07 18:39:41
 */
@RestController
@RequestMapping("/science/journal")
public class Science4JournalController {

    /** Listing URL of the "insights" section; the page index is appended. */
    private static final String INSIGHTS_URL_PREFIX =
            "https://www.science.org/journal/science/insights?startPage=";

    /** Search URL; the encoded keyword and the paging parameters are appended. */
    private static final String SEARCH_URL_PREFIX =
            "https://www.science.org/action/doSearch?AllField=";

    /** Number of worker threads each spider is configured with. */
    private static final int CRAWL_THREADS = 5;

    @Resource
    private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor;

    @Resource
    private SeleniumDownloader seleniumDownloader;

    /**
     * Crawls the "insights" listing pages, one spider per page.
     *
     * <p>Pages {@code 0} through {@code indexSize} are crawled, both bounds inclusive, so
     * {@code indexSize + 1} spiders are run in total. A negative value crawls nothing.
     * NOTE(review): confirm whether the inclusive upper bound is intended.
     *
     * @param indexSize index of the last listing page to crawl (request parameter
     *                  {@code indexSize})
     * @return the value of {@code ServerResult.success()}, returned after the loop completes
     */
    @GetMapping("/start")
    public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
        for (int page = 0; page <= indexSize; page++) {
            crawl(INSIGHTS_URL_PREFIX + page);
        }
        return ServerResult.success();
    }

    /**
     * Crawls the site search results for every keyword supplied by
     * {@code KeywordUtil.getKeyWordList()}, one spider per (page, keyword) pair.
     *
     * <p>Pages {@code 0} through {@code indexSize} are requested, both bounds inclusive, with a
     * page size of 20. Keywords are URL-encoded before being placed in the query string;
     * {@code null} keywords are skipped.
     *
     * @param indexSize index of the last result page to crawl for each keyword (request
     *                  parameter {@code indexSize})
     * @return the value of {@code ServerResult.success()}, returned after the loops complete
     */
    @GetMapping("/search")
    public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize) {
        for (int page = 0; page <= indexSize; page++) {
            for (String keyword : KeywordUtil.getKeyWordList()) {
                if (keyword == null) {
                    // Concatenating null would search for the literal text "null".
                    continue;
                }
                crawl(SEARCH_URL_PREFIX + encodeQueryValue(keyword)
                        + "&pageSize=20&startPage=" + page);
            }
        }
        return ServerResult.success();
    }

    /**
     * Builds a spider for a single start URL and runs it.
     *
     * <p>{@code run()} is invoked directly on the calling thread rather than being handed to
     * another thread by this class.
     *
     * @param url the start URL handed to the spider
     */
    private void crawl(String url) {
        Spider.create(science4JournalSearchPageProcessor)
                .addUrl(url)
                // A fresh time-based UUID is assigned to every spider.
                .setUUID(UuidUtil.getTimeBasedUuid().toString())
                .setDownloader(seleniumDownloader)
                // Configure the worker threads, then start crawling.
                .thread(CRAWL_THREADS)
                .run();
    }

    /**
     * Encodes a value for use inside a URL query string, so that spaces, {@code &}, {@code #}
     * and non-ASCII characters cannot corrupt the request URL.
     *
     * @param value the raw query value; must not be {@code null}
     * @return the value encoded as {@code application/x-www-form-urlencoded} using UTF-8
     */
    private static String encodeQueryValue(String value) {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandatory on every Java platform, so this is unreachable.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }
}