Commit 885a66af79d27afc398e418f8b0d0b23f74157e9
1 parent
72c685be
feat:
1、nature 爬取调整
Showing
4 changed files
with
48 additions
and
12 deletions
src/main/java/com/canrd/webmagic/controller/ArticleController.java renamed to src/main/java/com/canrd/webmagic/controller/NatureController.java
1 | 1 | package com.canrd.webmagic.controller; |
2 | 2 | |
3 | 3 | import com.canrd.webmagic.common.constant.ServerResult; |
4 | +import com.canrd.webmagic.common.utils.KeywordUtil; | |
5 | +import com.canrd.webmagic.common.utils.StringUtils; | |
4 | 6 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
5 | 7 | import com.canrd.webmagic.processor.download.Downloader; |
6 | 8 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
7 | 10 | import org.springframework.web.bind.annotation.GetMapping; |
8 | 11 | import org.springframework.web.bind.annotation.RequestMapping; |
9 | 12 | import org.springframework.web.bind.annotation.RequestParam; |
... | ... | @@ -20,7 +23,7 @@ import javax.annotation.Resource; |
20 | 23 | */ |
21 | 24 | @RestController |
22 | 25 | @RequestMapping("/nature/article") |
23 | -public class ArticleController { | |
26 | +public class NatureController { | |
24 | 27 | |
25 | 28 | @Resource |
26 | 29 | private NatureSearchPageProcessor natureSearchPageProcessor; |
... | ... | @@ -35,7 +38,7 @@ public class ArticleController { |
35 | 38 | * @return |
36 | 39 | */ |
37 | 40 | @GetMapping("/start") |
38 | - public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | |
41 | + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) { | |
39 | 42 | for (int i = 1; i <= indexSize; i++) { |
40 | 43 | Spider.create(natureSearchPageProcessor) |
41 | 44 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
... | ... | @@ -47,5 +50,33 @@ public class ArticleController { |
47 | 50 | |
48 | 51 | return ServerResult.success(); |
49 | 52 | } |
53 | + | |
54 | + /** | |
55 | + * Runs one spider per (page 1..indexSize, keyword from KeywordUtil) against the Nature search URL, adding {@code order}/{@code journal} query parameters when non-blank. | |
56 | + */ | |
57 | + @GetMapping("/search") | |
58 | + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize, | |
59 | + @RequestParam(value = "orderBy") String orderBy, | |
60 | + @RequestParam(value = "journal") String journal) { | |
61 | + for (int i = 1; i <= indexSize; i++) { | |
62 | + for (String keyword : KeywordUtil.getKeyWordList()) { | |
63 | + StringBuilder url = new StringBuilder("https://www.nature.com/search?q="); // rebuilt for every request so earlier queries are never carried over | |
64 | + if (StringUtils.isBlank(orderBy)) { | |
65 | + url.append(keyword).append("&page=").append(i); | |
66 | + } else { | |
67 | + url.append(keyword).append("&order=").append(orderBy).append("&page=").append(i); | |
68 | + } | |
69 | + if (StringUtils.isNotBlank(journal)) { | |
70 | + url.append("&journal=").append(journal); | |
71 | + } | |
72 | + Spider.create(natureSearchPageProcessor) | |
73 | + .addUrl(url.toString()) | |
74 | + // give this spider its own time-based UUID, then crawl with 5 threads | |
75 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
76 | + .thread(5).run(); | |
77 | + } | |
78 | + } | |
79 | + return ServerResult.success(); | |
80 | + } | |
50 | 81 | } |
51 | 82 | ... | ... |
src/main/java/com/canrd/webmagic/job/NatureJob.java
... | ... | @@ -33,13 +33,18 @@ public class NatureJob { |
33 | 33 | for (String keyword : KeywordUtil.getKeyWordList()) { |
34 | 34 | Spider.create(natureSearchPageProcessor) |
35 | 35 | // 添加这个Spider要爬取的网页地址 |
36 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1) | |
37 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2) | |
38 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3) | |
36 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 1) | |
37 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 2) | |
38 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 3) | |
39 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 1) | |
40 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 2) | |
41 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 3) | |
42 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) | |
43 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) | |
44 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) | |
39 | 45 | .addPipeline(articlePipeline) |
40 | -// .setDownloader(downloader.newIpDownloader()) | |
41 | - // 开启5个线程执行,并开始爬取 | |
42 | - .thread(5).run(); | |
46 | + // 开启20个线程执行,并开始爬取 | |
47 | + .thread(20).run(); | |
43 | 48 | } |
44 | 49 | } |
45 | 50 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -155,9 +155,9 @@ public class NatureSearchPageProcessor implements PageProcessor { |
155 | 155 | JSONObject object = new JSONObject(); |
156 | 156 | object.put("referenceTitle", referenceTitle); |
157 | 157 | object.put("links", links); |
158 | - if (CollectionUtils.isNotEmpty(links)) { | |
159 | - page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
160 | - } | |
158 | +// if (CollectionUtils.isNotEmpty(links)) { | |
159 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
160 | +// } | |
161 | 161 | references.add(object); |
162 | 162 | } |
163 | 163 | } | ... | ... |
src/main/resources/application-test.yml
... | ... | @@ -57,7 +57,7 @@ spring: |
57 | 57 | testWhileIdle: true |
58 | 58 | testOnBorrow: true |
59 | 59 | testOnReturn: true |
60 | - password: 123456 | |
60 | + password: Canrd@2023 | |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | 62 | url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | 63 | username: root | ... | ... |