Commit 885a66af79d27afc398e418f8b0d0b23f74157e9
1 parent
72c685be
feat:
1、nature 爬取调整
Showing
4 changed files
with
48 additions
and
12 deletions
src/main/java/com/canrd/webmagic/controller/ArticleController.java renamed to src/main/java/com/canrd/webmagic/controller/NatureController.java
1 | package com.canrd.webmagic.controller; | 1 | package com.canrd.webmagic.controller; |
2 | 2 | ||
3 | import com.canrd.webmagic.common.constant.ServerResult; | 3 | import com.canrd.webmagic.common.constant.ServerResult; |
4 | +import com.canrd.webmagic.common.utils.KeywordUtil; | ||
5 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
4 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; | 6 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
5 | import com.canrd.webmagic.processor.download.Downloader; | 7 | import com.canrd.webmagic.processor.download.Downloader; |
6 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | 8 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
7 | import org.springframework.web.bind.annotation.GetMapping; | 10 | import org.springframework.web.bind.annotation.GetMapping; |
8 | import org.springframework.web.bind.annotation.RequestMapping; | 11 | import org.springframework.web.bind.annotation.RequestMapping; |
9 | import org.springframework.web.bind.annotation.RequestParam; | 12 | import org.springframework.web.bind.annotation.RequestParam; |
@@ -20,7 +23,7 @@ import javax.annotation.Resource; | @@ -20,7 +23,7 @@ import javax.annotation.Resource; | ||
20 | */ | 23 | */ |
21 | @RestController | 24 | @RestController |
22 | @RequestMapping("/nature/article") | 25 | @RequestMapping("/nature/article") |
23 | -public class ArticleController { | 26 | +public class NatureController { |
24 | 27 | ||
25 | @Resource | 28 | @Resource |
26 | private NatureSearchPageProcessor natureSearchPageProcessor; | 29 | private NatureSearchPageProcessor natureSearchPageProcessor; |
@@ -35,7 +38,7 @@ public class ArticleController { | @@ -35,7 +38,7 @@ public class ArticleController { | ||
35 | * @return | 38 | * @return |
36 | */ | 39 | */ |
37 | @GetMapping("/start") | 40 | @GetMapping("/start") |
38 | - public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | 41 | + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) { |
39 | for (int i = 1; i <= indexSize; i++) { | 42 | for (int i = 1; i <= indexSize; i++) { |
40 | Spider.create(natureSearchPageProcessor) | 43 | Spider.create(natureSearchPageProcessor) |
41 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) | 44 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
@@ -47,5 +50,33 @@ public class ArticleController { | @@ -47,5 +50,33 @@ public class ArticleController { | ||
47 | 50 | ||
48 | return ServerResult.success(); | 51 | return ServerResult.success(); |
49 | } | 52 | } |
53 | + | ||
54 | + /** | |
55 | + * Crawls nature.com search pages 1..indexSize for every keyword from KeywordUtil; a blank orderBy or journal is left out of the query. @return ServerResult.success() | |
56 | + */ | |
57 | + @GetMapping("/search") | |
58 | + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize, | |
59 | + @RequestParam(value = "orderBy") String orderBy, | |
60 | + @RequestParam(value = "journal") String journal) { | |
61 | + for (int i = 1; i <= indexSize; i++) { | |
62 | + for (String keyword : KeywordUtil.getKeyWordList()) { | |
63 | + // Build a fresh URL per request: one shared buffer kept growing, so every request after the first carried the previous keywords/pages too. | |
64 | + StringBuilder url = new StringBuilder("https://www.nature.com/search?q=").append(keyword); | |
65 | + if (StringUtils.isNotBlank(orderBy)) { | |
66 | + url.append("&order=").append(orderBy); | |
67 | + } | |
68 | + url.append("&page=").append(i); | |
69 | + if (StringUtils.isNotBlank(journal)) { | |
70 | + url.append("&journal=").append(journal); | |
71 | + } | |
72 | + Spider.create(natureSearchPageProcessor) | |
73 | + .addUrl(url.toString()) | |
74 | + // Run with 5 threads and start crawling | |
75 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
76 | + .thread(5).run(); | |
77 | + } | |
78 | + } | |
79 | + return ServerResult.success(); | |
80 | + } | |
50 | } | 81 | } |
51 | 82 |
src/main/java/com/canrd/webmagic/job/NatureJob.java
@@ -33,13 +33,18 @@ public class NatureJob { | @@ -33,13 +33,18 @@ public class NatureJob { | ||
33 | for (String keyword : KeywordUtil.getKeyWordList()) { | 33 | for (String keyword : KeywordUtil.getKeyWordList()) { |
34 | Spider.create(natureSearchPageProcessor) | 34 | Spider.create(natureSearchPageProcessor) |
35 | // 添加这个Spider要爬取的网页地址 | 35 | // 添加这个Spider要爬取的网页地址 |
36 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1) | ||
37 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2) | ||
38 | - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3) | 36 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 1) |
37 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 2) | ||
38 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 3) | ||
39 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 1) | ||
40 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 2) | ||
41 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 3) | ||
42 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) | ||
43 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) | ||
44 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) | ||
39 | .addPipeline(articlePipeline) | 45 | .addPipeline(articlePipeline) |
40 | -// .setDownloader(downloader.newIpDownloader()) | ||
41 | - // 开启5个线程执行,并开始爬取 | ||
42 | - .thread(5).run(); | 46 | + // 开启20个线程执行,并开始爬取 |
47 | + .thread(20).run(); | ||
43 | } | 48 | } |
44 | } | 49 | } |
45 | } | 50 | } |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -155,9 +155,9 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -155,9 +155,9 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
155 | JSONObject object = new JSONObject(); | 155 | JSONObject object = new JSONObject(); |
156 | object.put("referenceTitle", referenceTitle); | 156 | object.put("referenceTitle", referenceTitle); |
157 | object.put("links", links); | 157 | object.put("links", links); |
158 | - if (CollectionUtils.isNotEmpty(links)) { | ||
159 | - page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
160 | - } | 158 | +// if (CollectionUtils.isNotEmpty(links)) { |
159 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
160 | +// } | ||
161 | references.add(object); | 161 | references.add(object); |
162 | } | 162 | } |
163 | } | 163 | } |
src/main/resources/application-test.yml
@@ -57,7 +57,7 @@ spring: | @@ -57,7 +57,7 @@ spring: | ||
57 | testWhileIdle: true | 57 | testWhileIdle: true |
58 | testOnBorrow: true | 58 | testOnBorrow: true |
59 | testOnReturn: true | 59 | testOnReturn: true |
60 | - password: 123456 | 60 | + password: Canrd@2023 |
61 | time-between-eviction-runs-millis: 1000 | 61 | time-between-eviction-runs-millis: 1000 |
62 | url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | 62 | url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | username: root | 63 | username: root |