Commit 885a66af79d27afc398e418f8b0d0b23f74157e9

Authored by 谢茂盛
1 parent 72c685be

feat:

1、nature 爬取调整
src/main/java/com/canrd/webmagic/controller/ArticleController.java renamed to src/main/java/com/canrd/webmagic/controller/NatureController.java
1 package com.canrd.webmagic.controller; 1 package com.canrd.webmagic.controller;
2 2
3 import com.canrd.webmagic.common.constant.ServerResult; 3 import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.utils.KeywordUtil;
  5 +import com.canrd.webmagic.common.utils.StringUtils;
4 import com.canrd.webmagic.processor.NatureSearchPageProcessor; 6 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
5 import com.canrd.webmagic.processor.download.Downloader; 7 import com.canrd.webmagic.processor.download.Downloader;
6 import com.canrd.webmagic.processor.pipeline.ArticlePipeline; 8 import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
7 import org.springframework.web.bind.annotation.GetMapping; 10 import org.springframework.web.bind.annotation.GetMapping;
8 import org.springframework.web.bind.annotation.RequestMapping; 11 import org.springframework.web.bind.annotation.RequestMapping;
9 import org.springframework.web.bind.annotation.RequestParam; 12 import org.springframework.web.bind.annotation.RequestParam;
@@ -20,7 +23,7 @@ import javax.annotation.Resource; @@ -20,7 +23,7 @@ import javax.annotation.Resource;
20 */ 23 */
21 @RestController 24 @RestController
22 @RequestMapping("/nature/article") 25 @RequestMapping("/nature/article")
23 -public class ArticleController { 26 +public class NatureController {
24 27
25 @Resource 28 @Resource
26 private NatureSearchPageProcessor natureSearchPageProcessor; 29 private NatureSearchPageProcessor natureSearchPageProcessor;
@@ -35,7 +38,7 @@ public class ArticleController { @@ -35,7 +38,7 @@ public class ArticleController {
35 * @return 38 * @return
36 */ 39 */
37 @GetMapping("/start") 40 @GetMapping("/start")
38 - public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { 41 + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
39 for (int i = 1; i <= indexSize; i++) { 42 for (int i = 1; i <= indexSize; i++) {
40 Spider.create(natureSearchPageProcessor) 43 Spider.create(natureSearchPageProcessor)
41 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) 44 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
@@ -47,5 +50,33 @@ public class ArticleController { @@ -47,5 +50,33 @@ public class ArticleController {
47 50
48 return ServerResult.success(); 51 return ServerResult.success();
49 } 52 }
  53 +
  54 + /**
  55 + * @return
  56 + */
  57 + @GetMapping("/search")
  58 + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize,
  59 + @RequestParam(value = "orderBy") String orderBy,
  60 + @RequestParam(value = "journal") String journal) {
  61 + StringBuffer url = new StringBuffer("https://www.nature.com/search?q=");
  62 + for (int i = 1; i <= indexSize; i++) {
  63 + for (String keyword : KeywordUtil.getKeyWordList()) {
  64 + if (StringUtils.isBlank(orderBy)) {
  65 + url.append(keyword).append("&page=" + i);
  66 + } else {
  67 + url.append(keyword).append("&order=" + orderBy).append("&page=" + i);
  68 + }
  69 + if (StringUtils.isNotBlank(journal)) {
  70 + url.append("&journal=" + journal);
  71 + }
  72 + Spider.create(natureSearchPageProcessor)
  73 + .addUrl(url.toString())
  74 + // 开启5个线程执行,并开始爬取
  75 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  76 + .thread(5).run();
  77 + }
  78 + }
  79 + return ServerResult.success();
  80 + }
50 } 81 }
51 82
src/main/java/com/canrd/webmagic/job/NatureJob.java
@@ -33,13 +33,18 @@ public class NatureJob { @@ -33,13 +33,18 @@ public class NatureJob {
33 for (String keyword : KeywordUtil.getKeyWordList()) { 33 for (String keyword : KeywordUtil.getKeyWordList()) {
34 Spider.create(natureSearchPageProcessor) 34 Spider.create(natureSearchPageProcessor)
35 // 添加这个Spider要爬取的网页地址 35 // 添加这个Spider要爬取的网页地址
36 - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1)  
37 - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2)  
38 - .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3) 36 + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 1)
  37 + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 2)
  38 + .addUrl("https://www.nature.com/search?q=" + keyword + "&order=date_desc&page=" + 3)
  39 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 1)
  40 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 2)
  41 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nnano&order=date_desc&page=" + 3)
  42 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1)
  43 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2)
  44 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3)
39 .addPipeline(articlePipeline) 45 .addPipeline(articlePipeline)
40 -// .setDownloader(downloader.newIpDownloader())  
41 - // 开启5个线程执行,并开始爬取  
42 - .thread(5).run(); 46 + // 开启20个线程执行,并开始爬取
  47 + .thread(20).run();
43 } 48 }
44 } 49 }
45 } 50 }
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -155,9 +155,9 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -155,9 +155,9 @@ public class NatureSearchPageProcessor implements PageProcessor {
155 JSONObject object = new JSONObject(); 155 JSONObject object = new JSONObject();
156 object.put("referenceTitle", referenceTitle); 156 object.put("referenceTitle", referenceTitle);
157 object.put("links", links); 157 object.put("links", links);
158 - if (CollectionUtils.isNotEmpty(links)) {  
159 - page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));  
160 - } 158 +// if (CollectionUtils.isNotEmpty(links)) {
  159 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  160 +// }
161 references.add(object); 161 references.add(object);
162 } 162 }
163 } 163 }
src/main/resources/application-test.yml
@@ -57,7 +57,7 @@ spring: @@ -57,7 +57,7 @@ spring:
57 testWhileIdle: true 57 testWhileIdle: true
58 testOnBorrow: true 58 testOnBorrow: true
59 testOnReturn: true 59 testOnReturn: true
60 - password: 123456 60 + password: Canrd@2023
61 time-between-eviction-runs-millis: 1000 61 time-between-eviction-runs-millis: 1000
62 url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true 62 url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 username: root 63 username: root