Commit 98eb2cc8cf6a2f94e8021decb0e3c9b62ef5e7d4

Authored by 谢茂盛
1 parent 885a66af

feat:

1、nature 爬取调整
src/main/java/com/canrd/webmagic/controller/NatureController.java
@@ -42,16 +42,22 @@ public class NatureController { @@ -42,16 +42,22 @@ public class NatureController {
42 for (int i = 1; i <= indexSize; i++) { 42 for (int i = 1; i <= indexSize; i++) {
43 Spider.create(natureSearchPageProcessor) 43 Spider.create(natureSearchPageProcessor)
44 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) 44 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
  45 + .addUrl("https://www.nature.com/nature/articles?searchType=journalSearch&sort=PubDate&type=article&page=" + i)
  46 + .addUrl("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=" + i)
  47 + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=" + i)
  48 + .addUrl("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=" + i)
45 .addPipeline(articlePipeline) 49 .addPipeline(articlePipeline)
46 // .setDownloader(downloader.newIpDownloader()) 50 // .setDownloader(downloader.newIpDownloader())
47 // 开启5个线程执行,并开始爬取 51 // 开启5个线程执行,并开始爬取
48 - .thread(5).run(); 52 + .thread(20).run();
49 } 53 }
50 54
51 return ServerResult.success(); 55 return ServerResult.success();
52 } 56 }
53 57
54 /** 58 /**
  59 + * journal: natcomputsci/nnano/nphys/nmeth
  60 + *
55 * @return 61 * @return
56 */ 62 */
57 @GetMapping("/search") 63 @GetMapping("/search")
src/main/java/com/canrd/webmagic/job/NatureJob.java
@@ -42,9 +42,15 @@ public class NatureJob { @@ -42,9 +42,15 @@ public class NatureJob {
42 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) 42 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1)
43 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) 43 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2)
44 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) 44 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3)
  45 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 1)
  46 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 2)
  47 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 3)
  48 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 1)
  49 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 2)
  50 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 3)
45 .addPipeline(articlePipeline) 51 .addPipeline(articlePipeline)
46 // 开启20个线程执行,并开始爬取 52 // 开启20个线程执行,并开始爬取
47 - .thread(20).run(); 53 + .thread(30).run();
48 } 54 }
49 } 55 }
50 } 56 }
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor {
50 doArticleList(page); 50 doArticleList(page);
51 } else if (page.getUrl().get().contains("research-articles")) { 51 } else if (page.getUrl().get().contains("research-articles")) {
52 doArticleList4ReSearch(page); 52 doArticleList4ReSearch(page);
53 - } else { 53 + } else if (page.getUrl().get().contains("/articles?searchType=journalSearch")) {
  54 + doArticleList4ReSearch(page);
  55 + }else {
54 doArticleContent(page); 56 doArticleContent(page);
55 } 57 }
56 58