Commit 98eb2cc8cf6a2f94e8021decb0e3c9b62ef5e7d4

Authored by 谢茂盛
1 parent 885a66af

feat:

1、nature 爬取调整
src/main/java/com/canrd/webmagic/controller/NatureController.java
... ... @@ -42,16 +42,22 @@ public class NatureController {
42 42 for (int i = 1; i <= indexSize; i++) {
43 43 Spider.create(natureSearchPageProcessor)
44 44 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
  45 + .addUrl("https://www.nature.com/nature/articles?searchType=journalSearch&sort=PubDate&type=article&page=" + i)
  46 + .addUrl("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=" + i)
  47 + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=" + i)
  48 + .addUrl("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=" + i)
45 49 .addPipeline(articlePipeline)
46 50 // .setDownloader(downloader.newIpDownloader())
47 51 // 开启20个线程执行,并开始爬取
48   - .thread(5).run();
  52 + .thread(20).run();
49 53 }
50 54  
51 55 return ServerResult.success();
52 56 }
53 57  
54 58 /**
  59 + * journal: natcomputsci/nnano/nphys/nmeth
  60 + *
55 61 * @return
56 62 */
57 63 @GetMapping("/search")
... ...
src/main/java/com/canrd/webmagic/job/NatureJob.java
... ... @@ -42,9 +42,15 @@ public class NatureJob {
42 42 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1)
43 43 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2)
44 44 .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3)
  45 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 1)
  46 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 2)
  47 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 3)
  48 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 1)
  49 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 2)
  50 + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 3)
45 51 .addPipeline(articlePipeline)
46 52 // 开启30个线程执行,并开始爬取
47   - .thread(20).run();
  53 + .thread(30).run();
48 54 }
49 55 }
50 56 }
... ...
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor {
50 50 doArticleList(page);
51 51 } else if (page.getUrl().get().contains("research-articles")) {
52 52 doArticleList4ReSearch(page);
53   - } else {
  53 + } else if (page.getUrl().get().contains("/articles?searchType=journalSearch")) {
  54 + doArticleList4ReSearch(page);
  55 + }else {
54 56 doArticleContent(page);
55 57 }
56 58  
... ...