Commit 98eb2cc8cf6a2f94e8021decb0e3c9b62ef5e7d4
1 parent: 885a66af
feat:
1、nature 爬取调整
Showing 3 changed files with 17 additions and 3 deletions
src/main/java/com/canrd/webmagic/controller/NatureController.java
@@ -42,16 +42,22 @@ public class NatureController { | @@ -42,16 +42,22 @@ public class NatureController { | ||
42 | for (int i = 1; i <= indexSize; i++) { | 42 | for (int i = 1; i <= indexSize; i++) { |
43 | Spider.create(natureSearchPageProcessor) | 43 | Spider.create(natureSearchPageProcessor) |
44 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) | 44 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
45 | + .addUrl("https://www.nature.com/nature/articles?searchType=journalSearch&sort=PubDate&type=article&page=" + i) | ||
46 | + .addUrl("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=" + i) | ||
47 | + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=" + i) | ||
48 | + .addUrl("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=" + i) | ||
45 | .addPipeline(articlePipeline) | 49 | .addPipeline(articlePipeline) |
46 | // .setDownloader(downloader.newIpDownloader()) | 50 | // .setDownloader(downloader.newIpDownloader()) |
47 | // 开启5个线程执行,并开始爬取 | 51 | // 开启5个线程执行,并开始爬取 |
48 | - .thread(5).run(); | 52 | + .thread(20).run(); |
49 | } | 53 | } |
50 | 54 | ||
51 | return ServerResult.success(); | 55 | return ServerResult.success(); |
52 | } | 56 | } |
53 | 57 | ||
54 | /** | 58 | /** |
59 | + * journal: natcomputsci/nnano/nphys/nmeth | ||
60 | + * | ||
55 | * @return | 61 | * @return |
56 | */ | 62 | */ |
57 | @GetMapping("/search") | 63 | @GetMapping("/search") |
src/main/java/com/canrd/webmagic/job/NatureJob.java
@@ -42,9 +42,15 @@ public class NatureJob { | @@ -42,9 +42,15 @@ public class NatureJob { | ||
42 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) | 42 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) |
43 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) | 43 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) |
44 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) | 44 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) |
45 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 1) | ||
46 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 2) | ||
47 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 3) | ||
48 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 1) | ||
49 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 2) | ||
50 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 3) | ||
45 | .addPipeline(articlePipeline) | 51 | .addPipeline(articlePipeline) |
46 | // 开启20个线程执行,并开始爬取 | 52 | // 开启20个线程执行,并开始爬取 |
47 | - .thread(20).run(); | 53 | + .thread(30).run(); |
48 | } | 54 | } |
49 | } | 55 | } |
50 | } | 56 | } |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
50 | doArticleList(page); | 50 | doArticleList(page); |
51 | } else if (page.getUrl().get().contains("research-articles")) { | 51 | } else if (page.getUrl().get().contains("research-articles")) { |
52 | doArticleList4ReSearch(page); | 52 | doArticleList4ReSearch(page); |
53 | - } else { | 53 | + } else if (page.getUrl().get().contains("/articles?searchType=journalSearch")) { |
54 | + doArticleList4ReSearch(page); | ||
55 | + }else { | ||
54 | doArticleContent(page); | 56 | doArticleContent(page); |
55 | } | 57 | } |
56 | 58 |