Commit 98eb2cc8cf6a2f94e8021decb0e3c9b62ef5e7d4
1 parent
885a66af
feat:
1、nature 爬取调整
Showing
3 changed files
with
17 additions
and
3 deletions
src/main/java/com/canrd/webmagic/controller/NatureController.java
... | ... | @@ -42,16 +42,22 @@ public class NatureController { |
42 | 42 | for (int i = 1; i <= indexSize; i++) { |
43 | 43 | Spider.create(natureSearchPageProcessor) |
44 | 44 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
45 | + .addUrl("https://www.nature.com/nature/articles?searchType=journalSearch&sort=PubDate&type=article&page=" + i) | |
46 | + .addUrl("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=" + i) | |
47 | + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=" + i) | |
48 | + .addUrl("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=" + i) | |
45 | 49 | .addPipeline(articlePipeline) |
46 | 50 | // .setDownloader(downloader.newIpDownloader()) |
47 | 51 | // Start crawling with 20 worker threads |
48 | - .thread(5).run(); | |
52 | + .thread(20).run(); | |
49 | 53 | } |
50 | 54 | |
51 | 55 | return ServerResult.success(); |
52 | 56 | } |
53 | 57 | |
54 | 58 | /** |
59 | + * journal: natcomputsci/nnano/nphys/nmeth | |
60 | + * | |
55 | 61 | * @return |
56 | 62 | */ |
57 | 63 | @GetMapping("/search") | ... | ... |
src/main/java/com/canrd/webmagic/job/NatureJob.java
... | ... | @@ -42,9 +42,15 @@ public class NatureJob { |
42 | 42 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 1) |
43 | 43 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 2) |
44 | 44 | .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nphys&order=date_desc&page=" + 3) |
45 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 1) | |
46 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 2) | |
47 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=natcomputsci&order=date_desc&page=" + 3) | |
48 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 1) | |
49 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 2) | |
50 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&journal=nmeth&order=date_desc&page=" + 3) | |
45 | 51 | .addPipeline(articlePipeline) |
46 | 52 | // Start crawling with 30 worker threads |
47 | - .thread(20).run(); | |
53 | + .thread(30).run(); | |
48 | 54 | } |
49 | 55 | } |
50 | 56 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -50,7 +50,9 @@ public class NatureSearchPageProcessor implements PageProcessor { |
50 | 50 | doArticleList(page); |
51 | 51 | } else if (page.getUrl().get().contains("research-articles")) { |
52 | 52 | doArticleList4ReSearch(page); |
53 | - } else { | |
53 | + } else if (page.getUrl().get().contains("/articles?searchType=journalSearch")) { | |
54 | + doArticleList4ReSearch(page); | |
55 | + }else { | |
54 | 56 | doArticleContent(page); |
55 | 57 | } |
56 | 58 | ... | ... |