Commit 47bdaf78dddddee43c3bd13d61185bf70b42d2d1
1 parent b1b31dc6
feat: nature article爬取
Showing 2 changed files with 35 additions and 2 deletions
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
@@ -4,8 +4,8 @@ import com.canrd.webmagic.common.constant.ServerResult; | @@ -4,8 +4,8 @@ import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | import com.canrd.webmagic.common.jsr303.OperateGroup; | 4 | import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | ||
8 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | ||
9 | import com.canrd.webmagic.service.NatureArticleService; | 9 | import com.canrd.webmagic.service.NatureArticleService; |
10 | import org.springframework.validation.annotation.Validated; | 10 | import org.springframework.validation.annotation.Validated; |
11 | import org.springframework.web.bind.annotation.*; | 11 | import org.springframework.web.bind.annotation.*; |
@@ -43,6 +43,7 @@ public class NatureArticleController { | @@ -43,6 +43,7 @@ public class NatureArticleController { | ||
43 | Spider.create(natureSearchPageProcessor) | 43 | Spider.create(natureSearchPageProcessor) |
44 | // 添加这个Spider要爬取的网页地址 | 44 | // 添加这个Spider要爬取的网页地址 |
45 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) | 45 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) |
46 | + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) | ||
46 | .addPipeline(articlePipeline) | 47 | .addPipeline(articlePipeline) |
47 | // 开启5个线程执行,并开始爬取 | 48 | // 开启5个线程执行,并开始爬取 |
48 | .thread(5).run(); | 49 | .thread(5).run(); |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -40,12 +40,44 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -40,12 +40,44 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
40 | public void process(Page page) { | 40 | public void process(Page page) { |
41 | if (page.getUrl().get().contains("search")) { | 41 | if (page.getUrl().get().contains("search")) { |
42 | doArticleList(page); | 42 | doArticleList(page); |
43 | - } else { | 43 | + } else if (page.getUrl().get().contains("research-articles")) { |
44 | + doArticleList4ReSearch(page); | ||
45 | + }else { | ||
44 | doArticleContent(page); | 46 | doArticleContent(page); |
45 | } | 47 | } |
46 | 48 | ||
47 | } | 49 | } |
48 | 50 | ||
51 | + /** | ||
52 | + * | ||
53 | + * @param page | ||
54 | + */ | ||
55 | + private void doArticleList4ReSearch(Page page){ | ||
56 | + String url = page.getUrl().get(); | ||
57 | + String[] split = url.split("="); | ||
58 | + Integer pageIndex = Integer.parseInt(split[split.length - 1]); | ||
59 | + /** | ||
60 | + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 | ||
61 | + * 1、通过$或css()方法获取到该page html下某元素dom | ||
62 | + */ | ||
63 | + Selectable selectable = page.getHtml().$(".app-article-list-row").select( | ||
64 | + new XpathSelector("li[@class='app-article-list-row__item']") | ||
65 | + ); | ||
66 | + List<Selectable> nodes = selectable.nodes(); | ||
67 | + | ||
68 | + /** | ||
69 | + * 获取到指定的dom后,从这些dom中提取元素内容。 | ||
70 | + */ | ||
71 | + for (int i = 1; i <= nodes.size() - 1; i++) { | ||
72 | + Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); | ||
73 | + String link = node.$("a", "href").get(); | ||
74 | + page.addTargetRequest(link); | ||
75 | + String link1 = node.links().get(); | ||
76 | + String title = node.$("a", "text").get(); | ||
77 | + System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | ||
78 | + } | ||
79 | + } | ||
80 | + | ||
49 | private void doArticleContent(Page page) { | 81 | private void doArticleContent(Page page) { |
50 | //解析页面 | 82 | //解析页面 |
51 | Html html = page.getHtml(); | 83 | Html html = page.getHtml(); |