Commit 47bdaf78dddddee43c3bd13d61185bf70b42d2d1
1 parent: b1b31dc6
feat: nature article crawling
Showing 2 changed files with 35 additions and 2 deletions
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
@@ -4,8 +4,8 @@ import com.canrd.webmagic.common.constant.ServerResult;
 import com.canrd.webmagic.common.jsr303.OperateGroup;
 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
 import com.canrd.webmagic.domain.vo.NatureArticleVO;
-import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
+import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
 import com.canrd.webmagic.service.NatureArticleService;
 import org.springframework.validation.annotation.Validated;
 import org.springframework.web.bind.annotation.*;
@@ -43,6 +43,7 @@ public class NatureArticleController {
         Spider.create(natureSearchPageProcessor)
                 // Add the URLs this Spider should crawl
                 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
+                .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
                 .addPipeline(articlePipeline)
                 // Run with 5 threads and start crawling
                 .thread(5).run();
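For context, the sketch below shows how this Spider setup plausibly runs end to end. The processor, pipeline, page-index variable `i`, and both URLs come from the diff; the method signature and loop bounds are assumptions for illustration:

```java
// Minimal sketch of the crawl launch shown in the diff. The loop bounds and
// method signature are assumptions; processor/pipeline types come from the diff.
import com.canrd.webmagic.processor.NatureSearchPageProcessor;
import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
import us.codecraft.webmagic.Spider;

public class NatureCrawlLauncherSketch {
    public static void launch(NatureSearchPageProcessor processor,
                              NatureArticlePipeline pipeline,
                              String keyword, int pageCount) {
        for (int i = 1; i <= pageCount; i++) {
            Spider.create(processor)
                    // Two entry points share one Spider run: the keyword search
                    // results and the date-sorted research-articles listing
                    // for the same page index.
                    .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
                    .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
                    .addPipeline(pipeline)
                    // 5 worker threads; run() blocks until the queue drains.
                    .thread(5).run();
        }
    }
}
```

Since `run()` blocks until the crawl queue drains, page indexes are processed sequentially; WebMagic's `runAsync()` could overlap them if throughput mattered.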
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -40,12 +40,44 @@ public class NatureSearchPageProcessor implements PageProcessor {
     public void process(Page page) {
         if (page.getUrl().get().contains("search")) {
             doArticleList(page);
-        } else {
+        } else if (page.getUrl().get().contains("research-articles")) {
+            doArticleList4ReSearch(page);
+        } else {
             doArticleContent(page);
         }
 
     }
 
+    /**
+     * Parse a research-articles listing page and queue each article link for crawling.
+     * @param page the listing page fetched by the spider
+     */
+    private void doArticleList4ReSearch(Page page) {
+        String url = page.getUrl().get();
+        String[] split = url.split("=");
+        Integer pageIndex = Integer.parseInt(split[split.length - 1]);
+        /**
+         * page.getHtml() returns the content of the pages added via Spider.create(...).addUrl(...).
+         * 1. Use the $() or css() methods to select the target element DOM from this page's HTML.
+         */
+        Selectable selectable = page.getHtml().$(".app-article-list-row").select(
+                new XpathSelector("li[@class='app-article-list-row__item']")
+        );
+        List<Selectable> nodes = selectable.nodes();
+
+        /**
+         * With the target DOM nodes selected, extract the element content from them.
+         */
+        for (int i = 1; i <= nodes.size() - 1; i++) {
+            Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
+            String link = node.$("a", "href").get();
+            page.addTargetRequest(link);
+            String link1 = node.links().get();
+            String title = node.$("a", "text").get();
+            System.out.printf("%d. %s, URL: %s%n", i, title, link1);
+        }
+    }
+
     private void doArticleContent(Page page) {
         // Parse the page
         Html html = page.getHtml();
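The updated `process(Page)` dispatches on URL substrings: search listings and research-articles listings go to list handlers, and everything else is treated as an article detail page. Below is a condensed, self-contained sketch of that routing pattern; the `Site` settings are assumptions and the branch bodies are stubbed:

```java
// Condensed sketch of the URL-dispatch pattern used by NatureSearchPageProcessor.
// The Site settings here are assumptions; branch bodies are stubbed.
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class UrlDispatchSketch implements PageProcessor {
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (url.contains("search")) {
            // keyword search listing: extract article links
        } else if (url.contains("research-articles")) {
            // research-articles listing: extract article links
        } else {
            // Anything else is assumed to be an article detail page. Links queued
            // via page.addTargetRequest() fail both substring checks when they
            // are fetched later, so they fall through to this branch.
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}
```

One caveat: the substring test is fragile, since an article URL that happens to contain `search` would be misrouted to the listing branch.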
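The extraction loop in `doArticleList4ReSearch` walks a deep chain of nested node lookups to reach each card's anchor. As a point of comparison, here is a hedged sketch that selects the anchors in a single CSS pass; the selector path is inferred from the class names in the diff and is not verified against Nature's live markup:

```java
// Hedged sketch: pull title and href from each listing card in one selector pass.
// The CSS path is inferred from class names in the diff and may not match
// Nature's current markup.
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.Selectable;

class ListingExtractionSketch {
    static void extractAndQueue(Page page) {
        List<Selectable> anchors = page.getHtml()
                .$("li.app-article-list-row__item a.c-card__link")
                .nodes();
        for (int i = 0; i < anchors.size(); i++) {
            Selectable a = anchors.get(i);
            String title = a.$("a", "text").get(); // link text is the article title
            String href = a.links().get();         // article URL extracted from the anchor
            page.addTargetRequest(href);           // queue the article page for a later visit
            System.out.printf("%d. %s, URL: %s%n", i + 1, title, href);
        }
    }
}
```

Note also that the original loop starts at `i = 1`, silently skipping the first list item; whether that is intentional (a header card, say) is not clear from the diff.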