Commit 47bdaf78dddddee43c3bd13d61185bf70b42d2d1

Authored by 谢茂盛
1 parent b1b31dc6

feat: nature article crawling

src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
... ... @@ -4,8 +4,8 @@ import com.canrd.webmagic.common.constant.ServerResult;
4 4 import com.canrd.webmagic.common.jsr303.OperateGroup;
5 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7   -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
8 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
9 9 import com.canrd.webmagic.service.NatureArticleService;
10 10 import org.springframework.validation.annotation.Validated;
11 11 import org.springframework.web.bind.annotation.*;
... ... @@ -43,6 +43,7 @@ public class NatureArticleController {
43 43 Spider.create(natureSearchPageProcessor)
44 44 // Add the page URLs for this Spider to crawl
45 45 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
  46 + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
46 47 .addPipeline(articlePipeline)
47 48 // Run with 5 threads and start crawling
48 49 .thread(5).run();
... ...
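For context, the controller change amounts to pointing the same Spider at one more listing URL. Below is a minimal, self-contained sketch of that kickoff using only the WebMagic calls visible in this diff; the keyword, the page index i, and the stub processor/pipeline are placeholders for the Spring beans (NatureSearchPageProcessor, NatureArticlePipeline) injected in the real controller.

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class NatureCrawlSketch {

    /** Trivial stand-in for NatureSearchPageProcessor so the sketch compiles on its own. */
    static class ListingProcessor implements PageProcessor {
        private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

        @Override
        public void process(Page page) {
            // The real processor branches on the URL (search / research-articles / article detail);
            // here we only log which URL was fetched.
            System.out.println("fetched: " + page.getUrl().get());
        }

        @Override
        public Site getSite() {
            return site;
        }
    }

    public static void main(String[] args) {
        String keyword = "battery"; // hypothetical keyword; the controller receives it as a request parameter
        int i = 1;                  // hypothetical page index; the controller loops over pages

        Spider.create(new ListingProcessor())
                // existing search listing plus the research-articles listing added by this commit
                .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
                .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
                // ConsolePipeline stands in for NatureArticlePipeline
                .addPipeline(new ConsolePipeline())
                // 5 worker threads; run() blocks until the crawl finishes
                .thread(5).run();
    }
}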
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -40,12 +40,44 @@ public class NatureSearchPageProcessor implements PageProcessor {
40 40 public void process(Page page) {
41 41 if (page.getUrl().get().contains("search")) {
42 42 doArticleList(page);
43   - } else {
  43 + } else if (page.getUrl().get().contains("research-articles")) {
  44 + doArticleList4ReSearch(page);
  45 + } else {
44 46 doArticleContent(page);
45 47 }
46 48  
47 49 }
48 50  
  51 + /**
  52 + * Parse the Nature research-articles listing page and queue each article detail link for crawling.
  53 + * @param page the research-articles listing page
  54 + */
  55 + private void doArticleList4ReSearch(Page page) {
  56 + String url = page.getUrl().get();
  57 + String[] split = url.split("=");
  58 + Integer pageIndex = Integer.parseInt(split[split.length - 1]);
  59 + /**
  60 + * page.getHtml() returns the content of the pages whose URLs were added via Spider.create(new BaiduHotSearchPageProcessor()).addUrl(...) in the main method.
  61 + * 1. Use $() or css() to select element DOM nodes from the page HTML.
  62 + */
  63 + Selectable selectable = page.getHtml().$(".app-article-list-row").select(
  64 + new XpathSelector("li[@class='app-article-list-row__item']")
  65 + );
  66 + List<Selectable> nodes = selectable.nodes();
  67 +
  68 + /**
  69 + * After selecting the target DOM nodes, extract the element content from them.
  70 + */
  71 + for (int i = 1; i <= nodes.size() - 1; i++) {
  72 + Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
  73 + String link = node.$("a", "href").get();
  74 + page.addTargetRequest(link);
  75 + String link1 = node.links().get();
  76 + String title = node.$("a", "text").get();
  77 + System.out.printf("%d. %s, URL: %s%n", i, title, link1);
  78 + }
  79 + }
  80 +
49 81 private void doArticleContent(Page page) {
50 82 //解析页面
51 83 Html html = page.getHtml();
... ...
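The new doArticleList4ReSearch method walks the listing DOM and queues each article link with page.addTargetRequest. The sketch below reproduces that extraction against an inline HTML snippet shaped like the research-articles listing (class names are taken from the diff, the snippet content is invented); it uses WebMagic's CSS shorthand $() rather than the XpathSelector / .u-full-height chain from the commit, purely to keep the illustration short.

import java.util.List;

import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

public class ListingParseSketch {
    public static void main(String[] args) {
        // Invented snippet mimicking one row of the research-articles listing.
        String snippet =
                "<ul class=\"app-article-list-row\">"
              + "  <li class=\"app-article-list-row__item\">"
              + "    <a class=\"c-card__link u-link-inherit\" href=\"https://www.nature.com/articles/example\">Sample article title</a>"
              + "  </li>"
              + "</ul>";

        Html html = new Html(snippet);

        // One Selectable per <li> listing item (the commit reaches the same items via XpathSelector).
        List<Selectable> items = html.$("li.app-article-list-row__item").nodes();

        for (int i = 0; i < items.size(); i++) {
            Selectable item = items.get(i);
            String link = item.$("a.c-card__link", "href").get();   // article detail URL
            String title = item.$("a.c-card__link", "text").get();  // article title
            System.out.printf("%d. %s, URL: %s%n", i + 1, title, link);
            // In the processor this link would be queued with page.addTargetRequest(link).
        }
    }
}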