Commit 47bdaf78dddddee43c3bd13d61185bf70b42d2d1

Authored by 谢茂盛
1 parent b1b31dc6

feat: nature article爬取

src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
@@ -4,8 +4,8 @@ import com.canrd.webmagic.common.constant.ServerResult;
4 import com.canrd.webmagic.common.jsr303.OperateGroup; 4 import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;  
8 import com.canrd.webmagic.processor.NatureSearchPageProcessor; 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
9 import com.canrd.webmagic.service.NatureArticleService; 9 import com.canrd.webmagic.service.NatureArticleService;
10 import org.springframework.validation.annotation.Validated; 10 import org.springframework.validation.annotation.Validated;
11 import org.springframework.web.bind.annotation.*; 11 import org.springframework.web.bind.annotation.*;
@@ -43,6 +43,7 @@ public class NatureArticleController {
43 Spider.create(natureSearchPageProcessor) 43 Spider.create(natureSearchPageProcessor)
44 // 添加这个Spider要爬取的网页地址 44 // 添加这个Spider要爬取的网页地址
45 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) 45 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
  46 + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
46 .addPipeline(articlePipeline) 47 .addPipeline(articlePipeline)
47 // 开启5个线程执行,并开始爬取 48 // 开启5个线程执行,并开始爬取
48 .thread(5).run(); 49 .thread(5).run();
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -40,12 +40,44 @@ public class NatureSearchPageProcessor implements PageProcessor {
40 public void process(Page page) { 40 public void process(Page page) {
41 if (page.getUrl().get().contains("search")) { 41 if (page.getUrl().get().contains("search")) {
42 doArticleList(page); 42 doArticleList(page);
43 - } else { 43 + } else if (page.getUrl().get().contains("research-articles")) {
  44 + doArticleList4ReSearch(page);
  45 + }else {
44 doArticleContent(page); 46 doArticleContent(page);
45 } 47 }
46 48
47 } 49 }
48 50
  51 + /**
  52 + *
  53 + * @param page
  54 + */
  55 + private void doArticleList4ReSearch(Page page){
  56 + String url = page.getUrl().get();
  57 + String[] split = url.split("=");
  58 + Integer pageIndex = Integer.parseInt(split[split.length - 1]);
  59 + /**
  60 + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
  61 + * 1、通过$或css()方法获取到该page html下某元素dom
  62 + */
  63 + Selectable selectable = page.getHtml().$(".app-article-list-row").select(
  64 + new XpathSelector("li[@class='app-article-list-row__item']")
  65 + );
  66 + List<Selectable> nodes = selectable.nodes();
  67 +
  68 + /**
  69 + * 获取到指定的dom后,从这些dom中提取元素内容。
  70 + */
  71 + for (int i = 1; i <= nodes.size() - 1; i++) {
  72 + Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
  73 + String link = node.$("a", "href").get();
  74 + page.addTargetRequest(link);
  75 + String link1 = node.links().get();
  76 + String title = node.$("a", "text").get();
  77 + System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
  78 + }
  79 + }
  80 +
49 private void doArticleContent(Page page) { 81 private void doArticleContent(Page page) {
50 //解析页面 82 //解析页面
51 Html html = page.getHtml(); 83 Html html = page.getHtml();