Commit df1e30a9a5be3aabae545b5c8702c060d922722b

Authored by 谢茂盛
1 parent 54e88191

feat: init项目

src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 package com.canrd.webmagic.processor; 1 package com.canrd.webmagic.processor;
2 2
3 -import org.jsoup.nodes.Element;  
4 -import org.jsoup.nodes.TextNode;  
5 import us.codecraft.webmagic.Page; 3 import us.codecraft.webmagic.Page;
6 import us.codecraft.webmagic.Site; 4 import us.codecraft.webmagic.Site;
7 import us.codecraft.webmagic.Spider; 5 import us.codecraft.webmagic.Spider;
8 import us.codecraft.webmagic.processor.PageProcessor; 6 import us.codecraft.webmagic.processor.PageProcessor;
9 -import us.codecraft.webmagic.selector.HtmlNode; 7 +import us.codecraft.webmagic.selector.Html;
10 import us.codecraft.webmagic.selector.Selectable; 8 import us.codecraft.webmagic.selector.Selectable;
11 import us.codecraft.webmagic.selector.XpathSelector; 9 import us.codecraft.webmagic.selector.XpathSelector;
12 10
@@ -30,8 +28,32 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -30,8 +28,32 @@ public class NatureSearchPageProcessor implements PageProcessor {
30 */ 28 */
31 @Override 29 @Override
32 public void process(Page page) { 30 public void process(Page page) {
  31 + if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) {
  32 + doArticleList(page);
  33 + } else {
  34 + doArticleContent(page);
  35 + }
  36 +
  37 + }
  38 +
  39 + private void doArticleContent(Page page) {
  40 + //解析页面
  41 + Html html = page.getHtml();
  42 + String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get();
  43 + Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select(
  44 + new XpathSelector("li[@class='c-article-author-list__item']"));
  45 + List<Selectable> nodes = selectable.nodes();
  46 + StringBuffer authorName = new StringBuffer();
  47 + for (Selectable node : nodes) {
  48 + authorName.append(node.xpath("//a/text()"));
  49 + }
  50 + System.out.println("标题:" + title);
  51 + }
33 52
34 - System.out.println(page.getHtml()); 53 + private void doArticleList(Page page) {
  54 + String url = page.getUrl().get();
  55 + String[] split = url.split("=");
  56 + Integer pageIndex = Integer.parseInt(split[split.length - 1]);
35 /** 57 /**
36 * 通过page.getHtml()可以获取到main函数中Spider.create(new NatureSearchPageProcessor()).addUrl中的地址的网页内容 58 * 通过page.getHtml()可以获取到main函数中Spider.create(new NatureSearchPageProcessor()).addUrl中的地址的网页内容
37 * 1、通过$或css()方法获取到该page html下某元素dom 59 * 1、通过$或css()方法获取到该page html下某元素dom
@@ -44,13 +66,15 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -44,13 +66,15 @@ public class NatureSearchPageProcessor implements PageProcessor {
44 /** 66 /**
45 * 获取到指定的dom后,从这些dom中提取元素内容。 67 * 获取到指定的dom后,从这些dom中提取元素内容。
46 */ 68 */
47 - System.out.println("今日百度热搜:");  
48 for (int i = 1; i <= nodes.size() - 1; i++) { 69 for (int i = 1; i <= nodes.size() - 1; i++) {
49 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); 70 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
50 - String link = node.$("a","href").get();  
51 - String title = node.$("a","text").get();  
52 - System.out.printf("%d、%s,访问地址:%s%n", i, title, link); 71 + String link = node.$("a", "href").get();
  72 + page.addTargetRequest(link);
  73 + String link1 = node.links().get();
  74 + String title = node.$("a", "text").get();
  75 + System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
53 } 76 }
  77 +// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex);
54 } 78 }
55 79
56 @Override 80 @Override