diff --git a/src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java b/src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java index 4439fbe..9bbbae1 100644 --- a/src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java +++ b/src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java @@ -1,12 +1,10 @@ package com.canrd.webmagic.processor; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.TextNode; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.selector.HtmlNode; +import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.XpathSelector; @@ -30,8 +28,32 @@ public class NatureSearchPageProcessor implements PageProcessor { */ @Override public void process(Page page) { + /* Route by URL: the hard-coded first search page goes to doArticleList, any other URL to doArticleContent. */ if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { + doArticleList(page); + } else { + doArticleContent(page); + } + + } + + /** Handles an article page: reads the title via XPath, walks the author list items, and prints only the title. */ private void doArticleContent(Page page) { + // Parse the page + Html html = page.getHtml(); + String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get(); + /* NOTE(review): as a CSS selector the space-separated words after the first class read as descendant tag names, not extra classes - likely meant a dot-joined class chain; confirm against the page markup. */ Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select( + new XpathSelector("li[@class='c-article-author-list__item']")); + List<Selectable> nodes = selectable.nodes(); + StringBuffer authorName = new StringBuffer(); + for (Selectable node : nodes) { + /* NOTE(review): this appends the object returned by xpath() rather than a String from get(), and authorName is not read again in this method - confirm intent. */ authorName.append(node.xpath("//a/text()")); + } + System.out.println("标题:" + title); + } - System.out.println(page.getHtml()); + /** Handles a search-results page; the page number is parsed from the text after the last equals sign in the URL. The rest of the body is only partly visible in this diff. */ private void doArticleList(Page page) { + String url = page.getUrl().get(); + String[] split = url.split("="); + /* NOTE(review): pageIndex is only referenced by the commented-out request at the end of this method, as far as this diff shows. */ Integer pageIndex = Integer.parseInt(split[split.length - 1]); /** * 通过page.getHtml()可以获取到main函数中Spider.create(new 
BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 * 1、通过$或css()方法获取到该page html下某元素dom @@ -44,13 +66,15 @@ public class NatureSearchPageProcessor implements PageProcessor { /** * 获取到指定的dom后,从这些dom中提取元素内容。 */ - System.out.println("今日百度热搜:"); for (int i = 1; i <= nodes.size() - 1; i++) { Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); - String link = node.$("a","href").get(); - String title = node.$("a","text").get(); - System.out.printf("%d、%s,访问地址:%s%n", i, title, link); + String link = node.$("a", "href").get(); + /* Queue the extracted href as a follow-up request for the spider. */ page.addTargetRequest(link); + /* link1 comes from links() and is used only in the printout below. */ String link1 = node.links().get(); + String title = node.$("a", "text").get(); + System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); } +// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex); } @Override