Commit df1e30a9a5be3aabae545b5c8702c060d922722b
1 parent 54e88191
feat: init项目
Showing 1 changed file with 32 additions and 8 deletions
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
1 | package com.canrd.webmagic.processor; | 1 | package com.canrd.webmagic.processor; |
2 | 2 | ||
3 | -import org.jsoup.nodes.Element; | ||
4 | -import org.jsoup.nodes.TextNode; | ||
5 | import us.codecraft.webmagic.Page; | 3 | import us.codecraft.webmagic.Page; |
6 | import us.codecraft.webmagic.Site; | 4 | import us.codecraft.webmagic.Site; |
7 | import us.codecraft.webmagic.Spider; | 5 | import us.codecraft.webmagic.Spider; |
8 | import us.codecraft.webmagic.processor.PageProcessor; | 6 | import us.codecraft.webmagic.processor.PageProcessor; |
9 | -import us.codecraft.webmagic.selector.HtmlNode; | 7 | +import us.codecraft.webmagic.selector.Html; |
10 | import us.codecraft.webmagic.selector.Selectable; | 8 | import us.codecraft.webmagic.selector.Selectable; |
11 | import us.codecraft.webmagic.selector.XpathSelector; | 9 | import us.codecraft.webmagic.selector.XpathSelector; |
12 | 10 | ||
@@ -30,8 +28,32 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -30,8 +28,32 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
30 | */ | 28 | */ |
31 | @Override | 29 | @Override |
32 | public void process(Page page) { | 30 | public void process(Page page) { |
31 | + if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { | ||
32 | + doArticleList(page); | ||
33 | + } else { | ||
34 | + doArticleContent(page); | ||
35 | + } | ||
36 | + | ||
37 | + } | ||
38 | + | ||
39 | + private void doArticleContent(Page page) { | ||
40 | + //解析页面 | ||
41 | + Html html = page.getHtml(); | ||
42 | + String title = html.xpath("//div[@class='c-article-header']/header/div/h1/text()").get(); | ||
43 | + Selectable selectable = page.getHtml().$(".c-article-author-list c-article-author-list--short js-no-scroll").select( | ||
44 | + new XpathSelector("li[@class='c-article-author-list__item']")); | ||
45 | + List<Selectable> nodes = selectable.nodes(); | ||
46 | + StringBuffer authorName = new StringBuffer(); | ||
47 | + for (Selectable node : nodes) { | ||
48 | + authorName.append(node.xpath("//a/text()")); | ||
49 | + } | ||
50 | + System.out.println("标题:" + title); | ||
51 | + } | ||
33 | 52 | ||
34 | - System.out.println(page.getHtml()); | 53 | + private void doArticleList(Page page) { |
54 | + String url = page.getUrl().get(); | ||
55 | + String[] split = url.split("="); | ||
56 | + Integer pageIndex = Integer.parseInt(split[split.length - 1]); | ||
35 | /** | 57 | /** |
36 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 | 58 | * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 |
37 | * 1、通过$或css()方法获取到该page html下某元素dom | 59 | * 1、通过$或css()方法获取到该page html下某元素dom |
@@ -44,13 +66,15 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -44,13 +66,15 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
44 | /** | 66 | /** |
45 | * 获取到指定的dom后,从这些dom中提取元素内容。 | 67 | * 获取到指定的dom后,从这些dom中提取元素内容。 |
46 | */ | 68 | */ |
47 | - System.out.println("今日百度热搜:"); | ||
48 | for (int i = 1; i <= nodes.size() - 1; i++) { | 69 | for (int i = 1; i <= nodes.size() - 1; i++) { |
49 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); | 70 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
50 | - String link = node.$("a","href").get(); | ||
51 | - String title = node.$("a","text").get(); | ||
52 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link); | 71 | + String link = node.$("a", "href").get(); |
72 | + page.addTargetRequest(link); | ||
73 | + String link1 = node.links().get(); | ||
74 | + String title = node.$("a", "text").get(); | ||
75 | + System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); | ||
53 | } | 76 | } |
77 | +// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex); | ||
54 | } | 78 | } |
55 | 79 | ||
56 | @Override | 80 | @Override |