Commit b04d3e62cfb7604579ba7b73142d0337c74672b8
1 parent
7182dcd5
feat:
1、univie-physnano 爬取
Showing
2 changed files
with
0 additions
and
104 deletions
src/main/java/com/canrd/webmagic/processor/BaiduHotSearchPageProcessor.java deleted
100644 → 0
1 | -package com.canrd.webmagic.processor; | |
2 | - | |
3 | -import us.codecraft.webmagic.Page; | |
4 | -import us.codecraft.webmagic.Site; | |
5 | -import us.codecraft.webmagic.Spider; | |
6 | -import us.codecraft.webmagic.processor.PageProcessor; | |
7 | -import us.codecraft.webmagic.selector.Selectable; | |
8 | -import us.codecraft.webmagic.selector.XpathSelector; | |
9 | - | |
10 | -import java.util.List; | |
11 | - | |
12 | -/** | |
13 | - * @author: xms | |
14 | - * @description: Page processor that prints the fetched HTML and then the title and link of entries selected from the Baidu hot-search board page | |
15 | - * @date: 2024/4/1 14:19 | |
16 | - * @version: 1.0 | |
17 | - */ | |
18 | -public class BaiduHotSearchPageProcessor implements PageProcessor { | |
19 | - | |
20 | - // Crawl settings for the target site (encoding, crawl interval, retry count, etc.); here: retry times 3, sleep time 100 | |
21 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
22 | - | |
23 | - /** | |
24 | - * Core hook for customizing the crawler; the extraction logic is written here. | |
25 | - * | |
26 | - * @param page the page whose HTML is printed and searched for hot-search entries | |
27 | - */ | |
28 | - @Override | |
29 | - public void process(Page page) { | |
30 | - | |
31 | - System.out.println(page.getHtml()); | |
32 | - /** | |
33 | - * page.getHtml() gives the content of the page whose address was passed to Spider.create(new BaiduHotSearchPageProcessor()).addUrl in main. | |
34 | - * 1. Use the $ or css() method to obtain the DOM of an element under this page's HTML. | |
35 | - */ | |
36 | - Selectable selectable = page.getHtml().$(".theme-hot").select( | |
37 | - new XpathSelector("a[@class='item-wrap_2oCLZ']") | |
38 | - ); | |
39 | - List<Selectable> nodes = selectable.nodes(); | |
40 | - | |
41 | - /** | |
42 | - * Once the target DOM nodes are obtained, extract the element content from them. | |
43 | - */ | |
44 | - System.out.println("今日百度热搜:"); | |
45 | - for (int i = 1; i <= nodes.size() - 1; i++) { /* NOTE(review): starts at index 1, so nodes.get(0) is never printed - confirm this is intentional */ | |
46 | - Selectable node = nodes.get(i); | |
47 | - String link = node.$(".item-wrap_2oCLZ", "href").get(); | |
48 | - String title = node.$(".c-single-text-ellipsis", "text").get(); | |
49 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link); | |
50 | - } | |
51 | - } | |
52 | - | |
53 | - @Override | |
54 | - public Site getSite() { | |
55 | - return site; | |
56 | - } | |
57 | - | |
58 | - public static void main(String[] args) { | |
59 | - // Create a Spider and hand it our processor | |
60 | - Spider.create(new BaiduHotSearchPageProcessor()) | |
61 | - // Add the address of the page this Spider should crawl | |
62 | - .addUrl("https://top.baidu.com/board?platform=pc&sa=pcindex_entry") | |
63 | - // Run with 5 threads and start crawling | |
64 | - .thread(5).run(); | |
65 | - } | |
66 | -} | |
67 | 0 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/processor/GithubRepoPageProcessor.java deleted
100644 → 0
1 | -package com.canrd.webmagic.processor; | |
2 | - | |
3 | -import us.codecraft.webmagic.Page; | |
4 | -import us.codecraft.webmagic.Site; | |
5 | -import us.codecraft.webmagic.Spider; | |
6 | -import us.codecraft.webmagic.processor.PageProcessor; | |
7 | - | |
8 | -/** | |
9 | - * @author: xms | |
10 | - * @description: Page processor that queues links matching https://github.com/owner/repo and stores the author, name and readme fields of each page | |
11 | - * @date: 2024/4/1 12:11 | |
12 | - * @version: 1.0 | |
13 | - */ | |
14 | -public class GithubRepoPageProcessor implements PageProcessor { | |
15 | - | |
16 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); | |
17 | - | |
18 | - @Override | |
19 | - public void process(Page page) { | |
20 | - page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); | |
21 | - page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); | |
22 | - page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); | |
23 | - if (page.getResultItems().get("name")==null){ | |
24 | - //skip this page: no "name" field was extracted from it | |
25 | - page.setSkip(true); | |
26 | - } | |
27 | - page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); /* NOTE(review): still executed after setSkip(true) above - confirm this is intended */ | |
28 | - } | |
29 | - | |
30 | - @Override | |
31 | - public Site getSite() { | |
32 | - return site; | |
33 | - } | |
34 | - | |
35 | - public static void main(String[] args) { | |
36 | - Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); | |
37 | - } | |
38 | -} | |
39 | 0 | \ No newline at end of file |