Commit b04d3e62cfb7604579ba7b73142d0337c74672b8
1 parent
7182dcd5
feat:
1、univie-physnano 爬取
Showing
2 changed files
with
0 additions
and
104 deletions
src/main/java/com/canrd/webmagic/processor/BaiduHotSearchPageProcessor.java deleted
100644 → 0
1 | -package com.canrd.webmagic.processor; | ||
2 | - | ||
3 | -import us.codecraft.webmagic.Page; | ||
4 | -import us.codecraft.webmagic.Site; | ||
5 | -import us.codecraft.webmagic.Spider; | ||
6 | -import us.codecraft.webmagic.processor.PageProcessor; | ||
7 | -import us.codecraft.webmagic.selector.Selectable; | ||
8 | -import us.codecraft.webmagic.selector.XpathSelector; | ||
9 | - | ||
10 | -import java.util.List; | ||
11 | - | ||
12 | -/** | ||
13 | - * @author: xms | ||
14 | - * @description: WebMagic PageProcessor that prints the Baidu hot-search titles and links found on the crawled page | ||
15 | - * @date: 2024/4/1 14:19 | ||
16 | - * @version: 1.0 | ||
17 | - */ | ||
18 | -public class BaiduHotSearchPageProcessor implements PageProcessor { | ||
19 | - | ||
20 | - // Site configuration for the crawl, such as encoding, crawl interval and retry count | ||
21 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | ||
22 | - | ||
23 | - /** | ||
24 | - * Core hook for customizing the crawler logic; the extraction logic is written here | ||
25 | - * | ||
26 | - * @param page the page whose HTML is printed and then searched for hot-search entries | ||
27 | - */ | ||
28 | - @Override | ||
29 | - public void process(Page page) { | ||
30 | - | ||
31 | - System.out.println(page.getHtml()); | ||
32 | - /** | ||
33 | - * page.getHtml() gives the content of the page whose address was passed to Spider.create(new BaiduHotSearchPageProcessor()).addUrl in main | ||
34 | - * 1. Use the $ or css() method to get the DOM of an element within this page's HTML | ||
35 | - */ | ||
36 | - Selectable selectable = page.getHtml().$(".theme-hot").select( | ||
37 | - new XpathSelector("a[@class='item-wrap_2oCLZ']") | ||
38 | - ); | ||
39 | - List<Selectable> nodes = selectable.nodes(); | ||
40 | - | ||
41 | - /** | ||
42 | - * Once the target DOM nodes are obtained, extract the element content from them. NOTE(review): the loop below starts at index 1, so nodes.get(0) is never printed - confirm this is intended. | ||
43 | - */ | ||
44 | - System.out.println("今日百度热搜:"); | ||
45 | - for (int i = 1; i <= nodes.size() - 1; i++) { | ||
46 | - Selectable node = nodes.get(i); | ||
47 | - String link = node.$(".item-wrap_2oCLZ", "href").get(); | ||
48 | - String title = node.$(".c-single-text-ellipsis", "text").get(); | ||
49 | - System.out.printf("%d、%s,访问地址:%s%n", i, title, link); | ||
50 | - } | ||
51 | - } | ||
52 | - | ||
53 | - @Override | ||
54 | - public Site getSite() { | ||
55 | - return site; | ||
56 | - } | ||
57 | - | ||
58 | - public static void main(String[] args) { | ||
59 | - // Create a Spider and hand it our processor | ||
60 | - Spider.create(new BaiduHotSearchPageProcessor()) | ||
61 | - // Add the URL of the page this Spider should crawl | ||
62 | - .addUrl("https://top.baidu.com/board?platform=pc&sa=pcindex_entry") | ||
63 | - // Run with 5 threads and start crawling | ||
64 | - .thread(5).run(); | ||
65 | - } | ||
66 | -} | ||
67 | \ No newline at end of file | 0 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/processor/GithubRepoPageProcessor.java deleted
100644 → 0
1 | -package com.canrd.webmagic.processor; | ||
2 | - | ||
3 | -import us.codecraft.webmagic.Page; | ||
4 | -import us.codecraft.webmagic.Site; | ||
5 | -import us.codecraft.webmagic.Spider; | ||
6 | -import us.codecraft.webmagic.processor.PageProcessor; | ||
7 | - | ||
8 | -/** | ||
9 | - * @author: xms | ||
10 | - * @description: WebMagic PageProcessor that queues links matching https://github.com/&lt;word&gt;/&lt;word&gt; and stores the "author", "name" and "readme" fields of each page | ||
11 | - * @date: 2024/4/1 12:11 | ||
12 | - * @version: 1.0 | ||
13 | - */ | ||
14 | -public class GithubRepoPageProcessor implements PageProcessor { | ||
15 | - | ||
16 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); | ||
17 | - | ||
18 | - @Override | ||
19 | - public void process(Page page) { | ||
20 | - page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); | ||
21 | - page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); | ||
22 | - page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); | ||
23 | - if (page.getResultItems().get("name")==null){ | ||
24 | - // skip this page: no "name" value was extracted from it | ||
25 | - page.setSkip(true); | ||
26 | - } | ||
27 | - page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); | ||
28 | - } | ||
29 | - | ||
30 | - @Override | ||
31 | - public Site getSite() { | ||
32 | - return site; | ||
33 | - } | ||
34 | - | ||
35 | - public static void main(String[] args) { | ||
36 | - Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); | ||
37 | - } | ||
38 | -} | ||
39 | \ No newline at end of file | 0 | \ No newline at end of file |