feat: (b04d3e62) | Commits | trush / webmagic-canrd-service

Commit b04d3e62cfb7604579ba7b73142d0337c74672b8

Authored by 谢茂盛 2024-04-30 15:56:24 +0800

feat:

1、univie-physnano 爬取

Inline Side-by-side

Showing 2 changed files with 0 additions and 104 deletions

src/main/java/com/canrd/webmagic/processor/BaiduHotSearchPageProcessor.java deleted 100644 → 0

View file @7182dcd

1		-package com.canrd.webmagic.processor;
2		-
3		-import us.codecraft.webmagic.Page;
4		-import us.codecraft.webmagic.Site;
5		-import us.codecraft.webmagic.Spider;
6		-import us.codecraft.webmagic.processor.PageProcessor;
7		-import us.codecraft.webmagic.selector.Selectable;
8		-import us.codecraft.webmagic.selector.XpathSelector;
9		-
10		-import java.util.List;
11		-
12		-/**
13		- * @author: xms
14		- * @description: WebMagic PageProcessor that prints the fetched page HTML, then prints the title and link of the anchors selected under the ".theme-hot" element.
15		- * @date: 2024/4/1 14:19
16		- * @version: 1.0
17		- */
18		-public class BaiduHotSearchPageProcessor implements PageProcessor {
19		-
20		- // Crawl configuration for the target site (original note: encoding, crawl interval, retry count, etc.); here 3 retries and a sleep time of 100 (presumably milliseconds - confirm against WebMagic docs)
21		- private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
22		-
23		- /**
24		- * Core hook for custom crawler logic; the extraction logic is written here.
25		- *
26		- * @param page the fetched page whose HTML is printed and queried
27		- */
28		- @Override
29		- public void process(Page page) {
30		-
31		- System.out.println(page.getHtml());
32		- /**
33		- * page.getHtml() gives the content of the page at the address passed to Spider.create(new BaiduHotSearchPageProcessor()).addUrl in main.
34		- * 1. Use $ or css() to pick an element out of that page's HTML.
35		- */
36		- Selectable selectable = page.getHtml().$(".theme-hot").select(
37		- new XpathSelector("a[@class='item-wrap_2oCLZ']")
38		- );
39		- List<Selectable> nodes = selectable.nodes();
40		-
41		- /**
42		- * With the selected nodes in hand, extract the element content from each of them.
43		- */
44		- System.out.println("今日百度热搜：");
45		- for (int i = 1; i <= nodes.size() - 1; i++) { // NOTE(review): starts at index 1, so nodes.get(0) is never printed - confirm this skip is intentional
46		- Selectable node = nodes.get(i);
47		- String link = node.$(".item-wrap_2oCLZ", "href").get(); // href attribute of the anchor
48		- String title = node.$(".c-single-text-ellipsis", "text").get(); // text of the title element
49		- System.out.printf("%d、%s，访问地址：%s%n", i, title, link);
50		- }
51		- }
52		-
53		- @Override
54		- public Site getSite() {
55		- return site;
56		- }
57		-
58		- public static void main(String[] args) {
59		- // Create a Spider and hand it our processor
60		- Spider.create(new BaiduHotSearchPageProcessor())
61		- // Add the URL this Spider should crawl
62		- .addUrl("https://top.baidu.com/board?platform=pc&sa=pcindex_entry")
63		- // Use 5 threads and start crawling
64		- .thread(5).run();
65		- }
66		-}
67	0	\ No newline at end of file

src/main/java/com/canrd/webmagic/processor/GithubRepoPageProcessor.java deleted 100644 → 0

View file @7182dcd

1		-package com.canrd.webmagic.processor;
2		-
3		-import us.codecraft.webmagic.Page;
4		-import us.codecraft.webmagic.Site;
5		-import us.codecraft.webmagic.Spider;
6		-import us.codecraft.webmagic.processor.PageProcessor;
7		-
8		-/**
9		- * @author: xms
10		- * @description: WebMagic PageProcessor that follows GitHub repository-style links and stores "author", "name" and "readme" fields for each page.
11		- * @date: 2024/4/1 12:11
12		- * @version: 1.0
13		- */
14		-public class GithubRepoPageProcessor implements PageProcessor {
15		-
16		- private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); // 3 retries, sleep time of 1000 (presumably milliseconds - confirm against WebMagic docs)
17		-
18		- @Override
19		- public void process(Page page) {
20		- page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); // queue every link on the page matching github.com/<word>/<word>
21		- page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); // first path segment captured from the page URL
22		- page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); // anchor text inside the h1 with class 'public'
23		- if (page.getResultItems().get("name")==null){
24		- // no "name" was extracted, so mark this page to be skipped
25		- page.setSkip(true);
26		- }
27		- page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); // NOTE(review): stored without toString(), unlike "author" and "name" - confirm intended
28		- }
29		-
30		- @Override
31		- public Site getSite() {
32		- return site;
33		- }
34		-
35		- public static void main(String[] args) {
36		- Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); // crawl starting from this URL with 5 threads
37		- }
38		-}
39	0	\ No newline at end of file

trush / webmagic-canrd-service · Commits

GitLab

feat: