Commit b04d3e62cfb7604579ba7b73142d0337c74672b8

Authored by 谢茂盛
1 parent 7182dcd5

feat:

1. Crawl univie-physnano
src/main/java/com/canrd/webmagic/processor/BaiduHotSearchPageProcessor.java deleted 100644 → 0
-package com.canrd.webmagic.processor;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.selector.Selectable;
-import us.codecraft.webmagic.selector.XpathSelector;
-
-import java.util.List;
-
-/**
- * @author: xms
- * @description: TODO
- * @date: 2024/4/1 14:19
- * @version: 1.0
- */
-public class BaiduHotSearchPageProcessor implements PageProcessor {
-
-    // Site-level crawl configuration: encoding, crawl interval, retry count, etc.
-    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
-
-    /**
-     * Core hook for the crawler logic; the extraction code goes here.
-     *
-     * @param page
-     */
-    @Override
-    public void process(Page page) {
-        System.out.println(page.getHtml());
-        /*
-         * page.getHtml() returns the content fetched from the URL registered in
-         * main via Spider.create(new BaiduHotSearchPageProcessor()).addUrl.
-         * 1. Use $() or css() to select element DOM under this page's HTML.
-         */
-        Selectable selectable = page.getHtml().$(".theme-hot").select(
-                new XpathSelector("a[@class='item-wrap_2oCLZ']")
-        );
-        List<Selectable> nodes = selectable.nodes();
-
-        /*
-         * With the target DOM nodes selected, extract their contents.
-         */
-        System.out.println("Today's Baidu hot searches:");
-        // Note: the loop starts at index 1, so the first matched node is skipped.
-        for (int i = 1; i <= nodes.size() - 1; i++) {
-            Selectable node = nodes.get(i);
-            String link = node.$(".item-wrap_2oCLZ", "href").get();
-            String title = node.$(".c-single-text-ellipsis", "text").get();
-            System.out.printf("%d. %s, link: %s%n", i, title, link);
-        }
-    }
-
-    @Override
-    public Site getSite() {
-        return site;
-    }
-
-    public static void main(String[] args) {
-        // Create a Spider and register our page processor.
-        Spider.create(new BaiduHotSearchPageProcessor())
-                // Add the URL this Spider should crawl.
-                .addUrl("https://top.baidu.com/board?platform=pc&sa=pcindex_entry")
-                // Run with 5 threads and start crawling.
-                .thread(5).run();
-    }
-}
\ No newline at end of file
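The commit message records a univie-physnano crawler as the feature added alongside these deletions, but the new file itself is not shown in this diff. Below is a minimal sketch, assuming the new crawler follows the same WebMagic PageProcessor pattern as the deleted examples; the class name, start URL, and XPath selector are hypothetical, not taken from the actual commit.

    package com.canrd.webmagic.processor;

    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;

    /**
     * Hypothetical sketch only: the real univie-physnano processor added by
     * this commit is not visible in the diff. Name, URL, and selector are guesses.
     */
    public class UniviePhysnanoPageProcessor implements PageProcessor {

        private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

        @Override
        public void process(Page page) {
            // Placeholder extraction: grab the page's first-level heading.
            page.putField("title", page.getHtml().xpath("//h1/text()").toString());
        }

        @Override
        public Site getSite() {
            return site;
        }

        public static void main(String[] args) {
            // The host is assumed from the "univie-physnano" name in the commit message.
            Spider.create(new UniviePhysnanoPageProcessor())
                    .addUrl("https://physnano.univie.ac.at/")
                    .thread(5).run();
        }
    }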
src/main/java/com/canrd/webmagic/processor/GithubRepoPageProcessor.java deleted 100644 → 0
-package com.canrd.webmagic.processor;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-/**
- * @author: xms
- * @description: TODO
- * @date: 2024/4/1 12:11
- * @version: 1.0
- */
-public class GithubRepoPageProcessor implements PageProcessor {
-
-    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
-
-    @Override
-    public void process(Page page) {
-        // Queue any GitHub repository links found on this page for crawling.
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
-        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
-        page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
-        if (page.getResultItems().get("name") == null) {
-            // Not a repository page; skip it.
-            page.setSkip(true);
-        }
-        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
-    }
-
-    @Override
-    public Site getSite() {
-        return site;
-    }
-
-    public static void main(String[] args) {
-        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
-    }
-}
\ No newline at end of file
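For context on the deleted GithubRepoPageProcessor: fields stored with page.putField() are handed to the Spider's Pipeline, and with no pipeline configured WebMagic falls back to printing the ResultItems on the console. Below is a minimal sketch of persisting them to disk instead, using WebMagic's built-in FilePipeline; the output directory is an arbitrary choice, not something from this commit.

    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.FilePipeline;

    public class GithubRepoCrawlerMain {
        public static void main(String[] args) {
            // Each page's ResultItems (author, name, readme) are written to
            // files under the given directory instead of the console.
            Spider.create(new GithubRepoPageProcessor())
                    .addUrl("https://github.com/code4craft")
                    .addPipeline(new FilePipeline("/data/webmagic/"))
                    .thread(5)
                    .run();
        }
    }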