Commit b04d3e62cfb7604579ba7b73142d0337c74672b8

Authored by 谢茂盛
1 parent 7182dcd5

feat:

1、univie-physnano 爬取
src/main/java/com/canrd/webmagic/processor/BaiduHotSearchPageProcessor.java deleted 100644 → 0
package com.canrd.webmagic.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import java.util.List;

/**
 * Crawls the Baidu hot-search board and prints every entry's rank, title and link.
 *
 * @author: xms
 * @description: Baidu hot-search page processor
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
public class BaiduHotSearchPageProcessor implements PageProcessor {

    // Site-wide crawl configuration: retry count and politeness delay between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Core extraction hook invoked by WebMagic for each downloaded page.
     *
     * @param page the downloaded page; its HTML is the content of the URL
     *             registered via {@code Spider.addUrl(...)} in {@link #main}
     */
    @Override
    public void process(Page page) {

        System.out.println(page.getHtml());
        // Select the hot-search list: the ".theme-hot" container, then each
        // entry anchor matched by its (obfuscated, version-specific) class name.
        Selectable selectable = page.getHtml().$(".theme-hot").select(
                new XpathSelector("a[@class='item-wrap_2oCLZ']")
        );
        List<Selectable> nodes = selectable.nodes();

        System.out.println("今日百度热搜:");
        // FIX: the original loop started at index 1 and therefore never printed
        // nodes.get(0) — the #1 hot-search entry was silently dropped while the
        // printed rank still claimed to start at 1. Iterate the whole list and
        // derive the displayed rank from the index instead.
        for (int i = 0; i < nodes.size(); i++) {
            Selectable node = nodes.get(i);
            String link = node.$(".item-wrap_2oCLZ", "href").get();
            String title = node.$(".c-single-text-ellipsis", "text").get();
            System.out.printf("%d、%s,访问地址:%s%n", i + 1, title, link);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Create a Spider around this processor, register the start URL,
        // and crawl with 5 worker threads.
        Spider.create(new BaiduHotSearchPageProcessor())
                .addUrl("https://top.baidu.com/board?platform=pc&sa=pcindex_entry")
                .thread(5).run();
    }
}
67 \ No newline at end of file 0 \ No newline at end of file
src/main/java/com/canrd/webmagic/processor/GithubRepoPageProcessor.java deleted 100644 → 0
1 -package com.canrd.webmagic.processor;  
2 -  
3 -import us.codecraft.webmagic.Page;  
4 -import us.codecraft.webmagic.Site;  
5 -import us.codecraft.webmagic.Spider;  
6 -import us.codecraft.webmagic.processor.PageProcessor;  
7 -  
8 -/**  
9 - * @author: xms  
10 - * @description: TODO  
11 - * @date: 2024/4/1 12:11  
12 - * @version: 1.0  
13 - */  
14 -public class GithubRepoPageProcessor implements PageProcessor {  
15 -  
16 - private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);  
17 -  
18 - @Override  
19 - public void process(Page page) {  
20 - page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());  
21 - page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());  
22 - page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());  
23 - if (page.getResultItems().get("name")==null){  
24 - //skip this page  
25 - page.setSkip(true);  
26 - }  
27 - page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));  
28 - }  
29 -  
30 - @Override  
31 - public Site getSite() {  
32 - return site;  
33 - }  
34 -  
35 - public static void main(String[] args) {  
36 - Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();  
37 - }  
38 -}  
39 \ No newline at end of file 0 \ No newline at end of file