package com.canrd.webmagic.processor;

import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import javax.annotation.Resource;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Date;
import java.util.List;
import java.util.Locale;

/**
 * @author: xms
 * @description: Page processor for science.org journal search and listing pages:
 * extracts article title, publish time and link from each result card, and
 * dispatches a detail-page spider for articles that pass the date/keyword filters.
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4JournalSearchPageProcessor implements PageProcessor {

    @Resource
    private Science4JournalArticlePageProcessor science4JournalArticlePageProcessor;

    @Resource
    private SeleniumDownloader seleniumDownloader;

    @Resource
    private ArticlePipeline articlePipeline;

    /**
     * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
     */
谢茂盛 authored
47
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
谢茂盛 authored
48
49
50
51
52
53
54
55

    /**
     * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
     *
     * @param page
     */
    @Override
    public void process(Page page) {
谢茂盛 authored
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
        if (page.getUrl().get().contains("doSearch")) {
            doSearch(page);
        } else {
            doArticleList(page);
        }
    }

    /**
     * @param page
     */
    private void doSearch(Page page) {
        String url = page.getUrl().get();
        /**
         * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
         * 1、通过$或css()方法获取到该page html下某元素dom
         */
        Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
        List<Selectable> nodes = selectable.nodes();

        /**
         * 获取到指定的dom后,从这些dom中提取元素内容。
         */
        for (int i = 0; i <= nodes.size() - 1; i++) {
            String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
            String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get();
            String link = nodes.get(i).links().get();
            SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
            try {
                Date publishTimeDateTime = formatter.parse(time);
                if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
//                        page.addTargetRequest(link);
                    Spider.create(science4JournalArticlePageProcessor)
                            .addUrl(link)
                            .addPipeline(articlePipeline)
                            .setDownloader(seleniumDownloader)
                            .setUUID(UuidUtil.getTimeBasedUuid().toString())
                            // 开启5个线程执行,并开始爬取
                            .thread(1).run();
                    log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
                }
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
谢茂盛 authored
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    }

    /**
     * @param page
     */
    private void doArticleList(Page page) {
        String url = page.getUrl().get();
        /**
         * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
         * 1、通过$或css()方法获取到该page html下某元素dom
         */
        Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
        List<Selectable> nodes = selectable.nodes();

        /**
         * 获取到指定的dom后,从这些dom中提取元素内容。
         */
        for (int i = 0; i <= nodes.size() - 1; i++) {
            String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
            String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get();
谢茂盛 authored
121
            String link = nodes.get(i).links().get();
谢茂盛 authored
122
            if (KeywordUtil.containKeywordsInTitle(title)) {
谢茂盛 authored
123
124
125
126
127
128
129
130
131
                SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
                try {
                    Date publishTimeDateTime = formatter.parse(time);
                    if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
//                        page.addTargetRequest(link);
                        Spider.create(science4JournalArticlePageProcessor)
                                .addUrl(link)
                                .addPipeline(articlePipeline)
                                .setDownloader(seleniumDownloader)
谢茂盛 authored
132
                                .setUUID(UuidUtil.getTimeBasedUuid().toString())
谢茂盛 authored
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
                                // 开启5个线程执行,并开始爬取
                                .thread(1).run();
                        log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
                    }
                } catch (ParseException e) {
                    e.printStackTrace();
                }

            }
        }

    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // 创建一个Spider,并把我们的处理器放进去
        Spider.create(new Science4JournalSearchPageProcessor())
                // 添加这个Spider要爬取的网页地址
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // 开启5个线程执行,并开始爬取
                .thread(5).run();
    }
}