// Science4JournalArticlePageProcessor.java (5.02 KB)
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * WebMagic {@link PageProcessor} that extracts one article from a downloaded page and
 * publishes it to the pipeline as an {@link ArticleDO} under the result field {@code "article"}.
 *
 * <p>Extracted values: title, body description, publication date, author names and a JSON
 * array of author name/email pairs. Every value is read with an XPath expression; a value
 * whose node is missing is left {@code null} (or skipped) instead of failing the page.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4JournalArticlePageProcessor implements PageProcessor {

    /** Pattern of the publication date text shown on the page, e.g. {@code 1 April 2024}. */
    private static final String PUBLISH_DATE_PATTERN = "dd MMMM yyyy";

    /** XPath of the header container that holds title, publication date and contributors. */
    private static final String HEADER_DIV_XPATH = "//div[@class='article-container']/article/header/div";

    // Crawl configuration for the target site: retry count, sleep time and user agent.
    // NOTE(review): setSleepTime is presumably expressed in milliseconds -- confirm that 5 (rather than 5000) is intended.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Entry point called by WebMagic for every downloaded page; delegates to the article extraction.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Extracts the article fields from the page and stores the resulting {@link ArticleDO}
     * in the page's result items under the key {@code "article"}.
     *
     * @param page the downloaded article page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        // The page URL is stored as the article code.
        String articleCode = page.getUrl().get();

        // Evaluate the shared header container once and reuse it for all header fields.
        Selectable headerDiv = html.xpath(HEADER_DIV_XPATH);

        String title = headerDiv.xpath("//div[@class='core-lede']/div/text()").get();

        String articleDesc = html.xpath("//div[@class='article-container']/article")
                .xpath("//section[@id='bodymatter']/div/div/text()").get();

        String publishTime = headerDiv.xpath("//div[@class='core-self-citation']")
                .xpath("//div[@class='core-date-published']/span/text()").get();
        Date publishTimeDateTime = parsePublishTime(publishTime, articleCode);

        String authorName = collectAuthorNames(headerDiv);
        JSONArray authorEmail = collectAuthorEmails(headerDiv);

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}",
                articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                // Fall back to the raw page text when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null).build());
    }

    /**
     * Parses the publication date text using {@link #PUBLISH_DATE_PATTERN} with English month names.
     *
     * @param publishTime raw date text from the page; may be {@code null} when the node is missing
     * @param articleCode article identifier, used only for log messages
     * @return the parsed date, or {@code null} when the text is missing, blank or unparseable
     */
    private Date parsePublishTime(String publishTime, String articleCode) {
        if (publishTime == null || publishTime.trim().isEmpty()) {
            log.warn("No publish time found for article {}", articleCode);
            return null;
        }
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
            return new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH).parse(publishTime.trim());
        } catch (ParseException e) {
            log.warn("Failed to parse publish time '{}' for article {}", publishTime, articleCode, e);
            return null;
        }
    }

    /**
     * Builds the author-name string from the contributor nodes of the header.
     *
     * @param headerDiv header container selected with {@link #HEADER_DIV_XPATH}
     * @return the author names, each followed by a single space (the original storage format);
     *         nodes without a name are skipped; empty string when no author is found
     */
    private String collectAuthorNames(Selectable headerDiv) {
        StringBuilder names = new StringBuilder();
        for (Selectable node : headerDiv.xpath("//div[@class='contributors']/span/span/span").nodes()) {
            String name = node.xpath("//a/span/text()").get();
            // Skip nodes that carry no name so the literal text "null" never ends up in the result.
            if (name != null) {
                names.append(name).append(" ");
            }
        }
        return names.toString();
    }

    /**
     * Collects one JSON object per author node, holding the author's full name
     * ({@code authorEmailName}) and e-mail address ({@code email}).
     *
     * @param headerDiv header container selected with {@link #HEADER_DIV_XPATH}
     * @return JSON array with one entry per author node; empty when no author node is found
     */
    private JSONArray collectAuthorEmails(Selectable headerDiv) {
        JSONArray authorEmail = new JSONArray();
        List<Selectable> authorSelectables = headerDiv
                .xpath("//div[@class='contributors']")
                .xpath("//span[@class='authors']")
                .xpath("//span[@role='list']")
                .xpath("//span[@property='author']")
                .nodes();
        for (Selectable authorSelectable : authorSelectables) {
            String givenName = authorSelectable.xpath("//span[@property='givenName']/text()").get();
            String familyName = authorSelectable.xpath("//span[@property='familyName']/text()").get();
            String email = authorSelectable.xpath("//a[@property='email']/text()").get();

            JSONObject entry = new JSONObject();
            entry.put("authorEmailName", joinName(givenName, familyName));
            entry.put("email", email);
            authorEmail.add(entry);
        }
        return authorEmail;
    }

    /**
     * Joins given and family name with a single space, treating {@code null} as absent.
     *
     * @param givenName  given name, may be {@code null}
     * @param familyName family name, may be {@code null}
     * @return the full name; only the present part when one is missing; empty string when both are missing
     */
    private static String joinName(String givenName, String familyName) {
        String given = givenName == null ? "" : givenName.trim();
        String family = familyName == null ? "" : familyName.trim();
        if (given.isEmpty()) {
            return family;
        }
        if (family.isEmpty()) {
            return given;
        }
        return given + " " + family;
    }

    /**
     * @return the crawl configuration used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Replaces the crawl configuration used by this processor.
     *
     * @param site the new site configuration
     */
    public void setSite(Site site) {
        this.site = site;
    }

    /**
     * Stand-alone entry point for running the crawler manually.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create a Spider driven by this processor.
        // NOTE(review): the seed URL looks like a listing page, while this processor only
        // extracts article-page fields -- confirm the intended seed URL.
        Spider.create(new Science4JournalArticlePageProcessor())
                // Seed URL to crawl.
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // Run with 5 threads and start crawling.
                .thread(5).run();
    }
}