// Univie4PhysnanoArticlePageProcessor.java
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * WebMagic {@link PageProcessor} that extracts one article record from an article detail page
 * and publishes it as the {@code "article"} result field, typed as
 * {@link ArticleTypeEnum#UNIVIE_PHYSNANO}.
 *
 * <p>Extraction is position based: values are read from fixed indexes of the {@code <dd>}
 * elements inside {@code <dl class="row">}. A missing element yields {@code null} for that
 * field instead of aborting the whole page.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Univie4PhysnanoArticlePageProcessor implements PageProcessor {

    /** XPath of the definition list that holds the article metadata. */
    private static final String META_LIST_XPATH = "//dl[@class='row']";

    /**
     * Same list, with the trailing slash used by the author/address lookups.
     * NOTE(review): the trailing '/' is kept exactly as before; whether the selector engine
     * treats it differently from {@link #META_LIST_XPATH} should be confirmed.
     */
    private static final String META_LIST_XPATH_TRAILING_SLASH = "//dl[@class='row']/";

    /** {@code <dd>} cells that carry author, description and address. */
    private static final String SUMMARY_VALUE_XPATH = "//dd[@class='col-sm-9 content-element-margin-small']";

    /** {@code <dd>} cells that carry, among others, the publication date. */
    private static final String DETAIL_VALUE_XPATH = "//dd[@class='col-sm-9']";

    /** Sidebar {@code <address>} element with the contact details. */
    private static final String CONTACT_XPATH = "//div[@class='col-md-3 sidebar content-element-margin']/aside/address";

    /**
     * Pattern used to parse the publication date.
     * NOTE(review): 'dd' is day-of-month; if the page prints month-year (e.g. "03-2024") this
     * should probably be "MM-yyyy" - confirm against a live page before changing.
     */
    private static final String PUBLISH_DATE_PATTERN = "dd-yyyy";

    // Crawl configuration for the target site: retry count, sleep time between requests, user agent.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Entry point called by the crawler for every downloaded page; delegates to the
     * article extraction.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Extracts title, description, publication date, author, address and contact details
     * from the page and stores the resulting {@link ArticleDO} under the {@code "article"} field.
     *
     * @param page the downloaded article page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        // The page URL doubles as the article's unique code.
        String articleCode = page.getUrl().get();

        String title = html.xpath("//h1[@class=' content-element-margin']/text()").get();

        String articleDesc = textAt(
                html.xpath(META_LIST_XPATH).xpath(SUMMARY_VALUE_XPATH), 1, "//p/text()");

        // The date is expected in the 11th detail cell; some pages have one cell less,
        // so the 10th cell is tried when the first attempt is missing or unparseable.
        Selectable detailValues = html.xpath(META_LIST_XPATH).xpath(DETAIL_VALUE_XPATH);
        String publishTime = textAt(detailValues, 10, "//dd/text()");
        Date publishTimeDateTime = parsePublishDate(publishTime);
        if (publishTimeDateTime == null) {
            String fallbackText = textAt(detailValues, 9, "//dd/text()");
            if (fallbackText != null) {
                // Keep the fallback text as the raw value even when it cannot be parsed.
                publishTime = fallbackText;
                publishTimeDateTime = parsePublishDate(fallbackText);
            }
        }

        // Plain strings on purpose: appending a null to a StringBuffer would store the text "null".
        Selectable summaryValues = html.xpath(META_LIST_XPATH_TRAILING_SLASH).xpath(SUMMARY_VALUE_XPATH);
        String authorName = textAt(summaryValues, 0, "//dd/text()");
        String authorAddress = textAt(summaryValues, 3, "//dd/text()");

        JSONArray authorEmail = new JSONArray();
        authorEmail.add(extractContact(html));

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}",
                articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.UNIVIE_PHYSNANO.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                // Fall back to the raw text when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress)
                .referenceInfo(null).build());
    }

    /**
     * Builds the contact JSON object from the sidebar {@code <address>} element.
     * Keys are only added for the parts that are actually present on the page.
     *
     * @param html the parsed page
     * @return a JSON object with any of the keys {@code authorEmailName}, {@code telephone}, {@code email}
     */
    private static JSONObject extractContact(Html html) {
        JSONObject contact = new JSONObject();

        // Text is expected as "<name>:<telephone>".
        String contactText = html.xpath(CONTACT_XPATH + "/text()").get();
        if (StringUtils.isNotBlank(contactText)) {
            String[] parts = contactText.split(":");
            if (parts.length > 0) {
                contact.put("authorEmailName", parts[0]);
            }
            if (parts.length > 1) {
                contact.put("telephone", parts[1]);
            }
        }

        // The e-mail is the text after the opening <a ...> tag once inner markup is stripped.
        String anchorHtml = html.xpath(CONTACT_XPATH + "/a").get();
        if (anchorHtml != null) {
            String stripped = anchorHtml.replace("<span>", "")
                    .replace("</span>", "")
                    .replace("</a>", "");
            if (StringUtils.isNotBlank(stripped)) {
                String[] parts = stripped.split(">");
                if (parts.length > 1) {
                    contact.put("email", parts[1]);
                }
            }
        }
        return contact;
    }

    /**
     * Evaluates {@code xpath} on the {@code index}-th node of {@code selection}.
     *
     * @param selection the selection whose nodes are indexed
     * @param index     zero-based node index
     * @param xpath     expression evaluated relative to the chosen node
     * @return the first match, or {@code null} when the node does not exist or nothing matches
     */
    private static String textAt(Selectable selection, int index, String xpath) {
        List<Selectable> nodes = selection.nodes();
        if (nodes == null || index < 0 || index >= nodes.size()) {
            log.debug("No node at index {} for xpath {}", index, xpath);
            return null;
        }
        return nodes.get(index).xpath(xpath).get();
    }

    /**
     * Parses a publication date with {@link #PUBLISH_DATE_PATTERN}.
     *
     * @param raw the text taken from the page, may be {@code null}
     * @return the parsed date, or {@code null} when the text is blank or does not match the pattern
     */
    private static Date parsePublishDate(String raw) {
        if (!StringUtils.isNotBlank(raw)) {
            return null;
        }
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
            return new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH).parse(raw.trim());
        } catch (ParseException e) {
            log.debug("Unparseable publish date '{}': {}", raw, e.getMessage());
            return null;
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public void setSite(Site site) {
        this.site = site;
    }
}