// NatureMaterialPagePcoessor.java (10.3 KB)
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;

@Component
@Slf4j
public class NatureMaterialPagePcoessor implements PageProcessor {
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (url.equals("https://www.nature.com/nmat/articles")){
            getIndex(page);
        } else if (url.contains("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=")){
            everyPage(page);
        } else if (url.equals("https://www.nature.com/search?q=battery")) {
            getIndex(page);
        } else if (url.contains("https://www.nature.com/search?q=battery&page=")) {
            everyPage(page);
        } else if (url.contains("https://www.nature.com/articles")){
            doArticleContent(page);
        } else if (url.equals("https://www.nature.com/nature/research-articles")) {
            getIndex(page);
        } else if (url.contains("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=")) {
            everyPage(page);
        } else if (url.equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")) {
            getIndex(page);
        }else if (url.equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")) {
            getIndex(page);
        }else if (url.contains("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page=")) {
            everyPage(page);
        }else if (url.contains("https://www.nature.com/search?q=batteries&journal=nmat&page=")) {
            everyPage(page);
        }
    }

    @Override
    public Site getSite() {
        return PageProcessor.super.getSite().setRetryTimes(3).setSleepTime(100);
    }

    public void getIndex(Page page){
        String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get();
        log.info(maxIndex);
        String trim = maxIndex.trim();
        int number = Integer.parseInt(trim);
        if (page.getUrl().get().equals("https://www.nature.com/nmat/articles")){
            for (int i = 1; i <= number; i++) {
                page.addTargetRequest("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page="+i);
            }
        }
        if (page.getUrl().get().equals("https://www.nature.com/search?q=battery")){
            for (int i = 1; i <= number; i++) {
                page.addTargetRequest("https://www.nature.com/search?q=battery&page="+i);
            }
        }
        if (page.getUrl().get().equals("https://www.nature.com/nature/research-articles")){
            for (int i = 1; i <= number; i++) {
                page.addTargetRequest("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page="+i);
            }
        }
        if (page.getUrl().get().equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")){
            for (int i = 1; i <= number; i++) {
                page.addTargetRequest("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page="+i);
            }
        }
        if (page.getUrl().get().equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")){
            for (int i = 1; i <= number; i++) {
                page.addTargetRequest("https://www.nature.com/search?q=batteries&journal=nmat&page="+i);
            }
        }
    }

    public void everyPage(Page page){
        List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
        for (int i = 0; i < all.size(); i++) {
//            log.info(all.get(i));
            page.addTargetRequest("https://www.nature.com"+all.get(i));
        }
    }

    private void doArticleContent(Page page) {
        if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) {
            return;
        }
        //解析页面
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
        List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
        Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
        Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));

        String title = headSelectable.xpath("//div/h1/text()").get();
        if (StringUtils.isBlank(title)) {
            title = headSelectable.xpath("//h1/text()").get();
        }
        String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
        String publishTime;
        Date publishTimeDateTime = null;
        try {
            publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
        } catch (Exception e) {
            try {
                publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
            } catch (Exception e1) {
                publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
            }
        }
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);

        try {
            publishTimeDateTime = formatter.parse(publishTime);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
        List<Selectable> authorNodes = authorSelectable.nodes();
        StringBuffer authorName = new StringBuffer();
        for (Selectable node : authorNodes) {
            authorName.append(node.xpath("//a/text()"));
        }

        JSONArray authorAddress = new JSONArray();
        List<Selectable> authorAddressList = authorAddressSelectable.nodes();
        if (CollectionUtils.isNotEmpty(authorAddressList)) {
            for (Selectable selectable : authorAddressList) {
                String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
                String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
                JSONObject object = new JSONObject();
                object.put("address", address);
                object.put("authorNames", authorNames);
                authorAddress.add(object);
            }
        }

        JSONArray references = new JSONArray();
        List<Selectable> referenceList = referencesSelectable.nodes();
        if (CollectionUtils.isNotEmpty(referenceList)) {
            for (Selectable reference : referenceList) {
                String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
                List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
                List<String> links = new ArrayList<>();
                if (CollectionUtils.isNotEmpty(referenceLinks)) {
                    links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
                }
                JSONObject object = new JSONObject();
                object.put("referenceTitle", referenceTitle);
                object.put("links", links);
//                if (CollectionUtils.isNotEmpty(links)) {
//                    page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
//                }
                references.add(object);
            }
        }

        JSONArray authorEmail = new JSONArray();
        for (Selectable authorEmailSelectable : authorEmailSelectables) {
            String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
            String email = Objects.isNull(split) ? "" : split[split.length - 1];
            String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("authorEmailName", authorEmailName);
            jsonObject.put("email", email);
            authorEmail.add(jsonObject);
        }
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.NATURE_MATERIAL.getType())
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress.toJSONString())
                .referenceInfo(references.toJSONString()).build());
    }

    public static void main(String[] args) {
        Spider.create(new MatterPagePcoessor())
                .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=1")
                .addPipeline(new ArticlePipeline())
                .thread(1).run();
    }
}