// ScienginePcoessor.java 9.27 KB
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.canrd.webmagic.common.constant.ServerResult;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.domain.dto.SciengineAffsListDo;
import com.canrd.webmagic.domain.dto.SciengineAuthorDo;
import com.canrd.webmagic.domain.dto.SciengineReferenceListDo;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

/**
 * WebMagic {@link PageProcessor} for sciengine.com.
 *
 * <p>Three kinds of pages are handled, selected by URL in {@link #process(Page)}:
 * <ol>
 *   <li>the keyword search landing page, which only fans out the paged search requests;</li>
 *   <li>a paged search result (JSON), from which one detail request per article is queued;</li>
 *   <li>an article detail payload (JSON), which is converted into an {@link ArticleDO} and
 *       stored on the page under the field {@code "article"}.</li>
 * </ol>
 *
 * <p>The processor keeps no per-request mutable state.
 */
@Slf4j
@Component
public class ScienginePcoessor implements PageProcessor {
    /** Landing page of the keyword search; used as the trigger for the paged search requests. */
    private static final String SEED_URL = "https://www.sciengine.com/plat/search?queryField_a=battery";
    /** POST endpoint returning one page of search results as JSON. */
    private static final String SEARCH_URL = "https://www.sciengine.com/SciSearch/searchNew";
    /** Prefix of the article detail endpoint; the query string carries doi and articleBaseId. */
    private static final String DETAIL_URL_PREFIX = "https://www.sciengine.com/restData/initArticle?";
    /**
     * Prefix used to build a link for a cited reference from its DOI.
     * NOTE(review): the journal segment "JAS" is hard-coded — confirm it is valid for every journal.
     */
    private static final String REFERENCE_URL_PREFIX = "https://www.sciengine.com/JAS/doi/";

    private static final String KEYWORD = "battery";
    private static final int PAGE_SIZE = 10;
    /**
     * Number of result pages requested.
     * NOTE(review): hard-coded; the landing page is not inspected for the real page count.
     */
    private static final int MAX_PAGE = 490;
    private static final String CHARSET = "UTF-8";
    /**
     * NOTE(review): captured browser cookie including a session id. It is kept byte-for-byte so
     * requests stay unchanged, but it should be moved to configuration and will expire.
     */
    private static final String SESSION_COOKIE = "_ga=GA1.1.12362349.1718065158; SHAREJSESSIONID=35fad62b-37db-455a-af7b-9e9eaac5e5bf; Hm_lvt_633c662645ea15827301cdfaf39e48a1=1718171741; retrievalHistory=%5B%7B%22title%22%3A%22battery%22%7D%5D; Hm_lpvt_633c662645ea15827301cdfaf39e48a1=1718172306; _ga_SB5SCK5F77=GS1.1.1718170247.7.1.1718172335.0.0.0";
    /** Separator placed between author names in the combined author string. */
    private static final String AUTHOR_SEPARATOR = ", ";

    /** Shared mapper; a single instance is reused instead of building one per array per article. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final Site site = Site.me().setTimeOut(30000);

    /**
     * Dispatches the downloaded page to the matching handler based on its URL.
     * Pages with any other URL are ignored.
     *
     * @param page the downloaded page
     * @throws IllegalStateException if an article detail payload cannot be mapped
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (SEED_URL.equals(url)) {
            getMaxPage(page);
        } else if (SEARCH_URL.equals(url)) {
            everyPage(page);
        } else if (url != null && url.startsWith(DETAIL_URL_PREFIX)) {
            try {
                getPageDetail(page);
            } catch (JsonProcessingException e) {
                throw new IllegalStateException("Failed to parse sciengine article detail: " + url, e);
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues one POST search request per result page (pages 1..{@value #MAX_PAGE}).
     *
     * @param page the landing page; only used as the carrier for the new target requests
     */
    void getMaxPage(Page page) {
        for (int pageNo = 1; pageNo <= MAX_PAGE; pageNo++) {
            // Built per request: the form is local so nothing is shared between threads or pages.
            Map<String, Object> form = new LinkedHashMap<>();
            form.put("queryField_a", KEYWORD);
            form.put("pageCount", PAGE_SIZE);
            form.put("curpage", pageNo);

            Request request = new Request(SEARCH_URL)
                    .setMethod(HttpConstant.Method.POST)
                    .addHeader("Content-Type", "application/x-www-form-urlencoded")
                    .addHeader("Connection", "keep-alive")
                    .addHeader("Cookie", SESSION_COOKIE)
                    .addHeader("Host", "www.sciengine.com")
                    .addHeader("Accept-Encoding", "gzip, deflate, br")
                    .addHeader("Accept", "*/*")
                    .addHeader("Origin", "https://www.sciengine.com")
                    .addHeader("Referer", SEED_URL)
                    .setCharset(CHARSET);
            request.setRequestBody(HttpRequestBody.form(form, CHARSET));
            page.addTargetRequest(request);
        }
    }

    /**
     * Reads one page of search results and queues a detail request for every record that
     * carries both an {@code id} and a {@code doi}.
     *
     * <p>Each record is read as a unit so an id can never be paired with another record's DOI;
     * incomplete records are logged and skipped instead of failing the whole page.
     *
     * @param page a search result page whose raw text is a JSON object with a {@code list} array
     */
    void everyPage(Page page) {
        Object rootNode = JsonPath.read(page.getRawText(), "$");
        Object listNode = rootNode instanceof Map ? ((Map<?, ?>) rootNode).get("list") : null;
        if (!(listNode instanceof List)) {
            log.warn("sciengine search response contains no result list, url:{}", page.getUrl());
            return;
        }
        for (Object item : (List<?>) listNode) {
            if (!(item instanceof Map)) {
                continue;
            }
            Map<?, ?> record = (Map<?, ?>) item;
            Object baseId = record.get("id");
            Object doi = record.get("doi");
            if (baseId == null || doi == null) {
                log.warn("skip sciengine search record without id or doi, baseId:{},doi:{}", baseId, doi);
                continue;
            }
            log.info("baseId:{},doi:{}", baseId, doi);
            page.addTargetRequest(DETAIL_URL_PREFIX + "doi=" + doi + "&articleBaseId=" + baseId);
        }
    }

    /**
     * Converts an article detail payload into an {@link ArticleDO} and stores it on the page
     * under the field {@code "article"}.
     *
     * @param page a detail page whose raw text is a JSON object with an {@code article} object
     *             and optional {@code authorList} / {@code affList} arrays
     * @throws JsonProcessingException if an author, affiliation or reference array cannot be mapped
     * @throws IllegalStateException   if the payload has no {@code article} object
     */
    void getPageDetail(Page page) throws JsonProcessingException {
        // The request URL doubles as the article's identifier.
        String articleCode = page.getUrl().get();

        // Parse the payload once and navigate the resulting maps; absent keys simply yield null.
        Object rootNode = JsonPath.read(page.getRawText(), "$");
        Map<?, ?> root = requireObject(rootNode, "$", articleCode);
        Map<?, ?> article = requireObject(root.get("article"), "article", articleCode);

        String title = Objects.toString(article.get("title"), null);
        String articleDesc = Objects.toString(article.get("intro"), null);
        String publishTime = Objects.toString(article.get("pubDateStr"), null);
        Date publishTimeDateTime = parsePublishDate(publishTime);

        Object authorListNode = root.get("authorList");
        String authorName = joinAuthorNames(authorListNode);
        SciengineAuthorDo[] authorList = readArray(authorListNode, SciengineAuthorDo[].class);

        // E-mail and affiliation per author; only the first note / affiliation of an author is used.
        JSONArray authorMail = new JSONArray();
        JSONArray authorAddress = new JSONArray();
        for (SciengineAuthorDo author : authorList) {
            if (author == null) {
                continue;
            }
            if (hasFirstElement(author.getAuthorNoteList())) {
                JSONObject mailObj = new JSONObject();
                mailObj.put("authorEmailName", author.getFullName());
                mailObj.put("email", author.getAuthorNoteList().get(0).getEmail());
                authorMail.add(mailObj);
            }
            if (hasFirstElement(author.getAffsList())) {
                JSONObject addressObj = new JSONObject();
                addressObj.put("address", author.getAffsList().get(0).getAffText());
                addressObj.put("authorNames", author.getFullName());
                authorAddress.add(addressObj);
            }
        }

        // Fallback: no author carried an affiliation, so attach the article-level list to all authors.
        if (authorAddress.isEmpty()) {
            SciengineAffsListDo[] affiliations = readArray(root.get("affList"), SciengineAffsListDo[].class);
            if (affiliations.length > 0) {
                List<Object> addressList = new ArrayList<>();
                for (SciengineAffsListDo affiliation : affiliations) {
                    if (affiliation != null && affiliation.getAffText() != null) {
                        addressList.add(affiliation.getAffText());
                    }
                }
                JSONObject addressObj = new JSONObject();
                addressObj.put("authorNames", authorName);
                addressObj.put("address", addressList);
                authorAddress.add(addressObj);
            }
        }

        JSONArray references = buildReferences(
                readArray(article.get("referenceList"), SciengineReferenceListDo[].class));

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString());
        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.Sciengine.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                // Fall back to the raw text when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorMail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress.toJSONString())
                .referenceInfo(references.toJSONString()).build());
    }

    /** Returns {@code node} as a map, or fails with a message naming the missing JSON object. */
    private static Map<?, ?> requireObject(Object node, String name, String url) {
        if (!(node instanceof Map)) {
            throw new IllegalStateException("sciengine response has no JSON object '" + name + "': " + url);
        }
        return (Map<?, ?>) node;
    }

    /** Parses a date such as {@code "June 12,2024"}; returns null for blank or unparseable input. */
    private static Date parsePublishDate(String publishTime) {
        if (publishTime == null || publishTime.trim().isEmpty()) {
            return null;
        }
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
            return new SimpleDateFormat("MMMM dd,yyyy", Locale.ENGLISH).parse(publishTime.trim());
        } catch (ParseException e) {
            log.warn("Unparseable sciengine publish date '{}', keeping the raw text", publishTime, e);
            return null;
        }
    }

    /** Joins the non-null {@code fullName} values of the raw author list with a separator. */
    private static String joinAuthorNames(Object authorListNode) {
        StringJoiner names = new StringJoiner(AUTHOR_SEPARATOR);
        if (authorListNode instanceof List) {
            for (Object author : (List<?>) authorListNode) {
                Object fullName = author instanceof Map ? ((Map<?, ?>) author).get("fullName") : null;
                if (fullName != null) {
                    names.add(fullName.toString());
                }
            }
        }
        return names.toString();
    }

    /** Maps a raw JSON array node onto a typed array; a null node yields an empty array. */
    private static <T> T[] readArray(Object node, Class<T[]> arrayType) throws JsonProcessingException {
        String json = node == null ? "[]" : JSON.toJSONString(node);
        return OBJECT_MAPPER.readValue(json, arrayType);
    }

    /** True when the list is non-null, non-empty and its first element is non-null. */
    private static boolean hasFirstElement(List<?> list) {
        return list != null && !list.isEmpty() && list.get(0) != null;
    }

    /** Builds the reference JSON: the title under "referenceTitle", the DOI link under "links". */
    private static JSONArray buildReferences(SciengineReferenceListDo[] referenceList) {
        JSONArray references = new JSONArray();
        for (SciengineReferenceListDo reference : referenceList) {
            if (reference == null) {
                continue;
            }
            List<Object> links = new ArrayList<>();
            // Without a DOI there is nothing to link to; avoid emitting ".../doi/null".
            if (reference.getDoi() != null) {
                links.add(REFERENCE_URL_PREFIX + reference.getDoi());
            }
            JSONObject referencesObj = new JSONObject();
            referencesObj.put("referenceTitle", reference.getTitle());
            referencesObj.put("links", links);
            references.add(referencesObj);
        }
        return references;
    }
}