// ChemicalPcoessor.java (18.2 KB)
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import com.google.common.net.MediaType;
import lombok.extern.slf4j.Slf4j;
import okhttp3.MultipartBody;
import okhttp3.OkHttpClient;
import okhttp3.Response;
import okhttp3.ResponseBody;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.RequestBody;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

@Slf4j
@Component
public class ChemicalPcoessor implements PageProcessor {
    private final Site request = Site.me().setTimeOut(300000);

    private int index;
    private String substring;

    private HttpRequestBody httpRequestBody;

    private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>();

    @Override
    public void process(Page page) {
//        synchronized (this) {
        String url = page.getUrl().get();
        if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) {
            try {
                getMaxPages(page);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
//            }else if (url.equals("https://pubs.rsc.org/en/search/journalresult")&&index==0) {
//                getResultMax(page);
        } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) {
            eveyPage(page);
        } else if (url.contains("https://pubs.rsc.org/en/content")) {
            getDetil(page);
        }
//        }
    }

    void getMaxPages(Page page) throws IOException {
        index = 0;
        Html html = page.getHtml();
        String script = html.css("script").regex(".*var searchResultCounts = (.*?);").get();
//        log.info(script);
//        String replace = script.replace("[", "");
//        String replace1 = replace.replace("]", "");
//        String[] split = replace1.split(",");
//        int i1 = split[1].indexOf(":");
//        int i2 = split[1].indexOf("}");
//        substring = split[1].substring(i1 + 1, i1);
//        if (!StringUtils.isEmpty(substring)) {
        for (int i = 1; i <= 2118; i++) {
            String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
            map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
                    "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
                    "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
                    "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
                    "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
                    "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
                    "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
                    "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
                    "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
                    "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
                    "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
                    "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
                    "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
                    "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
                    "o8L1NlYXJjaFRlcm0+");
            map.put("resultcount", 52942);
            map.put("pageno", i);
            httpRequestBody = HttpRequestBody.form(map, "UTF-8");
            Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
                    .addHeader("Content-Type", "application/x-www-form-urlencoded")
                    .addHeader("Connection", "keep-alive")
                    .addHeader("Host", "pubs.rsc.org")
                    .addHeader("Accept-Encoding", "gzip, deflate, br")
                    .addHeader("Accept", "*/*")
                    .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true")
                    .setCharset("UTF-8");
//            log.info(map.toString());
            request.setRequestBody(httpRequestBody);
            page.addTargetRequest(request);

//        OkHttpClient client = new OkHttpClient().newBuilder()
//                .build();
//        MediaType mediaType = MediaType.parse("text/plain");
//        for (int i = 1; i <= 3; i++) {
//            MultipartBody body = new MultipartBody.Builder().setType(MultipartBody.FORM)
//                    .addFormDataPart("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bWxuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3RhbmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBlPg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbWU+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogIDxGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQo8L1NlYXJjaFRlcm0+")
//                    .addFormDataPart("resultcount", "25")
////                    .addFormDataPart("category", "journal")
//                    .addFormDataPart("pageno", String.valueOf(i))
//                    .build();
//            okhttp3.Request okrequest = new okhttp3.Request.Builder()
//                    .url("https://pubs.rsc.org/en/search/journalresult")
//                    .method("POST", body)
//                    .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true")
//                    .build();
//            Response response = client.newCall(okrequest).execute();
//            String responseBody = response.body().string();
//            log.info(String.valueOf(responseBody));
        }
    }
//        }else{
//            throw new RuntimeException();
//        }
//    }

    void getResultMax(Page page) {
        index = 1;
//        .xpath("//span[@class='paging--label']/text()")
        page.putField("html", page.getHtml());
        String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get();
        int total = Integer.parseInt(substring);
        int everReq = Integer.parseInt(ever.split(" ")[0]);
        int pageNo = total / everReq;
        if ((pageNo * everReq) % total < everReq && (pageNo * everReq) % total == 0) {
            pageNo = pageNo + 1;
        }
        log.info(String.valueOf(pageNo));
//        String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get();
//        String[] pageTotal = s.split(" ");
//        int now = Integer.parseInt(pageTotal[4]);
//        int total = Integer.parseInt(pageTotal[6]);
        for (int i = 1; i <= total; i++) {
            String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
            Map<String, Object> map = new HashMap<>();
            map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
                    "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
                    "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
                    "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
                    "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
                    "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
                    "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
                    "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
                    "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
                    "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
                    "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
                    "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
                    "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
                    "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
                    "o8L1NlYXJjaFRlcm0+");
            map.put("resultcount", 25);
            map.put("pageno", i);
            HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8");
            Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
                    .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
                    .addHeader("Connection", "keep-alive")
                    .addHeader("Host", "pubs.rsc.org")
                    .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
                    .addHeader("Accept", "text/html, */*; q=0.01")
                    .setCharset("UTF-8");
            request.setRequestBody(httpRequestBody);
            page.addTargetRequest(request);
        }
    }

    void eveyPage(Page page) {
//        <span class="paging--label"> - Showing page 2 of 4000</span>
//        log.info(page.getHtml().get());
        List<String> hrefList = page.getHtml().xpath("//a[@class='capsule__action']/@href").all();
        for (String herf : hrefList) {
            log.info("https://pubs.rsc.org" + herf);
            String url = "https://pubs.rsc.org" + herf;
            page.addTargetRequest(url);
        }
    }

    void getDetil(Page page) {
        Html html = page.getHtml();
        //文章链接
        String articleCode = page.getUrl().get();
        //文章标题
        String title = html.xpath("//div[@class='article__title']/h2/font/text()").get();
        if (StringUtils.isBlank(title)) {
            title = html.xpath("//div[@class='article__title']/h2/text()").get();
        }

        //文章内容
        String articleDesc = html.xpath("//div[@class='capsule__text']/p/text()").get();

        //时间
        String publishTime;
        Date publishTimeDateTime = null;
        List<String> all = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__10']").all();
        AtomicInteger timeIndex = new AtomicInteger(0);
        all.stream().filter(s -> {
            timeIndex.getAndIncrement();
            return s.equals("test2");
        }).findFirst();
        publishTime = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__14']/text()").all().get(timeIndex.get());
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
        try {
            publishTimeDateTime = formatter.parse(publishTime);
        } catch (ParseException e) {
            e.printStackTrace();
        }

        //作者名字
        List<String> authorList = html.xpath("//div[@class='article__authors']/span[@class='article__author-link']/a/text()").all();
        List<String> collect = authorList.stream().filter(a -> !a.equals("†")).collect(Collectors.toList());

//        collect.forEach(log::info);
        StringBuffer stringBuffer = new StringBuffer();
        collect.forEach(stringBuffer::append);

        StringBuffer authorName = new StringBuffer();
        authorList.forEach(authorName::append);
        HashMap<Object, Object> authorHashMap = new HashMap<>();
        List<String> cupList = html.xpath("//div[@class='article__authors']/span/span/sup/i/text()").all();
        if (collect.size() == cupList.size()) {
            for (int i = 0; i < collect.size(); i++) {
                authorHashMap.put(collect.get(i), cupList.get(i));
            }
        }
//        单位地址 邮箱
        JSONArray authorAddress = new JSONArray();
        JSONArray authorMail = new JSONArray();
        List<Selectable> addressMailList = html.xpath("//p[@class='article__author-affiliation']").nodes();
        HashMap<Object, Object> addressMap = new HashMap<>();
        HashMap<Object, Object> mailMap = new HashMap<>();
        if (CollectionUtils.isNotEmpty(addressMailList)) {
            for (Selectable selectable : addressMailList) {
                List<Selectable> nodes = selectable.xpath("//span").nodes();
                if (CollectionUtils.isNotEmpty(nodes)) {
                    Selectable keyXpath = nodes.get(0);
                    Selectable valueXpath = nodes.get(1);
                    String key = keyXpath.xpath("//sup/text()").get();
                    String address = valueXpath.xpath("//span/text()").get();
                    String mail = valueXpath.xpath("//span/a/text()").get();
                    if (!StringUtils.isBlank(key)) {
                        if (!StringUtils.isBlank(address)) {
                            addressMap.put(key, address);
                        } else {
                            addressMap.put(key, null);
                        }
                        if (!StringUtils.isBlank(mail)) {
                            mailMap.put(key, mail);
                        } else {
                            mailMap.put(key, null);
                        }
                    } else {
                        if (!StringUtils.isBlank(address)) {
                            addressMap.put("*", address);
                        } else {
                            addressMap.put("*", null);
                        }
                        if (!StringUtils.isBlank(mail)) {
                            mailMap.put("*", mail);
                        } else {
                            mailMap.put("*", null);
                        }
                    }
                }
            }
            Object[] objects = authorHashMap.keySet().stream().toArray();
//            log.info(Arrays.toString(objects));
            for (int i = 0; i < objects.length; i++) {
                JSONObject addressObj = new JSONObject();
                JSONObject mailObj = new JSONObject();
                Object point = authorHashMap.get(String.valueOf(objects[i]));
                Object address = addressMap.get(point);
                Object mail = mailMap.get(point);
                addressObj.put("address", address);
                addressObj.put("authorNames", objects[i]);
                mailObj.put("authorEmailName", objects[i]);
                mailObj.put("email", mail);

                authorAddress.add(addressObj);
                authorMail.add(mailObj);
            }
        }

        JSONArray references = new JSONArray();
        List<Selectable> referenceNodeList = html.xpath("//div[@class='ref-list']/ol/li").nodes();
        if (CollectionUtils.isNotEmpty(referenceNodeList)) {
            for (Selectable reference : referenceNodeList) {
                List<String> herfOpList = reference.xpath("//a/@href").all();
                ArrayList<Object> herfList = new ArrayList<>();
                for (String herf : herfOpList) {
                    if (herf.startsWith("https") || herf.startsWith("http")) {
                        String before = "https://pubs.rsc.org";
                        herfList.add(before + herf);
                    } else {
                        herfList.add(herf);
                    }
                }
                StringBuffer referenceTitle = new StringBuffer();
                List<String> spanList = reference.xpath("//span/text()").all();
                if (CollectionUtils.isNotEmpty(spanList)) {
                    for (int i = 0; i < spanList.size(); i++) {
                        referenceTitle.append(spanList.get(i));
                        if (i < spanList.size() - 1) {
                            referenceTitle.append(",");
                        }
                    }
                }
                String em = reference.xpath("//em/text()").get();
                String strong = reference.xpath("//strong/text()").get();
                referenceTitle.append(em);
                referenceTitle.append(strong);
                JSONObject referencesObj = new JSONObject();
                referencesObj.put("links", herfList);
                referencesObj.put("referenceTitle", referenceTitle);
                references.add(referencesObj);
            }
        }
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString());
        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.Chemical.getType())
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorMail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress.toJSONString())
                .referenceInfo(references.toJSONString()).build());
    }

    @Override
    public Site getSite() {
        return request;
    }

    public static void main(String[] args) {
        Spider.create(new ChemicalPcoessor())
                .addUrl("https://pubs.rsc.org/en/search/journalresult")
                .addPipeline(new ArticlePipeline())
                .thread(5).run();
    }
}