// ChemicalsciencePcoessor.java 7.05 KB
package com.canrd.webmagic.processor;

import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.HttpConstant;

import java.util.*;

/**
 * WebMagic page processor for the pubs.rsc.org "battery" search.
 *
 * <p>Processing happens in two steps:
 * <ol>
 *   <li>{@link #getMaxPages(Page)} handles the search landing page, reads the total number of
 *       results from the inline {@code searchResultCounts} script variable and enqueues one POST
 *       request against the journal-result endpoint;</li>
 *   <li>{@link #getResultMax(Page)} handles that response, derives the number of result pages and
 *       starts one spider per page, each driven by the injected {@link ChemicalPcoessor}.</li>
 * </ol>
 *
 * <p>The total result count travels from step 1 to step 2 as a request extra, so the processor
 * itself keeps no per-crawl mutable state.
 */
@Slf4j
@Component
public class ChemicalsciencePcoessor implements PageProcessor {

    /** Search landing page that triggers {@link #getMaxPages(Page)}. */
    private static final String SEARCH_URL = "https://pubs.rsc.org/en/results?searchtext=battery";

    /** Endpoint that returns one page of journal results; triggers {@link #getResultMax(Page)}. */
    private static final String RESULT_URL = "https://pubs.rsc.org/en/search/journalresult";

    /** Request-extra key under which the total number of search results is carried. */
    private static final String TOTAL_RESULTS_EXTRA = "totalResults";

    /** Value sent as the {@code resultcount} form field. */
    private static final int RESULT_COUNT = 25;

    /** Page number requested by the initial probe issued from {@link #getMaxPages(Page)}. */
    private static final int PROBE_PAGE_NO = 2;

    /**
     * Value sent as the {@code searchterm} form field.
     * NOTE(review): this looks like the Base64-encoded XML search descriptor for the free-text
     * query "battery" - confirm against the site before changing the query.
     */
    private static final String SEARCH_TERM = "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
            "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
            "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
            "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
            "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
            "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
            "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
            "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
            "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
            "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
            "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
            "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
            "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
            "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
            "o8L1NlYXJjaFRlcm0+";

    @Autowired
    private ChemicalPcoessor journalResultPcoessor;
    private final Site base = Site.me().setDomain("base").setTimeOut(200000);

    @Override
    public Site getSite() {
        return base;
    }

    /**
     * Dispatches a downloaded page to the matching handler based on its exact URL.
     * Pages with any other URL are ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (SEARCH_URL.equals(url)) {
            getMaxPages(page);
        } else if (RESULT_URL.equals(url)) {
            getResultMax(page);
        }
    }

    /**
     * Reads the total result count from the search landing page and enqueues the probe request
     * for the journal-result endpoint, attaching the count to that request as an extra.
     *
     * @param page the search landing page
     * @throws IllegalStateException if the result count cannot be located or is not a number
     */
    void getMaxPages(Page page) {
        String script = page.getHtml().css("script").regex(".*var searchResultCounts = (.*?);").get();
        String totalText = extractTotalCount(script);
        if (StringUtils.isEmpty(totalText)) {
            throw new IllegalStateException(
                    "Could not read the total result count from searchResultCounts on " + page.getUrl().get());
        }
        int total;
        try {
            total = Integer.parseInt(totalText);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Total result count is not a number: '" + totalText + "'", e);
        }
        log.info("{}", totalText);

        Request request = buildSearchRequest(PROBE_PAGE_NO);
        // Hand the total over to getResultMax via the request instead of shared instance state.
        request.putExtra(TOTAL_RESULTS_EXTRA, total);
        page.addTargetRequest(request);
    }

    /**
     * Stores the result page HTML, works out how many result pages exist and starts one spider
     * per page (1-based) using the injected journal-result processor and an
     * {@link ArticlePipeline}.
     *
     * @param page a response from the journal-result endpoint whose request carries the total
     *             result count as an extra
     * @throws IllegalStateException if the total count or the page size cannot be determined
     */
    void getResultMax(Page page) {
        page.putField("html", page.getHtml());

        int total = readTotalResults(page);
        String perPageText = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get();
        int perPage = parsePerPage(perPageText);

        int pageCount = countPages(total, perPage);
        log.info("{}", pageCount);

        // Each page is crawled by its own single-threaded spider with a unique UUID.
        for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
            Spider.create(journalResultPcoessor)
                    .setUUID(UuidUtil.getTimeBasedUuid().toString())
                    .addRequest(buildSearchRequest(pageNo))
                    .addPipeline(new ArticlePipeline())
                    .thread(1).start();
        }
    }

    /**
     * Extracts the count from the captured {@code searchResultCounts} value: brackets are
     * stripped, the text is split on commas and the part between ':' and '}' of the second
     * fragment is returned.
     *
     * @return the trimmed count text, or {@code null} if the value does not have that shape
     */
    private static String extractTotalCount(String script) {
        if (script == null) {
            return null;
        }
        String[] fragments = script.replace("[", "").replace("]", "").split(",");
        if (fragments.length < 2) {
            return null;
        }
        String fragment = fragments[1];
        int colon = fragment.indexOf(':');
        int brace = fragment.indexOf('}');
        if (colon < 0 || brace <= colon) {
            return null;
        }
        return fragment.substring(colon + 1, brace).trim();
    }

    /** Returns the total result count attached to the page's request by {@link #getMaxPages(Page)}. */
    private static int readTotalResults(Page page) {
        Object raw = page.getRequest().getExtra(TOTAL_RESULTS_EXTRA);
        if (raw instanceof Number) {
            return ((Number) raw).intValue();
        }
        if (raw != null) {
            // Tolerate schedulers that hand extras back in textual form.
            try {
                return Integer.parseInt(raw.toString().trim());
            } catch (NumberFormatException e) {
                throw new IllegalStateException("Total result count extra is not a number: '" + raw + "'", e);
            }
        }
        throw new IllegalStateException(
                "Request for " + page.getUrl().get() + " carries no '" + TOTAL_RESULTS_EXTRA + "' extra");
    }

    /** Parses the leading integer of the page-size label and rejects missing or non-positive values. */
    private static int parsePerPage(String label) {
        if (label == null || label.trim().isEmpty()) {
            throw new IllegalStateException("Results-per-page label not found on the result page");
        }
        String first = label.trim().split(" ")[0];
        int perPage;
        try {
            perPage = Integer.parseInt(first);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Results-per-page label is not a number: '" + label + "'", e);
        }
        if (perPage <= 0) {
            throw new IllegalStateException("Results-per-page must be positive but was " + perPage);
        }
        return perPage;
    }

    /** Ceiling division: number of pages needed to show {@code total} results, {@code perPage} each. */
    private static int countPages(int total, int perPage) {
        if (total <= 0) {
            return 0;
        }
        return total / perPage + (total % perPage == 0 ? 0 : 1);
    }

    /** Builds the form-encoded POST request for one page of journal results. */
    private static Request buildSearchRequest(int pageNo) {
        Map<String, Object> form = new HashMap<>();
        form.put("searchterm", SEARCH_TERM);
        form.put("resultcount", RESULT_COUNT);
        form.put("pageno", pageNo);

        Request request = new Request(RESULT_URL).setMethod(HttpConstant.Method.POST)
                .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
                .addHeader("Connection", "keep-alive")
                .addHeader("Host", "pubs.rsc.org")
                .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
                .addHeader("Accept", "text/html, */*; q=0.01")
                .setCharset("UTF-8");
        request.setRequestBody(HttpRequestBody.form(form, "UTF-8"));
        return request;
    }
}