ChemicalsciencePcoessor.java
7.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package com.canrd.webmagic.processor;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.HttpConstant;
import java.util.*;
/**
 * WebMagic {@link PageProcessor} that drives a paged journal search on pubs.rsc.org.
 *
 * <p>Flow, as implemented here:
 * <ol>
 *   <li>The search landing page ({@link #SEARCH_PAGE_URL}) is parsed for the total number of
 *       results, and a probe POST to {@link #JOURNAL_RESULT_URL} is queued.</li>
 *   <li>The probe response is parsed for the number of results per request, the number of
 *       result pages is derived, and one {@link Spider} per result page is started with
 *       {@code journalResultPcoessor} and an {@link ArticlePipeline}.</li>
 * </ol>
 *
 * <p>The total result count is carried between the two steps in an instance field, so the
 * landing page must be processed before the probe response.
 */
@Slf4j
@Component
public class ChemicalsciencePcoessor implements PageProcessor {

    /** Search landing page whose inline script exposes the result counts. */
    private static final String SEARCH_PAGE_URL = "https://pubs.rsc.org/en/results?searchtext=battery";

    /** Endpoint that returns one page of journal results for a form POST. */
    private static final String JOURNAL_RESULT_URL = "https://pubs.rsc.org/en/search/journalresult";

    /** Value sent as the {@code resultcount} form field. */
    private static final int RESULT_COUNT = 25;

    /** Page number requested by the probe POST queued from the landing page. */
    private static final int PROBE_PAGE_NO = 2;

    /**
     * Value sent as the {@code searchterm} form field: a Base64 string that appears to encode
     * an XML {@code <SearchTerm>} document with the free text "battery" (verify by decoding
     * before editing).
     */
    private static final String SEARCH_TERM = "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
            "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
            "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
            "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
            "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
            "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
            "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
            "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
            "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
            "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
            "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
            "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
            "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
            "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
            "o8L1NlYXJjaFRlcm0+";

    @Autowired
    private ChemicalPcoessor journalResultPcoessor;

    private final Site base = Site.me().setDomain("base").setTimeOut(200000);

    /**
     * Total number of search results as text; written by {@link #getMaxPages(Page)} and read
     * by {@link #getResultMax(Page)}.
     */
    private String substring;

    @Override
    public Site getSite() {
        return base;
    }

    /**
     * Dispatches a downloaded page by exact URL: the search landing page goes to
     * {@link #getMaxPages(Page)}, the journal-result endpoint goes to
     * {@link #getResultMax(Page)}. Any other URL is ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (SEARCH_PAGE_URL.equals(url)) {
            getMaxPages(page);
        } else if (JOURNAL_RESULT_URL.equals(url)) {
            getResultMax(page);
        }
    }

    /**
     * Extracts the total result count from the {@code searchResultCounts} script variable of
     * the landing page, stores it for {@link #getResultMax(Page)} and queues the probe POST.
     *
     * <p>The count is taken from the second comma-separated fragment of the variable's value
     * (after stripping square brackets): the text between its first {@code ':'} and first
     * {@code '}'}.
     *
     * @param page the search landing page
     * @throws IllegalStateException if the script variable is missing or does not have the
     *         expected shape
     */
    void getMaxPages(Page page) {
        String script = page.getHtml().css("script").regex(".*var searchResultCounts = (.*?);").get();
        if (script == null) {
            throw new IllegalStateException("searchResultCounts not found on " + page.getUrl());
        }

        String[] fragments = script.replace("[", "").replace("]", "").split(",");
        if (fragments.length < 2) {
            throw new IllegalStateException("Unexpected searchResultCounts value: " + script);
        }

        String countFragment = fragments[1];
        int colon = countFragment.indexOf(':');
        int brace = countFragment.indexOf('}');
        if (colon < 0 || brace <= colon) {
            throw new IllegalStateException("Unexpected searchResultCounts value: " + script);
        }

        String count = countFragment.substring(colon + 1, brace).trim();
        if (StringUtils.isEmpty(count)) {
            throw new IllegalStateException("Empty result count in searchResultCounts: " + script);
        }

        substring = count;
        log.info("{}", substring);
        page.addTargetRequest(buildSearchRequest(PROBE_PAGE_NO));
    }

    /**
     * Handles the probe response: stores the page HTML under the {@code html} result field,
     * derives the number of result pages and starts one {@link Spider} per page.
     *
     * <p>The first whitespace-separated token of the {@code <strong>} text inside
     * {@code div.fixpadv--l} is treated as the number of results returned per request.
     *
     * @param page the response of the journal-result endpoint
     * @throws IllegalStateException if the stored total or the per-request count is missing,
     *         not a number, negative, or (for the per-request count) zero
     */
    void getResultMax(Page page) {
        page.putField("html", page.getHtml());

        String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get();
        if (ever == null) {
            throw new IllegalStateException("Per-request result count not found on " + page.getUrl());
        }

        int total = parseCount(substring, "total result count");
        int everReq = parseCount(ever.trim().split("\\s+")[0], "per-request result count");
        int pageNo = pageCount(total, everReq);
        log.info("{}", pageNo);

        // One spider per result page: iterate over pages, not over individual results.
        for (int i = 1; i <= pageNo; i++) {
            Spider.create(journalResultPcoessor)
                    .setUUID(UuidUtil.getTimeBasedUuid().toString())
                    .addRequest(buildSearchRequest(i))
                    .addPipeline(new ArticlePipeline())
                    .thread(1).start();
        }
    }

    /** Builds the form POST that asks the journal-result endpoint for the given page. */
    private static Request buildSearchRequest(int pageNo) {
        Map<String, Object> form = new HashMap<>();
        form.put("searchterm", SEARCH_TERM);
        form.put("resultcount", RESULT_COUNT);
        form.put("pageno", pageNo);

        Request request = new Request(JOURNAL_RESULT_URL).setMethod(HttpConstant.Method.POST)
                .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
                .addHeader("Connection", "keep-alive")
                .addHeader("Host", "pubs.rsc.org")
                .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
                .addHeader("Accept", "text/html, */*; q=0.01")
                .setCharset("UTF-8");
        request.setRequestBody(HttpRequestBody.form(form, "UTF-8"));
        return request;
    }

    /** Returns the number of pages needed for {@code total} items, i.e. ceil(total / perPage). */
    private static int pageCount(int total, int perPage) {
        if (perPage <= 0) {
            throw new IllegalStateException("Per-request result count must be positive: " + perPage);
        }
        // Written without (total + perPage - 1) so that large totals cannot overflow.
        return total / perPage + (total % perPage == 0 ? 0 : 1);
    }

    /** Parses a non-negative integer, reporting {@code what} was being parsed on failure. */
    private static int parseCount(String text, String what) {
        if (text == null || text.trim().isEmpty()) {
            throw new IllegalStateException("Missing " + what);
        }
        int value;
        try {
            value = Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Invalid " + what + ": '" + text + "'", e);
        }
        if (value < 0) {
            throw new IllegalStateException("Negative " + what + ": " + value);
        }
        return value;
    }
}