ScienginePcoessor.java
9.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.canrd.webmagic.common.constant.ServerResult;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.domain.dto.SciengineAffsListDo;
import com.canrd.webmagic.domain.dto.SciengineAuthorDo;
import com.canrd.webmagic.domain.dto.SciengineReferenceListDo;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * WebMagic {@link PageProcessor} for sciengine.com articles matching the keyword "battery".
 *
 * <p>Three kinds of page are handled, distinguished by URL in {@link #process(Page)}:
 * <ol>
 *   <li>the HTML search entry page, which only triggers the paged search requests;</li>
 *   <li>a JSON search-result page, which yields one detail request per listed article;</li>
 *   <li>a JSON article detail page, which is converted into an {@link ArticleDO} and stored
 *       on the page under the field name {@code "article"}.</li>
 * </ol>
 *
 * <p>No per-request state is kept in instance fields.
 */
@Slf4j
@Component
public class ScienginePcoessor implements PageProcessor {

    /** HTML search page used as the crawl entry point. */
    private static final String SEARCH_ENTRY_URL = "https://www.sciengine.com/plat/search?queryField_a=battery";
    /** Endpoint that returns one page of search results as JSON (POST, form encoded). */
    private static final String SEARCH_API_URL = "https://www.sciengine.com/SciSearch/searchNew";
    /** URL prefix of the JSON article detail endpoint. */
    private static final String ARTICLE_API_PREFIX = "https://www.sciengine.com/restData/initArticle?";
    /** Prefix prepended to a reference's DOI to build its link. */
    private static final String REFERENCE_LINK_PREFIX = "https://www.sciengine.com/JAS/doi/";

    private static final String SEARCH_KEYWORD = "battery";
    private static final int PAGE_SIZE = 10;
    // NOTE(review): the page count is hard-coded although the method using it is called
    // getMaxPage; presumably it was meant to be read from the search response - confirm.
    private static final int LAST_PAGE = 490;
    // NOTE(review): cookie captured from a browser session and hard-coded in the original;
    // it presumably expires - consider moving it to configuration.
    private static final String SESSION_COOKIE = "_ga=GA1.1.12362349.1718065158; SHAREJSESSIONID=35fad62b-37db-455a-af7b-9e9eaac5e5bf; Hm_lvt_633c662645ea15827301cdfaf39e48a1=1718171741; retrievalHistory=%5B%7B%22title%22%3A%22battery%22%7D%5D; Hm_lpvt_633c662645ea15827301cdfaf39e48a1=1718172306; _ga_SB5SCK5F77=GS1.1.1718170247.7.1.1718172335.0.0.0";

    /** Format of {@code article.pubDateStr}, e.g. "June 12,2024". */
    private static final String PUBLISH_DATE_PATTERN = "MMMM dd,yyyy";

    /** Shared mapper; one instance is reused instead of creating several per page. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final Site site = Site.me().setTimeOut(30000);

    /**
     * Dispatches the downloaded page to the matching handler based on its URL.
     * Pages whose URL matches none of the three known shapes are ignored.
     *
     * @param page the downloaded page
     * @throws RuntimeException if an article detail payload cannot be mapped onto the DTOs
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        if (SEARCH_ENTRY_URL.equals(url)) {
            getMaxPage(page);
        } else if (SEARCH_API_URL.equals(url)) {
            everyPage(page);
        } else if (url.contains(ARTICLE_API_PREFIX)) {
            try {
                getPageDetail(page);
            } catch (JsonProcessingException e) {
                throw new RuntimeException("Failed to parse article detail: " + url, e);
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues one POST search request per result page, from page 1 to {@link #LAST_PAGE}.
     *
     * @param page the entry page; only used to register the follow-up requests
     */
    void getMaxPage(Page page) {
        for (int pageNo = 1; pageNo <= LAST_PAGE; pageNo++) {
            // Built per request so nothing is shared between iterations or threads.
            Map<String, Object> form = new HashMap<>();
            form.put("queryField_a", SEARCH_KEYWORD);
            form.put("pageCount", PAGE_SIZE);
            form.put("curpage", pageNo);

            Request request = new Request(SEARCH_API_URL).setMethod(HttpConstant.Method.POST)
                    .addHeader("Content-Type", "application/x-www-form-urlencoded")
                    .addHeader("Connection", "keep-alive")
                    .addHeader("Cookie", SESSION_COOKIE)
                    .addHeader("Host", "www.sciengine.com")
                    .addHeader("Accept-Encoding", "gzip, deflate, br")
                    .addHeader("Accept", "*/*")
                    .addHeader("Origin", "https://www.sciengine.com")
                    .addHeader("Referer", SEARCH_ENTRY_URL)
                    .setCharset("UTF-8");
            request.setRequestBody(HttpRequestBody.form(form, "UTF-8"));
            page.addTargetRequest(request);
        }
    }

    /**
     * Reads the ids and DOIs from a search-result payload and queues a detail request
     * for every listed article.
     *
     * @param page a JSON search-result page with a top-level {@code list} array
     * @throws RuntimeException if the number of ids differs from the number of DOIs, since
     *                          the two lists could then no longer be paired by index
     */
    void everyPage(Page page) {
        String rawText = page.getRawText();
        List<String> baseIdList = JsonPath.read(rawText, "$.list[*].id");
        List<String> doiList = JsonPath.read(rawText, "$.list[*].doi");
        if (baseIdList.size() != doiList.size()) {
            throw new RuntimeException("匹配不成功");
        }
        for (int i = 0; i < baseIdList.size(); i++) {
            String baseId = baseIdList.get(i);
            String doi = doiList.get(i);
            log.info("baseId:{},doi:{}", baseId, doi);
            page.addTargetRequest("https://www.sciengine.com/restData/initArticle?doi=" + doi + "&articleBaseId=" + baseId);
        }
    }

    /**
     * Converts an article detail payload into an {@link ArticleDO} and stores it on the
     * page under the field name {@code "article"}.
     *
     * @param page a JSON article detail page
     * @throws JsonProcessingException if the author, affiliation or reference arrays cannot
     *                                 be mapped onto their DTO classes
     */
    void getPageDetail(Page page) throws JsonProcessingException {
        String rawText = page.getRawText();
        // The request URL doubles as the article's code/link.
        String articleCode = page.getUrl().get();
        String title = JsonPath.read(rawText, "$.article.title");
        String articleDesc = JsonPath.read(rawText, "$.article.intro");
        String publishTime = JsonPath.read(rawText, "$.article.pubDateStr");
        Date publishTimeDateTime = parsePublishDate(publishTime);

        // NOTE(review): names are concatenated with no separator, exactly as before;
        // confirm whether downstream consumers expect a delimiter.
        List<String> authors = JsonPath.read(rawText, "$.authorList[*].fullName");
        String authorName = String.join("", authors);

        // Per-author e-mail and address entries.
        JSONArray authorMail = new JSONArray();
        JSONArray authorAddress = new JSONArray();
        boolean hasAuthorAddress = false;
        for (SciengineAuthorDo author : readList(rawText, "$.authorList", SciengineAuthorDo[].class)) {
            // Guard against empty lists too: get(0) on one used to abort the whole page.
            if (author.getAuthorNoteList() != null && !author.getAuthorNoteList().isEmpty()) {
                JSONObject mailObj = new JSONObject();
                mailObj.put("authorEmailName", author.getFullName());
                mailObj.put("email", author.getAuthorNoteList().get(0).getEmail());
                authorMail.add(mailObj);
            }
            if (author.getAffsList() != null && !author.getAffsList().isEmpty()) {
                JSONObject addressObj = new JSONObject();
                addressObj.put("address", author.getAffsList().get(0).getAffText());
                addressObj.put("authorNames", author.getFullName());
                authorAddress.add(addressObj);
                hasAuthorAddress = true;
            }
        }

        // Fallback: when no author carries an affiliation, attach the article-level
        // affiliation texts to the combined author string instead.
        List<SciengineAffsListDo> affList = readList(rawText, "$.affList", SciengineAffsListDo[].class);
        if (!hasAuthorAddress && !affList.isEmpty()) {
            List<Object> addressList = new ArrayList<>();
            for (SciengineAffsListDo aff : affList) {
                if (aff.getAffText() != null) {
                    addressList.add(aff.getAffText());
                }
            }
            JSONObject addressObj = new JSONObject();
            addressObj.put("authorNames", authorName);
            addressObj.put("address", addressList);
            authorAddress.add(addressObj);
        }

        // Cited references.
        JSONArray references = new JSONArray();
        for (SciengineReferenceListDo reference : readList(rawText, "$.article.referenceList", SciengineReferenceListDo[].class)) {
            // "links" holds the DOI URL and "referenceTitle" the title; the original had
            // the two values swapped. A reference without a DOI gets no link.
            List<Object> links = new ArrayList<>();
            String doi = Objects.toString(reference.getDoi(), "");
            if (!doi.isEmpty()) {
                links.add(REFERENCE_LINK_PREFIX + doi);
            }
            JSONObject referencesObj = new JSONObject();
            referencesObj.put("links", links);
            referencesObj.put("referenceTitle", reference.getTitle());
            references.add(referencesObj);
        }

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString());
        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.Sciengine.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                // Fall back to the raw string when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorMail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress.toJSONString())
                .referenceInfo(references.toJSONString()).build());
    }

    /**
     * Parses the publish date string of an article.
     *
     * @param publishTime raw date text, may be {@code null}
     * @return the parsed date, or {@code null} when the text is absent or not in the
     *         expected format
     */
    private static Date parsePublishDate(String publishTime) {
        if (publishTime == null) {
            return null;
        }
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
            return new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH).parse(publishTime);
        } catch (ParseException e) {
            log.warn("Unparseable publish date '{}', keeping the raw value", publishTime, e);
            return null;
        }
    }

    /**
     * Reads the JSON array at {@code jsonPath} and maps its elements onto a DTO class.
     *
     * @param rawText   the full JSON payload
     * @param jsonPath  JsonPath expression selecting an array
     * @param arrayType array class of the target DTO
     * @return the mapped elements; empty when the selected value is {@code null}
     * @throws JsonProcessingException if Jackson cannot map the array
     */
    private static <T> List<T> readList(String rawText, String jsonPath, Class<T[]> arrayType)
            throws JsonProcessingException {
        net.minidev.json.JSONArray rawArray = JsonPath.read(rawText, jsonPath);
        if (rawArray == null) {
            return Collections.emptyList();
        }
        // NOTE(review): the fastjson round-trip is kept from the original. fastjson omits
        // null-valued keys when serialising by default, which may matter to the Jackson
        // mapping of the DTOs - confirm before removing it.
        JSONArray normalised = JSON.parseArray(rawArray.toJSONString());
        return Arrays.asList(OBJECT_MAPPER.readValue(normalised.toJSONString(), arrayType));
    }
}