Commit 432ddd722c077bdb191dad3449658f81be91780f
1 parent: 9f5b8d9b
添加其他网站的爬取方法
Showing 16 changed files with 559 additions and 5 deletions
Too many changes to show. To preserve performance, only 16 of 21 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
1 | 1 | <component name="InspectionProjectProfileManager"> |
2 | 2 | <profile version="1.0"> |
3 | 3 | <option name="myName" value="Project Default" /> |
4 | + <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true"> | |
5 | + <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" /> | |
6 | + </inspection_tool> | |
4 | 7 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
5 | 8 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> |
6 | 9 | </profile> | ... | ... |
pom.xml
... | ... | @@ -77,6 +77,12 @@ |
77 | 77 | <version>2.1.8</version> |
78 | 78 | </dependency> |
79 | 79 | |
80 | + <dependency> | |
81 | + <groupId>com.squareup.okhttp3</groupId> | |
82 | + <artifactId>okhttp</artifactId> | |
83 | + <version>3.8.1</version> | |
84 | + </dependency> | |
85 | + | |
80 | 86 | <!-- webmagic核心库 --> |
81 | 87 | <dependency> |
82 | 88 | <groupId>us.codecraft</groupId> | ... | ... |
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
1 | 1 | package com.canrd.webmagic.controller; |
2 | 2 | |
3 | 3 | |
4 | +import com.canrd.webmagic.common.constant.ServerResult; | |
5 | +import com.canrd.webmagic.processor.ChemicalPcoessor; | |
6 | +import com.canrd.webmagic.processor.ChemicalsciencePcoessor; | |
7 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | |
8 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
4 | 10 | import io.swagger.annotations.Api; |
11 | +import io.swagger.annotations.ApiOperation; | |
12 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
13 | +import org.springframework.web.bind.annotation.GetMapping; | |
5 | 14 | import org.springframework.web.bind.annotation.RequestMapping; |
6 | 15 | import org.springframework.web.bind.annotation.RestController; |
16 | +import us.codecraft.webmagic.Request; | |
17 | +import us.codecraft.webmagic.Spider; | |
18 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
19 | + | |
20 | +import javax.annotation.Resource; | |
7 | 21 | |
8 | 22 | @RestController |
9 | -@RequestMapping("/nature/article") | |
10 | -@Api("Nature") | |
23 | +@RequestMapping("/chemical/article") | |
24 | +@Api("Chemical") | |
11 | 25 | public class ChemicalController { |
26 | + @Resource | |
27 | + private ChemicalPcoessor chemicalPcoessor; | |
28 | + | |
29 | + @Resource | |
30 | + private SeleniumDownloader seleniumDownloader; | |
31 | + | |
32 | + @Resource | |
33 | + private ArticlePipeline articlePipeline; | |
12 | 34 | |
35 | + @GetMapping("/start") | |
36 | + @ApiOperation("start") | |
37 | + public ServerResult start() { | |
38 | + Spider.create(chemicalPcoessor) | |
39 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
40 | + .addRequest(new Request("https://pubs.rsc.org/en/results?searchtext=battery")) | |
41 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | |
42 | + .addPipeline(articlePipeline) | |
43 | +// .setDownloader(seleniumDownloader) | |
44 | + .thread(1).run(); | |
45 | + return ServerResult.success(); | |
46 | + } | |
13 | 47 | } | ... | ... |
src/main/java/com/canrd/webmagic/controller/MatterController.java
... | ... | @@ -26,13 +26,14 @@ public class MatterController { |
26 | 26 | @GetMapping("/start") |
27 | 27 | @ApiOperation("start") |
28 | 28 | public ServerResult start() { |
29 | + | |
29 | 30 | Spider.create(matterPragePcoessor) |
30 | 31 | // 添加这个Spider要爬取的网页地址 |
31 | 32 | .addUrl("https://www.cell.com/matter/home") |
32 | 33 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) |
33 | 34 | .setDownloader(seleniumDownloader) |
34 | 35 | // 开启5个线程执行,并开始爬取 |
35 | - .thread(5).run(); | |
36 | + .thread(60).start(); | |
36 | 37 | return ServerResult.success(); |
37 | 38 | } |
38 | 39 | } | ... | ... |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... | ... | @@ -25,6 +25,7 @@ public enum ArticleTypeEnum { |
25 | 25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), |
26 | 26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), |
27 | 27 | NATURE_METHODS("nature-methods","nuture网站-methods"), |
28 | + Chemical("chemical","chemical网站") | |
28 | 29 | ; |
29 | 30 | private String type; |
30 | 31 | private String desc; | ... | ... |
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java
... | ... | @@ -4,10 +4,14 @@ import us.codecraft.webmagic.Page; |
4 | 4 | import us.codecraft.webmagic.Site; |
5 | 5 | import us.codecraft.webmagic.processor.PageProcessor; |
6 | 6 | |
7 | +import java.util.ArrayList; | |
8 | +import java.util.List; | |
9 | + | |
7 | 10 | public class AdvancedEnergyMaterialPcoessor implements PageProcessor { |
8 | 11 | @Override |
9 | 12 | public void process(Page page) { |
10 | - | |
13 | + String url = page.getUrl().get(); | |
14 | + if (url.equals("https://techxplore.com/journals/advanced-energy-materials/")){} | |
11 | 15 | } |
12 | 16 | |
13 | 17 | @Override |
... | ... | @@ -15,5 +19,5 @@ public class AdvancedEnergyMaterialPcoessor implements PageProcessor { |
15 | 19 | return PageProcessor.super.getSite(); |
16 | 20 | } |
17 | 21 | |
18 | - | |
22 | + | |
19 | 23 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/ChemicalPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import com.google.common.net.MediaType; | |
12 | +import lombok.extern.slf4j.Slf4j; | |
13 | +import okhttp3.MultipartBody; | |
14 | +import okhttp3.OkHttpClient; | |
15 | +import okhttp3.Response; | |
16 | +import okhttp3.ResponseBody; | |
17 | +import org.springframework.stereotype.Component; | |
18 | +import org.springframework.web.bind.annotation.RequestBody; | |
19 | +import us.codecraft.webmagic.Page; | |
20 | +import us.codecraft.webmagic.Request; | |
21 | +import us.codecraft.webmagic.Site; | |
22 | +import us.codecraft.webmagic.Spider; | |
23 | +import us.codecraft.webmagic.model.HttpRequestBody; | |
24 | +import us.codecraft.webmagic.processor.PageProcessor; | |
25 | +import us.codecraft.webmagic.selector.Html; | |
26 | +import us.codecraft.webmagic.selector.Selectable; | |
27 | +import us.codecraft.webmagic.utils.HttpConstant; | |
28 | + | |
29 | +import java.io.IOException; | |
30 | +import java.text.ParseException; | |
31 | +import java.text.SimpleDateFormat; | |
32 | +import java.util.*; | |
33 | +import java.util.concurrent.ConcurrentHashMap; | |
34 | +import java.util.concurrent.atomic.AtomicInteger; | |
35 | +import java.util.stream.Collectors; | |
36 | + | |
37 | +@Slf4j | |
38 | +@Component | |
39 | +public class ChemicalPcoessor implements PageProcessor { | |
40 | + private final Site request = Site.me().setTimeOut(300000); | |
41 | + | |
42 | + private int index; | |
43 | + private String substring; | |
44 | + | |
45 | + private HttpRequestBody httpRequestBody; | |
46 | + | |
47 | + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>(); | |
48 | + | |
49 | + @Override | |
50 | + public void process(Page page) { | |
51 | +// synchronized (this) { | |
52 | + String url = page.getUrl().get(); | |
53 | + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) { | |
54 | + try { | |
55 | + getMaxPages(page); | |
56 | + } catch (IOException e) { | |
57 | + throw new RuntimeException(e); | |
58 | + } | |
59 | +// }else if (url.equals("https://pubs.rsc.org/en/search/journalresult")&&index==0) { | |
60 | +// getResultMax(page); | |
61 | + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) { | |
62 | + eveyPage(page); | |
63 | + } else if (url.contains("https://pubs.rsc.org/en/content")) { | |
64 | + getDetil(page); | |
65 | + } | |
66 | +// } | |
67 | + } | |
68 | + | |
69 | + void getMaxPages(Page page) throws IOException { | |
70 | + index = 0; | |
71 | + Html html = page.getHtml(); | |
72 | + String script = html.css("script").regex(".*var searchResultCounts = (.*?);").get(); | |
73 | +// log.info(script); | |
74 | +// String replace = script.replace("[", ""); | |
75 | +// String replace1 = replace.replace("]", ""); | |
76 | +// String[] split = replace1.split(","); | |
77 | +// int i1 = split[1].indexOf(":"); | |
78 | +// int i2 = split[1].indexOf("}"); | |
79 | +// substring = split[1].substring(i1 + 1, i1); | |
80 | +// if (!StringUtils.isEmpty(substring)) { | |
81 | + for (int i = 1; i <= 2118; i++) { | |
82 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | |
83 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | |
84 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | |
85 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | |
86 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | |
87 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | |
88 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | |
89 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | |
90 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | |
91 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | |
92 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | |
93 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | |
94 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | |
95 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | |
96 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | |
97 | + "o8L1NlYXJjaFRlcm0+"); | |
98 | + map.put("resultcount", 52942); | |
99 | + map.put("pageno", i); | |
100 | + httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | |
101 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | |
102 | + .addHeader("Content-Type", "application/x-www-form-urlencoded") | |
103 | + .addHeader("Connection", "keep-alive") | |
104 | + .addHeader("Host", "pubs.rsc.org") | |
105 | + .addHeader("Accept-Encoding", "gzip, deflate, br") | |
106 | + .addHeader("Accept", "*/*") | |
107 | + .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true") | |
108 | + .setCharset("UTF-8"); | |
109 | +// log.info(map.toString()); | |
110 | + request.setRequestBody(httpRequestBody); | |
111 | + page.addTargetRequest(request); | |
112 | + | |
113 | +// OkHttpClient client = new OkHttpClient().newBuilder() | |
114 | +// .build(); | |
115 | +// MediaType mediaType = MediaType.parse("text/plain"); | |
116 | +// for (int i = 1; i <= 3; i++) { | |
117 | +// MultipartBody body = new MultipartBody.Builder().setType(MultipartBody.FORM) | |
118 | +// .addFormDataPart("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bWxuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3RhbmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBlPg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbWU+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogIDxGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQo8L1NlYXJjaFRlcm0+") | |
119 | +// .addFormDataPart("resultcount", "25") | |
120 | +//// .addFormDataPart("category", "journal") | |
121 | +// .addFormDataPart("pageno", String.valueOf(i)) | |
122 | +// .build(); | |
123 | +// okhttp3.Request okrequest = new okhttp3.Request.Builder() | |
124 | +// .url("https://pubs.rsc.org/en/search/journalresult") | |
125 | +// .method("POST", body) | |
126 | +// .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true") | |
127 | +// .build(); | |
128 | +// Response response = client.newCall(okrequest).execute(); | |
129 | +// String responseBody = response.body().string(); | |
130 | +// log.info(String.valueOf(responseBody)); | |
131 | + } | |
132 | + } | |
133 | +// }else{ | |
134 | +// throw new RuntimeException(); | |
135 | +// } | |
136 | +// } | |
137 | + | |
138 | + void getResultMax(Page page) { | |
139 | + index = 1; | |
140 | +// .xpath("//span[@class='paging--label']/text()") | |
141 | + page.putField("html", page.getHtml()); | |
142 | + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get(); | |
143 | + int total = Integer.parseInt(substring); | |
144 | + int everReq = Integer.parseInt(ever.split(" ")[0]); | |
145 | + int pageNo = total / everReq; | |
146 | + if ((pageNo * everReq) % total < everReq && (pageNo * everReq) % total == 0) { | |
147 | + pageNo = pageNo + 1; | |
148 | + } | |
149 | + log.info(String.valueOf(pageNo)); | |
150 | +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get(); | |
151 | +// String[] pageTotal = s.split(" "); | |
152 | +// int now = Integer.parseInt(pageTotal[4]); | |
153 | +// int total = Integer.parseInt(pageTotal[6]); | |
154 | + for (int i = 1; i <= total; i++) { | |
155 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | |
156 | + Map<String, Object> map = new HashMap<>(); | |
157 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | |
158 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | |
159 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | |
160 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | |
161 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | |
162 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | |
163 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | |
164 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | |
165 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | |
166 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | |
167 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | |
168 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | |
169 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | |
170 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | |
171 | + "o8L1NlYXJjaFRlcm0+"); | |
172 | + map.put("resultcount", 25); | |
173 | + map.put("pageno", i); | |
174 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | |
175 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | |
176 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | |
177 | + .addHeader("Connection", "keep-alive") | |
178 | + .addHeader("Host", "pubs.rsc.org") | |
179 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | |
180 | + .addHeader("Accept", "text/html, */*; q=0.01") | |
181 | + .setCharset("UTF-8"); | |
182 | + request.setRequestBody(httpRequestBody); | |
183 | + page.addTargetRequest(request); | |
184 | + } | |
185 | + } | |
186 | + | |
187 | + void eveyPage(Page page) { | |
188 | +// <span class="paging--label"> - Showing page 2 of 4000</span> | |
189 | +// log.info(page.getHtml().get()); | |
190 | + List<String> hrefList = page.getHtml().xpath("//a[@class='capsule__action']/@href").all(); | |
191 | + for (String herf : hrefList) { | |
192 | + log.info("https://pubs.rsc.org" + herf); | |
193 | + String url = "https://pubs.rsc.org" + herf; | |
194 | + page.addTargetRequest(url); | |
195 | + } | |
196 | + } | |
197 | + | |
198 | + void getDetil(Page page) { | |
199 | + Html html = page.getHtml(); | |
200 | + //文章链接 | |
201 | + String articleCode = page.getUrl().get(); | |
202 | + //文章标题 | |
203 | + String title = html.xpath("//div[@class='article__title']/h2/font/text()").get(); | |
204 | + if (StringUtils.isBlank(title)) { | |
205 | + title = html.xpath("//div[@class='article__title']/h2/text()").get(); | |
206 | + } | |
207 | + | |
208 | + //文章内容 | |
209 | + String articleDesc = html.xpath("//div[@class='capsule__text']/p/text()").get(); | |
210 | + | |
211 | + //时间 | |
212 | + String publishTime; | |
213 | + Date publishTimeDateTime = null; | |
214 | + List<String> all = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__10']").all(); | |
215 | + AtomicInteger timeIndex = new AtomicInteger(0); | |
216 | + all.stream().filter(s -> { | |
217 | + timeIndex.getAndIncrement(); | |
218 | + return s.equals("test2"); | |
219 | + }).findFirst(); | |
220 | + publishTime = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__14']/text()").all().get(timeIndex.get()); | |
221 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
222 | + try { | |
223 | + publishTimeDateTime = formatter.parse(publishTime); | |
224 | + } catch (ParseException e) { | |
225 | + e.printStackTrace(); | |
226 | + } | |
227 | + | |
228 | + //作者名字 | |
229 | + List<String> authorList = html.xpath("//div[@class='article__authors']/span[@class='article__author-link']/a/text()").all(); | |
230 | + List<String> collect = authorList.stream().filter(a -> !a.equals("†")).collect(Collectors.toList()); | |
231 | + | |
232 | +// collect.forEach(log::info); | |
233 | + StringBuffer stringBuffer = new StringBuffer(); | |
234 | + collect.forEach(stringBuffer::append); | |
235 | + | |
236 | + StringBuffer authorName = new StringBuffer(); | |
237 | + authorList.forEach(authorName::append); | |
238 | + HashMap<Object, Object> authorHashMap = new HashMap<>(); | |
239 | + List<String> cupList = html.xpath("//div[@class='article__authors']/span/span/sup/i/text()").all(); | |
240 | + if (collect.size() == cupList.size()) { | |
241 | + for (int i = 0; i < collect.size(); i++) { | |
242 | + authorHashMap.put(collect.get(i), cupList.get(i)); | |
243 | + } | |
244 | + } | |
245 | +// 单位地址 邮箱 | |
246 | + JSONArray authorAddress = new JSONArray(); | |
247 | + JSONArray authorMail = new JSONArray(); | |
248 | + List<Selectable> addressMailList = html.xpath("//p[@class='article__author-affiliation']").nodes(); | |
249 | + HashMap<Object, Object> addressMap = new HashMap<>(); | |
250 | + HashMap<Object, Object> mailMap = new HashMap<>(); | |
251 | + if (CollectionUtils.isNotEmpty(addressMailList)) { | |
252 | + for (Selectable selectable : addressMailList) { | |
253 | + List<Selectable> nodes = selectable.xpath("//span").nodes(); | |
254 | + if (CollectionUtils.isNotEmpty(nodes)) { | |
255 | + Selectable keyXpath = nodes.get(0); | |
256 | + Selectable valueXpath = nodes.get(1); | |
257 | + String key = keyXpath.xpath("//sup/text()").get(); | |
258 | + String address = valueXpath.xpath("//span/text()").get(); | |
259 | + String mail = valueXpath.xpath("//span/a/text()").get(); | |
260 | + if (!StringUtils.isBlank(key)) { | |
261 | + if (!StringUtils.isBlank(address)) { | |
262 | + addressMap.put(key, address); | |
263 | + } else { | |
264 | + addressMap.put(key, null); | |
265 | + } | |
266 | + if (!StringUtils.isBlank(mail)) { | |
267 | + mailMap.put(key, mail); | |
268 | + } else { | |
269 | + mailMap.put(key, null); | |
270 | + } | |
271 | + } else { | |
272 | + if (!StringUtils.isBlank(address)) { | |
273 | + addressMap.put("*", address); | |
274 | + } else { | |
275 | + addressMap.put("*", null); | |
276 | + } | |
277 | + if (!StringUtils.isBlank(mail)) { | |
278 | + mailMap.put("*", mail); | |
279 | + } else { | |
280 | + mailMap.put("*", null); | |
281 | + } | |
282 | + } | |
283 | + } | |
284 | + } | |
285 | + Object[] objects = authorHashMap.keySet().stream().toArray(); | |
286 | +// log.info(Arrays.toString(objects)); | |
287 | + for (int i = 0; i < objects.length; i++) { | |
288 | + JSONObject addressObj = new JSONObject(); | |
289 | + JSONObject mailObj = new JSONObject(); | |
290 | + Object point = authorHashMap.get(String.valueOf(objects[i])); | |
291 | + Object address = addressMap.get(point); | |
292 | + Object mail = mailMap.get(point); | |
293 | + addressObj.put("address", address); | |
294 | + addressObj.put("authorNames", objects[i]); | |
295 | + mailObj.put("authorEmailName", objects[i]); | |
296 | + mailObj.put("email", mail); | |
297 | + | |
298 | + authorAddress.add(addressObj); | |
299 | + authorMail.add(mailObj); | |
300 | + } | |
301 | + } | |
302 | + | |
303 | + JSONArray references = new JSONArray(); | |
304 | + List<Selectable> referenceNodeList = html.xpath("//div[@class='ref-list']/ol/li").nodes(); | |
305 | + if (CollectionUtils.isNotEmpty(referenceNodeList)) { | |
306 | + for (Selectable reference : referenceNodeList) { | |
307 | + List<String> herfOpList = reference.xpath("//a/@href").all(); | |
308 | + ArrayList<Object> herfList = new ArrayList<>(); | |
309 | + for (String herf : herfOpList) { | |
310 | + if (herf.startsWith("https") || herf.startsWith("http")) { | |
311 | + String before = "https://pubs.rsc.org"; | |
312 | + herfList.add(before + herf); | |
313 | + } else { | |
314 | + herfList.add(herf); | |
315 | + } | |
316 | + } | |
317 | + StringBuffer referenceTitle = new StringBuffer(); | |
318 | + List<String> spanList = reference.xpath("//span/text()").all(); | |
319 | + if (CollectionUtils.isNotEmpty(spanList)) { | |
320 | + for (int i = 0; i < spanList.size(); i++) { | |
321 | + referenceTitle.append(spanList.get(i)); | |
322 | + if (i < spanList.size() - 1) { | |
323 | + referenceTitle.append(","); | |
324 | + } | |
325 | + } | |
326 | + } | |
327 | + String em = reference.xpath("//em/text()").get(); | |
328 | + String strong = reference.xpath("//strong/text()").get(); | |
329 | + referenceTitle.append(em); | |
330 | + referenceTitle.append(strong); | |
331 | + JSONObject referencesObj = new JSONObject(); | |
332 | + referencesObj.put("links", herfList); | |
333 | + referencesObj.put("referenceTitle", referenceTitle); | |
334 | + references.add(referencesObj); | |
335 | + } | |
336 | + } | |
337 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString()); | |
338 | + page.putField("article", ArticleDO.builder() | |
339 | + .articleType(ArticleTypeEnum.Chemical.getType()) | |
340 | + .articleCode(articleCode) | |
341 | + .authorName(authorName.toString()) | |
342 | + .title(title) | |
343 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
344 | + .emailInfo(authorMail.toJSONString()) | |
345 | + .articleDesc(articleDesc) | |
346 | + .authorAddress(authorAddress.toJSONString()) | |
347 | + .referenceInfo(references.toJSONString()).build()); | |
348 | + } | |
349 | + | |
350 | + @Override | |
351 | + public Site getSite() { | |
352 | + return request; | |
353 | + } | |
354 | + | |
355 | + public static void main(String[] args) { | |
356 | + Spider.create(new ChemicalPcoessor()) | |
357 | + .addUrl("https://pubs.rsc.org/en/search/journalresult") | |
358 | + .addPipeline(new ArticlePipeline()) | |
359 | + .thread(5).run(); | |
360 | + } | |
361 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/ChemicalsciencePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.canrd.webmagic.common.utils.StringUtils; | |
4 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
5 | +import lombok.extern.slf4j.Slf4j; | |
6 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
7 | +import org.springframework.beans.factory.annotation.Autowired; | |
8 | +import org.springframework.stereotype.Component; | |
9 | +import us.codecraft.webmagic.Page; | |
10 | +import us.codecraft.webmagic.Request; | |
11 | +import us.codecraft.webmagic.Site; | |
12 | +import us.codecraft.webmagic.Spider; | |
13 | +import us.codecraft.webmagic.model.HttpRequestBody; | |
14 | +import us.codecraft.webmagic.processor.PageProcessor; | |
15 | +import us.codecraft.webmagic.selector.*; | |
16 | +import us.codecraft.webmagic.utils.HttpConstant; | |
17 | + | |
18 | +import java.util.*; | |
19 | + | |
20 | +@Slf4j | |
21 | +@Component | |
22 | +public class ChemicalsciencePcoessor implements PageProcessor { | |
23 | + | |
24 | + @Autowired | |
25 | + private ChemicalPcoessor journalResultPcoessor; | |
26 | + private final Site base = Site.me().setDomain("base").setTimeOut(200000); | |
27 | + | |
28 | + @Override | |
29 | + public Site getSite() { | |
30 | + return base; | |
31 | + } | |
32 | + | |
33 | + private String substring; | |
34 | + | |
35 | + @Override | |
36 | + public void process(Page page) { | |
37 | + String url = page.getUrl().get(); | |
38 | + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) { | |
39 | + getMaxPages(page); | |
40 | + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) { | |
41 | + getResultMax(page); | |
42 | + } | |
43 | + } | |
44 | + | |
45 | + void getMaxPages(Page page) { | |
46 | + Html html = page.getHtml(); | |
47 | + String script = page.getHtml().css("script").regex(".*var searchResultCounts = (.*?);").get(); | |
48 | + String replace = script.replace("[", ""); | |
49 | + String replace1 = replace.replace("]", ""); | |
50 | + String[] split = replace1.split(","); | |
51 | + int i = split[1].indexOf(":"); | |
52 | + int i1 = split[1].indexOf("}"); | |
53 | + substring = split[1].substring(i + 1, i1); | |
54 | + if (!StringUtils.isEmpty(substring)) { | |
55 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | |
56 | + Map<String, Object> map = new HashMap<>(); | |
57 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | |
58 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | |
59 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | |
60 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | |
61 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | |
62 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | |
63 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | |
64 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | |
65 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | |
66 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | |
67 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | |
68 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | |
69 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | |
70 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | |
71 | + "o8L1NlYXJjaFRlcm0+"); | |
72 | + log.info(substring); | |
73 | + map.put("resultcount",25); | |
74 | + map.put("pageno",2); | |
75 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | |
76 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | |
77 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | |
78 | + .addHeader("Connection", "keep-alive") | |
79 | + .addHeader("Host", "pubs.rsc.org") | |
80 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | |
81 | + .addHeader("Accept", "text/html, */*; q=0.01") | |
82 | + .setCharset("UTF-8"); | |
83 | + request.setRequestBody(httpRequestBody); | |
84 | + page.addTargetRequest(request); | |
85 | + } else { | |
86 | + throw new RuntimeException(); | |
87 | + } | |
88 | + } | |
89 | + | |
90 | + void getResultMax(Page page) { | |
91 | +// .xpath("//span[@class='paging--label']/text()") | |
92 | + page.putField("html", page.getHtml()); | |
93 | + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get(); | |
94 | + int total = Integer.parseInt(substring); | |
95 | + int everReq = Integer.parseInt(ever.split(" ")[0]); | |
96 | + int pageNo=total/everReq; | |
97 | + if ((pageNo*everReq)%total<everReq&&(pageNo*everReq)%total==0){ | |
98 | + pageNo=pageNo+1; | |
99 | + } | |
100 | + log.info(String.valueOf(pageNo)); | |
101 | +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get(); | |
102 | +// String[] pageTotal = s.split(" "); | |
103 | +// int now = Integer.parseInt(pageTotal[4]); | |
104 | +// int total = Integer.parseInt(pageTotal[6]); | |
105 | + for (int i = 1; i <= total; i++) { | |
106 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | |
107 | + Map<String, Object> map = new HashMap<>(); | |
108 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | |
109 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | |
110 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | |
111 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | |
112 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | |
113 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | |
114 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | |
115 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | |
116 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | |
117 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | |
118 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | |
119 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | |
120 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | |
121 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | |
122 | + "o8L1NlYXJjaFRlcm0+"); | |
123 | + map.put("resultcount",25); | |
124 | + map.put("pageno",i); | |
125 | +// map.put("pageno",1); | |
126 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | |
127 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | |
128 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | |
129 | + .addHeader("Connection", "keep-alive") | |
130 | + .addHeader("Host", "pubs.rsc.org") | |
131 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | |
132 | + .addHeader("Accept", "text/html, */*; q=0.01") | |
133 | + .setCharset("UTF-8"); | |
134 | + request.setRequestBody(httpRequestBody); | |
135 | +// page.addTargetRequest(request); | |
136 | + Spider.create(journalResultPcoessor) | |
137 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
138 | + .addRequest(request) | |
139 | + .addPipeline(new ArticlePipeline()) | |
140 | + .thread(1).start(); | |
141 | + } | |
142 | + } | |
143 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... | ... | @@ -71,6 +71,7 @@ public class SeleniumDownloader extends AbstractDownloader { |
71 | 71 | element.submit(); |
72 | 72 | WebDriverWait wait = new WebDriverWait(webDriver, 30); |
73 | 73 | wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?")); |
74 | + | |
74 | 75 | // wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']"))); |
75 | 76 | // WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input")); |
76 | 77 | // if (cloudFlare!=null){ | ... | ... |
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalsciencePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type