Commit 432ddd722c077bdb191dad3449658f81be91780f
1 parent
9f5b8d9b
添加其他网站的爬取方法
Showing
16 changed files
with
559 additions
and
5 deletions
Too many changes to show.
To preserve performance only 16 of 21 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
1 | <component name="InspectionProjectProfileManager"> | 1 | <component name="InspectionProjectProfileManager"> |
2 | <profile version="1.0"> | 2 | <profile version="1.0"> |
3 | <option name="myName" value="Project Default" /> | 3 | <option name="myName" value="Project Default" /> |
4 | + <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true"> | ||
5 | + <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" /> | ||
6 | + </inspection_tool> | ||
4 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> | 7 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
5 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> | 8 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> |
6 | </profile> | 9 | </profile> |
pom.xml
@@ -77,6 +77,12 @@ | @@ -77,6 +77,12 @@ | ||
77 | <version>2.1.8</version> | 77 | <version>2.1.8</version> |
78 | </dependency> | 78 | </dependency> |
79 | 79 | ||
80 | + <dependency> | ||
81 | + <groupId>com.squareup.okhttp3</groupId> | ||
82 | + <artifactId>okhttp</artifactId> | ||
83 | + <version>3.8.1</version> | ||
84 | + </dependency> | ||
85 | + | ||
80 | <!-- webmagic核心库 --> | 86 | <!-- webmagic核心库 --> |
81 | <dependency> | 87 | <dependency> |
82 | <groupId>us.codecraft</groupId> | 88 | <groupId>us.codecraft</groupId> |
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
1 | package com.canrd.webmagic.controller; | 1 | package com.canrd.webmagic.controller; |
2 | 2 | ||
3 | 3 | ||
4 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
5 | +import com.canrd.webmagic.processor.ChemicalPcoessor; | ||
6 | +import com.canrd.webmagic.processor.ChemicalsciencePcoessor; | ||
7 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | ||
8 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
4 | import io.swagger.annotations.Api; | 10 | import io.swagger.annotations.Api; |
11 | +import io.swagger.annotations.ApiOperation; | ||
12 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
13 | +import org.springframework.web.bind.annotation.GetMapping; | ||
5 | import org.springframework.web.bind.annotation.RequestMapping; | 14 | import org.springframework.web.bind.annotation.RequestMapping; |
6 | import org.springframework.web.bind.annotation.RestController; | 15 | import org.springframework.web.bind.annotation.RestController; |
16 | +import us.codecraft.webmagic.Request; | ||
17 | +import us.codecraft.webmagic.Spider; | ||
18 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
19 | + | ||
20 | +import javax.annotation.Resource; | ||
7 | 21 | ||
8 | @RestController | 22 | @RestController |
9 | -@RequestMapping("/nature/article") | ||
10 | -@Api("Nature") | 23 | +@RequestMapping("/chemical/article") |
24 | +@Api("Chemical") | ||
11 | public class ChemicalController { | 25 | public class ChemicalController { |
26 | + @Resource | ||
27 | + private ChemicalPcoessor chemicalPcoessor; | ||
28 | + | ||
29 | + @Resource | ||
30 | + private SeleniumDownloader seleniumDownloader; | ||
31 | + | ||
32 | + @Resource | ||
33 | + private ArticlePipeline articlePipeline; | ||
12 | 34 | ||
35 | + @GetMapping("/start") | ||
36 | + @ApiOperation("start") | ||
37 | + public ServerResult start() { | ||
38 | + Spider.create(chemicalPcoessor) | ||
39 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
40 | + .addRequest(new Request("https://pubs.rsc.org/en/results?searchtext=battery")) | ||
41 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | ||
42 | + .addPipeline(articlePipeline) | ||
43 | +// .setDownloader(seleniumDownloader) | ||
44 | + .thread(1).run(); | ||
45 | + return ServerResult.success(); | ||
46 | + } | ||
13 | } | 47 | } |
src/main/java/com/canrd/webmagic/controller/MatterController.java
@@ -26,13 +26,14 @@ public class MatterController { | @@ -26,13 +26,14 @@ public class MatterController { | ||
26 | @GetMapping("/start") | 26 | @GetMapping("/start") |
27 | @ApiOperation("start") | 27 | @ApiOperation("start") |
28 | public ServerResult start() { | 28 | public ServerResult start() { |
29 | + | ||
29 | Spider.create(matterPragePcoessor) | 30 | Spider.create(matterPragePcoessor) |
30 | // Seed URL that this Spider starts crawling from | 31 | // Seed URL that this Spider starts crawling from |
31 | .addUrl("https://www.cell.com/matter/home") | 32 | .addUrl("https://www.cell.com/matter/home") |
32 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) | 33 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) |
33 | .setDownloader(seleniumDownloader) | 34 | .setDownloader(seleniumDownloader) |
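34 | // Launch the crawl with the worker-thread count given on the next line | 35 | // Launch the crawl with the worker-thread count given on the next line |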
35 | - .thread(5).run(); | 36 | + .thread(60).start(); |
36 | return ServerResult.success(); | 37 | return ServerResult.success(); |
37 | } | 38 | } |
38 | } | 39 | } |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -25,6 +25,7 @@ public enum ArticleTypeEnum { | @@ -25,6 +25,7 @@ public enum ArticleTypeEnum { | ||
25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), | 25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), |
26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), | 26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), |
27 | NATURE_METHODS("nature-methods","nuture网站-methods"), | 27 | NATURE_METHODS("nature-methods","nuture网站-methods"), |
28 | + Chemical("chemical","chemical网站") | ||
28 | ; | 29 | ; |
29 | private String type; | 30 | private String type; |
30 | private String desc; | 31 | private String desc; |
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java
@@ -4,10 +4,14 @@ import us.codecraft.webmagic.Page; | @@ -4,10 +4,14 @@ import us.codecraft.webmagic.Page; | ||
4 | import us.codecraft.webmagic.Site; | 4 | import us.codecraft.webmagic.Site; |
5 | import us.codecraft.webmagic.processor.PageProcessor; | 5 | import us.codecraft.webmagic.processor.PageProcessor; |
6 | 6 | ||
7 | +import java.util.ArrayList; | ||
8 | +import java.util.List; | ||
9 | + | ||
7 | public class AdvancedEnergyMaterialPcoessor implements PageProcessor { | 10 | public class AdvancedEnergyMaterialPcoessor implements PageProcessor { |
8 | @Override | 11 | @Override |
9 | public void process(Page page) { | 12 | public void process(Page page) { |
10 | - | 13 | + String url = page.getUrl().get(); |
14 | + if (url.equals("https://techxplore.com/journals/advanced-energy-materials/")){} | ||
11 | } | 15 | } |
12 | 16 | ||
13 | @Override | 17 | @Override |
@@ -15,5 +19,5 @@ public class AdvancedEnergyMaterialPcoessor implements PageProcessor { | @@ -15,5 +19,5 @@ public class AdvancedEnergyMaterialPcoessor implements PageProcessor { | ||
15 | return PageProcessor.super.getSite(); | 19 | return PageProcessor.super.getSite(); |
16 | } | 20 | } |
17 | 21 | ||
18 | - | 22 | + |
19 | } | 23 | } |
src/main/java/com/canrd/webmagic/processor/ChemicalPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import com.google.common.net.MediaType; | ||
12 | +import lombok.extern.slf4j.Slf4j; | ||
13 | +import okhttp3.MultipartBody; | ||
14 | +import okhttp3.OkHttpClient; | ||
15 | +import okhttp3.Response; | ||
16 | +import okhttp3.ResponseBody; | ||
17 | +import org.springframework.stereotype.Component; | ||
18 | +import org.springframework.web.bind.annotation.RequestBody; | ||
19 | +import us.codecraft.webmagic.Page; | ||
20 | +import us.codecraft.webmagic.Request; | ||
21 | +import us.codecraft.webmagic.Site; | ||
22 | +import us.codecraft.webmagic.Spider; | ||
23 | +import us.codecraft.webmagic.model.HttpRequestBody; | ||
24 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
25 | +import us.codecraft.webmagic.selector.Html; | ||
26 | +import us.codecraft.webmagic.selector.Selectable; | ||
27 | +import us.codecraft.webmagic.utils.HttpConstant; | ||
28 | + | ||
29 | +import java.io.IOException; | ||
30 | +import java.text.ParseException; | ||
31 | +import java.text.SimpleDateFormat; | ||
32 | +import java.util.*; | ||
33 | +import java.util.concurrent.ConcurrentHashMap; | ||
34 | +import java.util.concurrent.atomic.AtomicInteger; | ||
35 | +import java.util.stream.Collectors; | ||
36 | + | ||
37 | +@Slf4j | ||
38 | +@Component | ||
39 | +public class ChemicalPcoessor implements PageProcessor { | ||
40 | + private final Site request = Site.me().setTimeOut(300000); | ||
41 | + | ||
42 | + private int index; | ||
43 | + private String substring; | ||
44 | + | ||
45 | + private HttpRequestBody httpRequestBody; | ||
46 | + | ||
47 | + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>(); | ||
48 | + | ||
49 | + @Override | ||
50 | + public void process(Page page) { | ||
51 | +// synchronized (this) { | ||
52 | + String url = page.getUrl().get(); | ||
53 | + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) { | ||
54 | + try { | ||
55 | + getMaxPages(page); | ||
56 | + } catch (IOException e) { | ||
57 | + throw new RuntimeException(e); | ||
58 | + } | ||
59 | +// }else if (url.equals("https://pubs.rsc.org/en/search/journalresult")&&index==0) { | ||
60 | +// getResultMax(page); | ||
61 | + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) { | ||
62 | + eveyPage(page); | ||
63 | + } else if (url.contains("https://pubs.rsc.org/en/content")) { | ||
64 | + getDetil(page); | ||
65 | + } | ||
66 | +// } | ||
67 | + } | ||
68 | + | ||
69 | + void getMaxPages(Page page) throws IOException { | ||
70 | + index = 0; | ||
71 | + Html html = page.getHtml(); | ||
72 | + String script = html.css("script").regex(".*var searchResultCounts = (.*?);").get(); | ||
73 | +// log.info(script); | ||
74 | +// String replace = script.replace("[", ""); | ||
75 | +// String replace1 = replace.replace("]", ""); | ||
76 | +// String[] split = replace1.split(","); | ||
77 | +// int i1 = split[1].indexOf(":"); | ||
78 | +// int i2 = split[1].indexOf("}"); | ||
79 | +// substring = split[1].substring(i1 + 1, i1); | ||
80 | +// if (!StringUtils.isEmpty(substring)) { | ||
81 | + for (int i = 1; i <= 2118; i++) { | ||
82 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | ||
83 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | ||
84 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | ||
85 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | ||
86 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | ||
87 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | ||
88 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | ||
89 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | ||
90 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | ||
91 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | ||
92 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | ||
93 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | ||
94 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | ||
95 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | ||
96 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | ||
97 | + "o8L1NlYXJjaFRlcm0+"); | ||
98 | + map.put("resultcount", 52942); | ||
99 | + map.put("pageno", i); | ||
100 | + httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | ||
101 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | ||
102 | + .addHeader("Content-Type", "application/x-www-form-urlencoded") | ||
103 | + .addHeader("Connection", "keep-alive") | ||
104 | + .addHeader("Host", "pubs.rsc.org") | ||
105 | + .addHeader("Accept-Encoding", "gzip, deflate, br") | ||
106 | + .addHeader("Accept", "*/*") | ||
107 | + .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true") | ||
108 | + .setCharset("UTF-8"); | ||
109 | +// log.info(map.toString()); | ||
110 | + request.setRequestBody(httpRequestBody); | ||
111 | + page.addTargetRequest(request); | ||
112 | + | ||
113 | +// OkHttpClient client = new OkHttpClient().newBuilder() | ||
114 | +// .build(); | ||
115 | +// MediaType mediaType = MediaType.parse("text/plain"); | ||
116 | +// for (int i = 1; i <= 3; i++) { | ||
117 | +// MultipartBody body = new MultipartBody.Builder().setType(MultipartBody.FORM) | ||
118 | +// .addFormDataPart("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bWxuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3RhbmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBlPg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbWU+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogIDxGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQo8L1NlYXJjaFRlcm0+") | ||
119 | +// .addFormDataPart("resultcount", "25") | ||
120 | +//// .addFormDataPart("category", "journal") | ||
121 | +// .addFormDataPart("pageno", String.valueOf(i)) | ||
122 | +// .build(); | ||
123 | +// okhttp3.Request okrequest = new okhttp3.Request.Builder() | ||
124 | +// .url("https://pubs.rsc.org/en/search/journalresult") | ||
125 | +// .method("POST", body) | ||
126 | +// .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true") | ||
127 | +// .build(); | ||
128 | +// Response response = client.newCall(okrequest).execute(); | ||
129 | +// String responseBody = response.body().string(); | ||
130 | +// log.info(String.valueOf(responseBody)); | ||
131 | + } | ||
132 | + } | ||
133 | +// }else{ | ||
134 | +// throw new RuntimeException(); | ||
135 | +// } | ||
136 | +// } | ||
137 | + | ||
138 | + void getResultMax(Page page) { | ||
139 | + index = 1; | ||
140 | +// .xpath("//span[@class='paging--label']/text()") | ||
141 | + page.putField("html", page.getHtml()); | ||
142 | + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get(); | ||
143 | + int total = Integer.parseInt(substring); | ||
144 | + int everReq = Integer.parseInt(ever.split(" ")[0]); | ||
145 | + int pageNo = total / everReq; | ||
146 | + if ((pageNo * everReq) % total < everReq && (pageNo * everReq) % total == 0) { | ||
147 | + pageNo = pageNo + 1; | ||
148 | + } | ||
149 | + log.info(String.valueOf(pageNo)); | ||
150 | +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get(); | ||
151 | +// String[] pageTotal = s.split(" "); | ||
152 | +// int now = Integer.parseInt(pageTotal[4]); | ||
153 | +// int total = Integer.parseInt(pageTotal[6]); | ||
154 | + for (int i = 1; i <= total; i++) { | ||
155 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | ||
156 | + Map<String, Object> map = new HashMap<>(); | ||
157 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | ||
158 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | ||
159 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | ||
160 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | ||
161 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | ||
162 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | ||
163 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | ||
164 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | ||
165 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | ||
166 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | ||
167 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | ||
168 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | ||
169 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | ||
170 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | ||
171 | + "o8L1NlYXJjaFRlcm0+"); | ||
172 | + map.put("resultcount", 25); | ||
173 | + map.put("pageno", i); | ||
174 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | ||
175 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | ||
176 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | ||
177 | + .addHeader("Connection", "keep-alive") | ||
178 | + .addHeader("Host", "pubs.rsc.org") | ||
179 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | ||
180 | + .addHeader("Accept", "text/html, */*; q=0.01") | ||
181 | + .setCharset("UTF-8"); | ||
182 | + request.setRequestBody(httpRequestBody); | ||
183 | + page.addTargetRequest(request); | ||
184 | + } | ||
185 | + } | ||
186 | + | ||
187 | + void eveyPage(Page page) { | ||
188 | +// <span class="paging--label"> - Showing page 2 of 4000</span> | ||
189 | +// log.info(page.getHtml().get()); | ||
190 | + List<String> hrefList = page.getHtml().xpath("//a[@class='capsule__action']/@href").all(); | ||
191 | + for (String herf : hrefList) { | ||
192 | + log.info("https://pubs.rsc.org" + herf); | ||
193 | + String url = "https://pubs.rsc.org" + herf; | ||
194 | + page.addTargetRequest(url); | ||
195 | + } | ||
196 | + } | ||
197 | + | ||
198 | + void getDetil(Page page) { | ||
199 | + Html html = page.getHtml(); | ||
200 | + //文章链接 | ||
201 | + String articleCode = page.getUrl().get(); | ||
202 | + //文章标题 | ||
203 | + String title = html.xpath("//div[@class='article__title']/h2/font/text()").get(); | ||
204 | + if (StringUtils.isBlank(title)) { | ||
205 | + title = html.xpath("//div[@class='article__title']/h2/text()").get(); | ||
206 | + } | ||
207 | + | ||
208 | + //文章内容 | ||
209 | + String articleDesc = html.xpath("//div[@class='capsule__text']/p/text()").get(); | ||
210 | + | ||
211 | + //时间 | ||
212 | + String publishTime; | ||
213 | + Date publishTimeDateTime = null; | ||
214 | + List<String> all = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__10']").all(); | ||
215 | + AtomicInteger timeIndex = new AtomicInteger(0); | ||
216 | + all.stream().filter(s -> { | ||
217 | + timeIndex.getAndIncrement(); | ||
218 | + return s.equals("test2"); | ||
219 | + }).findFirst(); | ||
220 | + publishTime = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__14']/text()").all().get(timeIndex.get()); | ||
221 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
222 | + try { | ||
223 | + publishTimeDateTime = formatter.parse(publishTime); | ||
224 | + } catch (ParseException e) { | ||
225 | + e.printStackTrace(); | ||
226 | + } | ||
227 | + | ||
228 | + //作者名字 | ||
229 | + List<String> authorList = html.xpath("//div[@class='article__authors']/span[@class='article__author-link']/a/text()").all(); | ||
230 | + List<String> collect = authorList.stream().filter(a -> !a.equals("†")).collect(Collectors.toList()); | ||
231 | + | ||
232 | +// collect.forEach(log::info); | ||
233 | + StringBuffer stringBuffer = new StringBuffer(); | ||
234 | + collect.forEach(stringBuffer::append); | ||
235 | + | ||
236 | + StringBuffer authorName = new StringBuffer(); | ||
237 | + authorList.forEach(authorName::append); | ||
238 | + HashMap<Object, Object> authorHashMap = new HashMap<>(); | ||
239 | + List<String> cupList = html.xpath("//div[@class='article__authors']/span/span/sup/i/text()").all(); | ||
240 | + if (collect.size() == cupList.size()) { | ||
241 | + for (int i = 0; i < collect.size(); i++) { | ||
242 | + authorHashMap.put(collect.get(i), cupList.get(i)); | ||
243 | + } | ||
244 | + } | ||
245 | +// 单位地址 邮箱 | ||
246 | + JSONArray authorAddress = new JSONArray(); | ||
247 | + JSONArray authorMail = new JSONArray(); | ||
248 | + List<Selectable> addressMailList = html.xpath("//p[@class='article__author-affiliation']").nodes(); | ||
249 | + HashMap<Object, Object> addressMap = new HashMap<>(); | ||
250 | + HashMap<Object, Object> mailMap = new HashMap<>(); | ||
251 | + if (CollectionUtils.isNotEmpty(addressMailList)) { | ||
252 | + for (Selectable selectable : addressMailList) { | ||
253 | + List<Selectable> nodes = selectable.xpath("//span").nodes(); | ||
254 | + if (CollectionUtils.isNotEmpty(nodes)) { | ||
255 | + Selectable keyXpath = nodes.get(0); | ||
256 | + Selectable valueXpath = nodes.get(1); | ||
257 | + String key = keyXpath.xpath("//sup/text()").get(); | ||
258 | + String address = valueXpath.xpath("//span/text()").get(); | ||
259 | + String mail = valueXpath.xpath("//span/a/text()").get(); | ||
260 | + if (!StringUtils.isBlank(key)) { | ||
261 | + if (!StringUtils.isBlank(address)) { | ||
262 | + addressMap.put(key, address); | ||
263 | + } else { | ||
264 | + addressMap.put(key, null); | ||
265 | + } | ||
266 | + if (!StringUtils.isBlank(mail)) { | ||
267 | + mailMap.put(key, mail); | ||
268 | + } else { | ||
269 | + mailMap.put(key, null); | ||
270 | + } | ||
271 | + } else { | ||
272 | + if (!StringUtils.isBlank(address)) { | ||
273 | + addressMap.put("*", address); | ||
274 | + } else { | ||
275 | + addressMap.put("*", null); | ||
276 | + } | ||
277 | + if (!StringUtils.isBlank(mail)) { | ||
278 | + mailMap.put("*", mail); | ||
279 | + } else { | ||
280 | + mailMap.put("*", null); | ||
281 | + } | ||
282 | + } | ||
283 | + } | ||
284 | + } | ||
285 | + Object[] objects = authorHashMap.keySet().stream().toArray(); | ||
286 | +// log.info(Arrays.toString(objects)); | ||
287 | + for (int i = 0; i < objects.length; i++) { | ||
288 | + JSONObject addressObj = new JSONObject(); | ||
289 | + JSONObject mailObj = new JSONObject(); | ||
290 | + Object point = authorHashMap.get(String.valueOf(objects[i])); | ||
291 | + Object address = addressMap.get(point); | ||
292 | + Object mail = mailMap.get(point); | ||
293 | + addressObj.put("address", address); | ||
294 | + addressObj.put("authorNames", objects[i]); | ||
295 | + mailObj.put("authorEmailName", objects[i]); | ||
296 | + mailObj.put("email", mail); | ||
297 | + | ||
298 | + authorAddress.add(addressObj); | ||
299 | + authorMail.add(mailObj); | ||
300 | + } | ||
301 | + } | ||
302 | + | ||
303 | + JSONArray references = new JSONArray(); | ||
304 | + List<Selectable> referenceNodeList = html.xpath("//div[@class='ref-list']/ol/li").nodes(); | ||
305 | + if (CollectionUtils.isNotEmpty(referenceNodeList)) { | ||
306 | + for (Selectable reference : referenceNodeList) { | ||
307 | + List<String> herfOpList = reference.xpath("//a/@href").all(); | ||
308 | + ArrayList<Object> herfList = new ArrayList<>(); | ||
309 | + for (String herf : herfOpList) { | ||
310 | + if (herf.startsWith("https") || herf.startsWith("http")) { | ||
311 | + String before = "https://pubs.rsc.org"; | ||
312 | + herfList.add(before + herf); | ||
313 | + } else { | ||
314 | + herfList.add(herf); | ||
315 | + } | ||
316 | + } | ||
317 | + StringBuffer referenceTitle = new StringBuffer(); | ||
318 | + List<String> spanList = reference.xpath("//span/text()").all(); | ||
319 | + if (CollectionUtils.isNotEmpty(spanList)) { | ||
320 | + for (int i = 0; i < spanList.size(); i++) { | ||
321 | + referenceTitle.append(spanList.get(i)); | ||
322 | + if (i < spanList.size() - 1) { | ||
323 | + referenceTitle.append(","); | ||
324 | + } | ||
325 | + } | ||
326 | + } | ||
327 | + String em = reference.xpath("//em/text()").get(); | ||
328 | + String strong = reference.xpath("//strong/text()").get(); | ||
329 | + referenceTitle.append(em); | ||
330 | + referenceTitle.append(strong); | ||
331 | + JSONObject referencesObj = new JSONObject(); | ||
332 | + referencesObj.put("links", herfList); | ||
333 | + referencesObj.put("referenceTitle", referenceTitle); | ||
334 | + references.add(referencesObj); | ||
335 | + } | ||
336 | + } | ||
337 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString()); | ||
338 | + page.putField("article", ArticleDO.builder() | ||
339 | + .articleType(ArticleTypeEnum.Chemical.getType()) | ||
340 | + .articleCode(articleCode) | ||
341 | + .authorName(authorName.toString()) | ||
342 | + .title(title) | ||
343 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
344 | + .emailInfo(authorMail.toJSONString()) | ||
345 | + .articleDesc(articleDesc) | ||
346 | + .authorAddress(authorAddress.toJSONString()) | ||
347 | + .referenceInfo(references.toJSONString()).build()); | ||
348 | + } | ||
349 | + | ||
350 | + @Override | ||
351 | + public Site getSite() { | ||
352 | + return request; | ||
353 | + } | ||
354 | + | ||
355 | + public static void main(String[] args) { | ||
356 | + Spider.create(new ChemicalPcoessor()) | ||
357 | + .addUrl("https://pubs.rsc.org/en/search/journalresult") | ||
358 | + .addPipeline(new ArticlePipeline()) | ||
359 | + .thread(5).run(); | ||
360 | + } | ||
361 | +} |
src/main/java/com/canrd/webmagic/processor/ChemicalsciencePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
4 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
5 | +import lombok.extern.slf4j.Slf4j; | ||
6 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
7 | +import org.springframework.beans.factory.annotation.Autowired; | ||
8 | +import org.springframework.stereotype.Component; | ||
9 | +import us.codecraft.webmagic.Page; | ||
10 | +import us.codecraft.webmagic.Request; | ||
11 | +import us.codecraft.webmagic.Site; | ||
12 | +import us.codecraft.webmagic.Spider; | ||
13 | +import us.codecraft.webmagic.model.HttpRequestBody; | ||
14 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
15 | +import us.codecraft.webmagic.selector.*; | ||
16 | +import us.codecraft.webmagic.utils.HttpConstant; | ||
17 | + | ||
18 | +import java.util.*; | ||
19 | + | ||
20 | +@Slf4j | ||
21 | +@Component | ||
22 | +public class ChemicalsciencePcoessor implements PageProcessor { | ||
23 | + | ||
24 | + @Autowired | ||
25 | + private ChemicalPcoessor journalResultPcoessor; | ||
26 | + private final Site base = Site.me().setDomain("base").setTimeOut(200000); | ||
27 | + | ||
28 | + @Override | ||
29 | + public Site getSite() { | ||
30 | + return base; | ||
31 | + } | ||
32 | + | ||
33 | + private String substring; | ||
34 | + | ||
35 | + @Override | ||
36 | + public void process(Page page) { | ||
37 | + String url = page.getUrl().get(); | ||
38 | + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) { | ||
39 | + getMaxPages(page); | ||
40 | + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) { | ||
41 | + getResultMax(page); | ||
42 | + } | ||
43 | + } | ||
44 | + | ||
45 | + void getMaxPages(Page page) { | ||
46 | + Html html = page.getHtml(); | ||
47 | + String script = page.getHtml().css("script").regex(".*var searchResultCounts = (.*?);").get(); | ||
48 | + String replace = script.replace("[", ""); | ||
49 | + String replace1 = replace.replace("]", ""); | ||
50 | + String[] split = replace1.split(","); | ||
51 | + int i = split[1].indexOf(":"); | ||
52 | + int i1 = split[1].indexOf("}"); | ||
53 | + substring = split[1].substring(i + 1, i1); | ||
54 | + if (!StringUtils.isEmpty(substring)) { | ||
55 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | ||
56 | + Map<String, Object> map = new HashMap<>(); | ||
57 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | ||
58 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | ||
59 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | ||
60 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | ||
61 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | ||
62 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | ||
63 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | ||
64 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | ||
65 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | ||
66 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | ||
67 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | ||
68 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | ||
69 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | ||
70 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | ||
71 | + "o8L1NlYXJjaFRlcm0+"); | ||
72 | + log.info(substring); | ||
73 | + map.put("resultcount",25); | ||
74 | + map.put("pageno",2); | ||
75 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | ||
76 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | ||
77 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | ||
78 | + .addHeader("Connection", "keep-alive") | ||
79 | + .addHeader("Host", "pubs.rsc.org") | ||
80 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | ||
81 | + .addHeader("Accept", "text/html, */*; q=0.01") | ||
82 | + .setCharset("UTF-8"); | ||
83 | + request.setRequestBody(httpRequestBody); | ||
84 | + page.addTargetRequest(request); | ||
85 | + } else { | ||
86 | + throw new RuntimeException(); | ||
87 | + } | ||
88 | + } | ||
89 | + | ||
90 | + void getResultMax(Page page) { | ||
91 | +// .xpath("//span[@class='paging--label']/text()") | ||
92 | + page.putField("html", page.getHtml()); | ||
93 | + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get(); | ||
94 | + int total = Integer.parseInt(substring); | ||
95 | + int everReq = Integer.parseInt(ever.split(" ")[0]); | ||
96 | + int pageNo=total/everReq; | ||
97 | + if ((pageNo*everReq)%total<everReq&&(pageNo*everReq)%total==0){ | ||
98 | + pageNo=pageNo+1; | ||
99 | + } | ||
100 | + log.info(String.valueOf(pageNo)); | ||
101 | +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get(); | ||
102 | +// String[] pageTotal = s.split(" "); | ||
103 | +// int now = Integer.parseInt(pageTotal[4]); | ||
104 | +// int total = Integer.parseInt(pageTotal[6]); | ||
105 | + for (int i = 1; i <= total; i++) { | ||
106 | + String baseUrl = "https://pubs.rsc.org/en/search/journalresult"; | ||
107 | + Map<String, Object> map = new HashMap<>(); | ||
108 | + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" + | ||
109 | + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" + | ||
110 | + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" + | ||
111 | + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" + | ||
112 | + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" + | ||
113 | + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" + | ||
114 | + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" + | ||
115 | + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" + | ||
116 | + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" + | ||
117 | + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" + | ||
118 | + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" + | ||
119 | + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" + | ||
120 | + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" + | ||
121 | + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" + | ||
122 | + "o8L1NlYXJjaFRlcm0+"); | ||
123 | + map.put("resultcount",25); | ||
124 | + map.put("pageno",i); | ||
125 | +// map.put("pageno",1); | ||
126 | + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | ||
127 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | ||
128 | + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8") | ||
129 | + .addHeader("Connection", "keep-alive") | ||
130 | + .addHeader("Host", "pubs.rsc.org") | ||
131 | + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") | ||
132 | + .addHeader("Accept", "text/html, */*; q=0.01") | ||
133 | + .setCharset("UTF-8"); | ||
134 | + request.setRequestBody(httpRequestBody); | ||
135 | +// page.addTargetRequest(request); | ||
136 | + Spider.create(journalResultPcoessor) | ||
137 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
138 | + .addRequest(request) | ||
139 | + .addPipeline(new ArticlePipeline()) | ||
140 | + .thread(1).start(); | ||
141 | + } | ||
142 | + } | ||
143 | +} |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -71,6 +71,7 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -71,6 +71,7 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
71 | element.submit(); | 71 | element.submit(); |
72 | WebDriverWait wait = new WebDriverWait(webDriver, 30); | 72 | WebDriverWait wait = new WebDriverWait(webDriver, 30); |
73 | wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?")); | 73 | wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?")); |
74 | + | ||
74 | // wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']"))); | 75 | // wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']"))); |
75 | // WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input")); | 76 | // WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input")); |
76 | // if (cloudFlare!=null){ | 77 | // if (cloudFlare!=null){ |
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalsciencePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type