Commit 965b4941c8d3c216a620dcf5f896c9490459a00e
1 parent
432ddd72
Sciengine
Showing
27 changed files
with
516 additions
and
4 deletions
Too many changes to show.
To preserve performance only 27 of 33 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
... | ... | @@ -4,6 +4,11 @@ |
4 | 4 | <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true"> |
5 | 5 | <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" /> |
6 | 6 | </inspection_tool> |
7 | + <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true"> | |
8 | + <Languages> | |
9 | + <language minSize="757" name="Java" /> | |
10 | + </Languages> | |
11 | + </inspection_tool> | |
7 | 12 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
8 | 13 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> |
9 | 14 | </profile> | ... | ... |
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
... | ... | @@ -41,7 +41,7 @@ public class ChemicalController { |
41 | 41 | // .setScheduler(new RedisScheduler("127.0.0.1")) |
42 | 42 | .addPipeline(articlePipeline) |
43 | 43 | // .setDownloader(seleniumDownloader) |
44 | - .thread(1).run(); | |
44 | + .thread(100).run(); | |
45 | 45 | return ServerResult.success(); |
46 | 46 | } |
47 | 47 | } | ... | ... |
src/main/java/com/canrd/webmagic/controller/ScienceDirectController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.ScienceDirectSearchPcoessor; | |
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Request; | |
14 | +import us.codecraft.webmagic.Spider; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/sciencedirect/article") | |
20 | +@Api("Science-Direct") | |
21 | +public class ScienceDirectController { | |
22 | + @Resource | |
23 | + private ScienceDirectSearchPcoessor scienceDirectSearchPcoessor; | |
24 | + | |
25 | + @Resource | |
26 | + private SeleniumDownloader seleniumDownloader; | |
27 | + | |
28 | + @Resource | |
29 | + private ArticlePipeline articlePipeline; | |
30 | + | |
31 | + @GetMapping("/start") | |
32 | + @ApiOperation("start") | |
33 | + public ServerResult start() { | |
34 | + Spider.create(scienceDirectSearchPcoessor) | |
35 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
36 | +// .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery")) | |
37 | + .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) | |
38 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | |
39 | +// .addPipeline(articlePipeline) | |
40 | + .setDownloader(seleniumDownloader) | |
41 | + .thread(20).run(); | |
42 | + return ServerResult.success(); | |
43 | + } | |
44 | +} | ... | ... |
src/main/java/com/canrd/webmagic/controller/SciengineController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.ScienginePcoessor; | |
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Request; | |
14 | +import us.codecraft.webmagic.Spider; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/sciengine/article") | |
20 | +@Api("Sciengine") | |
21 | +public class SciengineController { | |
22 | + @Resource | |
23 | + private ScienginePcoessor scienginePcoessor; | |
24 | + | |
25 | + @Resource | |
26 | + private SeleniumDownloader seleniumDownloader; | |
27 | + | |
28 | + @Resource | |
29 | + private ArticlePipeline articlePipeline; | |
30 | + | |
31 | + @GetMapping("/start") | |
32 | + @ApiOperation("start") | |
33 | + public ServerResult start() { | |
34 | + Spider.create(scienginePcoessor) | |
35 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
36 | + .addRequest(new Request("https://www.sciengine.com/plat/search?queryField_a=battery")) | |
37 | + .addPipeline(articlePipeline) | |
38 | + .thread(20).run(); | |
39 | + return ServerResult.success(); | |
40 | + } | |
41 | +} | ... | ... |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... | ... | @@ -25,7 +25,8 @@ public enum ArticleTypeEnum { |
25 | 25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), |
26 | 26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), |
27 | 27 | NATURE_METHODS("nature-methods","nuture网站-methods"), |
28 | - Chemical("chemical","chemical网站") | |
28 | + Chemical("chemical","chemical网站"), | |
29 | + Sciengine("sciengine","sciengine网站") | |
29 | 30 | ; |
30 | 31 | private String type; |
31 | 32 | private String desc; | ... | ... |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAffsListDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | |
2 | + | |
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | |
4 | +import lombok.AllArgsConstructor; | |
5 | +import lombok.Data; | |
6 | +import lombok.NoArgsConstructor; | |
7 | + | |
8 | +@Data | |
9 | +@NoArgsConstructor | |
10 | +@AllArgsConstructor | |
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | |
12 | +public class SciengineAffsListDo { | |
13 | + private boolean isNewRecord; | |
14 | + private String affText; | |
15 | + private String labelFlag; | |
16 | +} | ... | ... |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | |
2 | + | |
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | |
4 | +import lombok.AllArgsConstructor; | |
5 | +import lombok.Data; | |
6 | +import lombok.NoArgsConstructor; | |
7 | + | |
8 | +import java.util.List; | |
9 | + | |
10 | +@Data | |
11 | +@NoArgsConstructor | |
12 | +@AllArgsConstructor | |
13 | +@JsonIgnoreProperties(ignoreUnknown = true) | |
14 | +public class SciengineAuthorDo { | |
15 | + private boolean isNewRecord; | |
16 | + private String surName; | |
17 | + private String fullName; | |
18 | + private String givenName; | |
19 | + private String label; | |
20 | + private List<SciengineAffsListDo> affsList; | |
21 | + private List<SciengineAuthorNoteDo> authorNoteList; | |
22 | +// private SciengineAuthorNoteDo authorNoteList; | |
23 | +} | |
0 | 24 | \ No newline at end of file | ... | ... |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | |
2 | + | |
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | |
4 | +import lombok.AllArgsConstructor; | |
5 | +import lombok.Data; | |
6 | +import lombok.NoArgsConstructor; | |
7 | + | |
8 | +@Data | |
9 | +@NoArgsConstructor | |
10 | +@AllArgsConstructor | |
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | |
12 | +public class SciengineAuthorNoteDo { | |
13 | + private boolean isNewRecord; | |
14 | + private String labelFlag; | |
15 | + private String note; | |
16 | + private String email; | |
17 | + | |
18 | + // 构造函数、getter和setter方法 | |
19 | +} | |
0 | 20 | \ No newline at end of file | ... | ... |
src/main/java/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | |
2 | + | |
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | |
4 | +import lombok.AllArgsConstructor; | |
5 | +import lombok.Data; | |
6 | +import lombok.NoArgsConstructor; | |
7 | + | |
8 | +@Data | |
9 | +@NoArgsConstructor | |
10 | +@AllArgsConstructor | |
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | |
12 | +public class SciengineReferenceListDo { | |
13 | + private String id; | |
14 | + private boolean isNewRecord; | |
15 | + private String refArticleId; | |
16 | + private String title; | |
17 | + private String flay; | |
18 | + private int sort; | |
19 | + private String doi; | |
20 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import lombok.extern.slf4j.Slf4j; | |
4 | +import org.springframework.stereotype.Component; | |
5 | +import us.codecraft.webmagic.Page; | |
6 | +import us.codecraft.webmagic.Site; | |
7 | +import us.codecraft.webmagic.processor.PageProcessor; | |
8 | +import us.codecraft.webmagic.selector.Html; | |
9 | + | |
10 | +@Slf4j | |
11 | +@Component | |
12 | +public class ScienceDirectArticlePcoessor implements PageProcessor { | |
13 | + | |
14 | + @Override | |
15 | + public void process(Page page) { | |
16 | + String url = page.getUrl().get(); | |
17 | + if (url.equals("https://www.sciencedirect.com/journal/nano-today")){ | |
18 | + findSearch(page); | |
19 | + }else if (url.equals("https://www.sciencedirect.com/search?qs=battery")){ | |
20 | + getMaxPage(page); | |
21 | + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){ | |
22 | + everyPage(page); | |
23 | + } | |
24 | + } | |
25 | + | |
26 | + @Override | |
27 | + public Site getSite() { | |
28 | + return PageProcessor.super.getSite(); | |
29 | + } | |
30 | + | |
31 | + void findSearch(Page page){ | |
32 | + Html html = page.getHtml(); | |
33 | + page.putField("html",html); | |
34 | + } | |
35 | + | |
36 | + void getMaxPage(Page page){ | |
37 | + Html html = page.getHtml(); | |
38 | +// move-right | |
39 | + page.putField("html",html); | |
40 | + } | |
41 | + | |
42 | + void everyPage(Page page){ | |
43 | + Html html = page.getHtml(); | |
44 | + page.putField("html",html); | |
45 | + } | |
46 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import lombok.extern.slf4j.Slf4j; | |
4 | +import org.springframework.stereotype.Component; | |
5 | +import us.codecraft.webmagic.Page; | |
6 | +import us.codecraft.webmagic.Site; | |
7 | +import us.codecraft.webmagic.processor.PageProcessor; | |
8 | +import us.codecraft.webmagic.selector.Html; | |
9 | + | |
10 | +@Slf4j | |
11 | +@Component | |
12 | +public class ScienceDirectSearchPcoessor implements PageProcessor { | |
13 | + | |
14 | + @Override | |
15 | + public void process(Page page) { | |
16 | + String url = page.getUrl().get(); | |
17 | + if (url.equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")){ | |
18 | + getMaxPage(page); | |
19 | + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){ | |
20 | + everyPage(page); | |
21 | + } else if (url.contains("https://www.sciencedirect.com/science/article/abs/pii")) { | |
22 | + getPageDetail(page); | |
23 | + }else if (url.equals("https://www.sciencedirect.com/journal/nano-today")){ | |
24 | + | |
25 | + } | |
26 | + } | |
27 | + | |
28 | + @Override | |
29 | + public Site getSite() { | |
30 | + return PageProcessor.super.getSite(); | |
31 | + } | |
32 | + | |
33 | + void getMaxPage(Page page){ | |
34 | + Html html = page.getHtml(); | |
35 | +// move-right | |
36 | + page.putField("html",html); | |
37 | + } | |
38 | + | |
39 | + void everyPage(Page page){ | |
40 | + | |
41 | + } | |
42 | + | |
43 | + void getPageDetail(Page page){ | |
44 | + | |
45 | + } | |
46 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/ScienginePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSON; | |
4 | +import com.alibaba.fastjson.JSONArray; | |
5 | +import com.alibaba.fastjson.JSONObject; | |
6 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
7 | +import com.canrd.webmagic.common.constant.ServerResult; | |
8 | +import com.canrd.webmagic.common.utils.DateUtil; | |
9 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
10 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
11 | +import com.canrd.webmagic.domain.dto.SciengineAffsListDo; | |
12 | +import com.canrd.webmagic.domain.dto.SciengineAuthorDo; | |
13 | +import com.canrd.webmagic.domain.dto.SciengineReferenceListDo; | |
14 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
15 | +import com.fasterxml.jackson.core.JsonProcessingException; | |
16 | +import com.fasterxml.jackson.databind.ObjectMapper; | |
17 | +import com.jayway.jsonpath.JsonPath; | |
18 | +import lombok.extern.slf4j.Slf4j; | |
19 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
20 | +import org.springframework.stereotype.Component; | |
21 | +import us.codecraft.webmagic.Page; | |
22 | +import us.codecraft.webmagic.Request; | |
23 | +import us.codecraft.webmagic.Site; | |
24 | +import us.codecraft.webmagic.Spider; | |
25 | +import us.codecraft.webmagic.model.HttpRequestBody; | |
26 | +import us.codecraft.webmagic.processor.PageProcessor; | |
27 | +import us.codecraft.webmagic.selector.Html; | |
28 | +import us.codecraft.webmagic.utils.HttpConstant; | |
29 | + | |
30 | +import java.text.ParseException; | |
31 | +import java.text.SimpleDateFormat; | |
32 | +import java.util.*; | |
33 | +import java.util.concurrent.ConcurrentHashMap; | |
34 | + | |
35 | +@Slf4j | |
36 | +@Component | |
37 | +public class ScienginePcoessor implements PageProcessor { | |
38 | + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>(); | |
39 | + | |
40 | + private HttpRequestBody httpRequestBody; | |
41 | + | |
42 | + private final Site site = Site.me().setTimeOut(30000); | |
43 | + | |
44 | + @Override | |
45 | + public void process(Page page) { | |
46 | + String url = page.getUrl().get(); | |
47 | + if (url.equals("https://www.sciengine.com/plat/search?queryField_a=battery")) { | |
48 | + getMaxPage(page); | |
49 | + } else if (url.equals("https://www.sciengine.com/SciSearch/searchNew")) { | |
50 | + everyPage(page); | |
51 | + } else if (url.contains("https://www.sciengine.com/restData/initArticle?")) { | |
52 | + try { | |
53 | + getPageDetail(page); | |
54 | + } catch (JsonProcessingException e) { | |
55 | + throw new RuntimeException(e); | |
56 | + } | |
57 | + } | |
58 | + } | |
59 | + | |
60 | + @Override | |
61 | + public Site getSite() { | |
62 | + return site; | |
63 | + } | |
64 | + | |
65 | + void getMaxPage(Page page) { | |
66 | + for (int i = 1; i <=490 ; i++) { | |
67 | + String baseUrl = "https://www.sciengine.com/SciSearch/searchNew"; | |
68 | + map.put("queryField_a", "battery"); | |
69 | + map.put("pageCount",10); | |
70 | + map.put("curpage",i); | |
71 | + httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | |
72 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | |
73 | + .addHeader("Content-Type", "application/x-www-form-urlencoded") | |
74 | + .addHeader("Connection", "keep-alive") | |
75 | + .addHeader("Cookie","_ga=GA1.1.12362349.1718065158; SHAREJSESSIONID=35fad62b-37db-455a-af7b-9e9eaac5e5bf; Hm_lvt_633c662645ea15827301cdfaf39e48a1=1718171741; retrievalHistory=%5B%7B%22title%22%3A%22battery%22%7D%5D; Hm_lpvt_633c662645ea15827301cdfaf39e48a1=1718172306; _ga_SB5SCK5F77=GS1.1.1718170247.7.1.1718172335.0.0.0") | |
76 | + .addHeader("Host","www.sciengine.com") | |
77 | + .addHeader("Accept-Encoding", "gzip, deflate, br") | |
78 | + .addHeader("Accept", "*/*") | |
79 | + .addHeader("Origin","https://www.sciengine.com") | |
80 | + .addHeader("Referer","https://www.sciengine.com/plat/search?queryField_a=battery") | |
81 | + .setCharset("UTF-8"); | |
82 | + request.setRequestBody(httpRequestBody); | |
83 | + page.addTargetRequest(request); | |
84 | + } | |
85 | + } | |
86 | + | |
87 | + void everyPage(Page page) { | |
88 | + String rawText = page.getRawText(); | |
89 | + List<String> BaseIdList = JsonPath.read(rawText, "$.list[*].id"); | |
90 | + List<String> doiList = JsonPath.read(rawText, "$.list[*].doi"); | |
91 | + if (BaseIdList.size() == doiList.size()) { | |
92 | + for (int i = 0; i < BaseIdList.size(); i++) { | |
93 | + String baseId = BaseIdList.get(i); | |
94 | + String doi = doiList.get(i); | |
95 | + log.info("baseId:"+baseId+",doi:"+doi); | |
96 | + page.addTargetRequest("https://www.sciengine.com/restData/initArticle?doi="+doi+"&articleBaseId="+baseId); | |
97 | + } | |
98 | + } else { | |
99 | + throw new RuntimeException("匹配不成功"); | |
100 | + } | |
101 | + } | |
102 | + | |
103 | + void getPageDetail(Page page) throws JsonProcessingException { | |
104 | + String rawText = page.getRawText(); | |
105 | + //文章链接 | |
106 | + String articleCode = page.getUrl().get(); | |
107 | + | |
108 | + //文章标题 | |
109 | + String title = JsonPath.read(rawText, "$.article.title"); | |
110 | + | |
111 | + //文章内容 | |
112 | + String articleDesc = JsonPath.read(rawText, "$.article.intro"); | |
113 | + | |
114 | + //时间 | |
115 | + Date publishTimeDateTime = null; | |
116 | + String publishTime = JsonPath.read(rawText, "$.article.pubDateStr"); | |
117 | + SimpleDateFormat formatter = new SimpleDateFormat("MMMM dd,yyyy", Locale.ENGLISH); | |
118 | + try { | |
119 | + publishTimeDateTime = formatter.parse(publishTime); | |
120 | + } catch (ParseException e) { | |
121 | + e.printStackTrace(); | |
122 | + } | |
123 | + | |
124 | + //作者名字 | |
125 | + List<String> authors = JsonPath.read(rawText, "$.authorList[*].fullName"); | |
126 | + StringBuffer authorName = new StringBuffer(); | |
127 | + authors.forEach(authorName::append); | |
128 | + | |
129 | + | |
130 | + //邮箱和地址 | |
131 | + JSONArray authorMail = new JSONArray(); | |
132 | + ObjectMapper objectMapper = new ObjectMapper(); | |
133 | + net.minidev.json.JSONArray authorJsonArray = JsonPath.read(rawText, "$.authorList"); | |
134 | + JSONArray fastJsonArray = JSON.parseArray(authorJsonArray.toJSONString()); | |
135 | + SciengineAuthorDo[] authorList = objectMapper.readValue(fastJsonArray.toJSONString(), SciengineAuthorDo[].class); | |
136 | + | |
137 | + JSONArray authorAddress = new JSONArray(); | |
138 | + boolean isNotAddress = true; | |
139 | + for (SciengineAuthorDo author : authorList) { | |
140 | + JSONObject mailObj = new JSONObject(); | |
141 | + if (author.getAuthorNoteList() != null) { | |
142 | + mailObj.put("authorEmailName", author.getFullName()); | |
143 | + mailObj.put("email", author.getAuthorNoteList().get(0).getEmail()); | |
144 | + } else { | |
145 | + } | |
146 | + if (!mailObj.isEmpty()){ | |
147 | + authorMail.add(mailObj); | |
148 | + } | |
149 | + JSONObject addressObj = new JSONObject(); | |
150 | + if (author.getAffsList() != null) { | |
151 | + addressObj.put("address", author.getAffsList().get(0).getAffText()); | |
152 | + addressObj.put("authorNames", author.getFullName()); | |
153 | + isNotAddress = false; | |
154 | + authorAddress.add(addressObj); | |
155 | + } else { | |
156 | + } | |
157 | + } | |
158 | + | |
159 | + ObjectMapper affMapper = new ObjectMapper(); | |
160 | + net.minidev.json.JSONArray affList = JsonPath.read(rawText, "$.affList"); | |
161 | + JSONArray affListJsonArray = JSON.parseArray(affList.toJSONString()); | |
162 | + SciengineAffsListDo[] affListArray = affMapper.readValue(affListJsonArray.toJSONString(), SciengineAffsListDo[].class); | |
163 | + if (isNotAddress && CollectionUtils.isNotEmpty(Arrays.asList(affListArray))) { | |
164 | + JSONObject addressObj = new JSONObject(); | |
165 | + addressObj.put("authorNames", authorName); | |
166 | + List addressList = new ArrayList(); | |
167 | + for (SciengineAffsListDo sciengineAffsListDo : affListArray) { | |
168 | + if (sciengineAffsListDo.getAffText() != null) { | |
169 | + addressList.add(sciengineAffsListDo.getAffText()); | |
170 | + } | |
171 | + } | |
172 | + addressObj.put("address", addressList); | |
173 | + authorAddress.add(addressObj); | |
174 | + } | |
175 | + | |
176 | + //引用文献 | |
177 | + JSONArray references = new JSONArray(); | |
178 | + ObjectMapper referenceMapper = new ObjectMapper(); | |
179 | + net.minidev.json.JSONArray refListJsonArray = JsonPath.read(rawText, "$.article.referenceList"); | |
180 | + JSONArray referenceListJsonArray = JSON.parseArray(refListJsonArray.toJSONString()); | |
181 | + SciengineReferenceListDo[] referenceList = referenceMapper.readValue(referenceListJsonArray.toJSONString(), SciengineReferenceListDo[].class); | |
182 | + for (SciengineReferenceListDo sciengineReferenceListDo : referenceList) { | |
183 | + StringBuffer referenceTitle = new StringBuffer(); | |
184 | + JSONObject referencesObj = new JSONObject(); | |
185 | + ArrayList<Object> herfList = new ArrayList<>(); | |
186 | + herfList.add(sciengineReferenceListDo.getTitle()); | |
187 | + referenceTitle.append("https://www.sciengine.com/JAS/doi/" + sciengineReferenceListDo.getDoi()); | |
188 | + referencesObj.put("links", herfList); | |
189 | + referencesObj.put("referenceTitle", referenceTitle); | |
190 | + references.add(referencesObj); | |
191 | + } | |
192 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString()); | |
193 | + page.putField("article", ArticleDO.builder() | |
194 | + .articleType(ArticleTypeEnum.Sciengine.getType()) | |
195 | + .articleCode(articleCode) | |
196 | + .authorName(authorName.toString()) | |
197 | + .title(title) | |
198 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
199 | + .emailInfo(authorMail.toJSONString()) | |
200 | + .articleDesc(articleDesc) | |
201 | + .authorAddress(authorAddress.toJSONString()) | |
202 | + .referenceInfo(references.toJSONString()).build()); | |
203 | + } | |
204 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... | ... | @@ -17,6 +17,8 @@ import us.codecraft.webmagic.selector.PlainText; |
17 | 17 | |
18 | 18 | import javax.annotation.Resource; |
19 | 19 | import java.util.Map; |
20 | +import java.util.regex.Matcher; | |
21 | +import java.util.regex.Pattern; | |
20 | 22 | |
21 | 23 | /** |
22 | 24 | * @author: xms |
... | ... | @@ -43,6 +45,13 @@ public class SeleniumDownloader extends AbstractDownloader { |
43 | 45 | return this; |
44 | 46 | } |
45 | 47 | |
48 | + public static boolean checkUrl(String url) { | |
49 | + String pattern = "https://id.elsevier.com/as/[a-zA-Z0-9]+/resume/as/"; | |
50 | + Pattern r = Pattern.compile(pattern); | |
51 | + Matcher m = r.matcher(url); | |
52 | + return m.find(); | |
53 | + } | |
54 | + | |
46 | 55 | @Override |
47 | 56 | public Page download(Request request, Task task) { |
48 | 57 | Page page = Page.fail(); |
... | ... | @@ -78,7 +87,26 @@ public class SeleniumDownloader extends AbstractDownloader { |
78 | 87 | // cloudFlare.click(); |
79 | 88 | // } |
80 | 89 | } |
81 | - if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){ | |
90 | + if (request.getUrl().equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) { | |
91 | + WebDriverWait wait = new WebDriverWait(webDriver, 60); | |
92 | + Boolean until = wait.until(ExpectedConditions.urlContains("https://id.elsevier.com/")); | |
93 | + if (until) { | |
94 | + log.info(webDriver.getCurrentUrl()); | |
95 | +// if (checkUrl(webDriver.getCurrentUrl())) { | |
96 | + String currentUrl = webDriver.getCurrentUrl(); | |
97 | + log.info("currentUrl=" + currentUrl); | |
98 | + String pageSource = webDriver.getPageSource(); | |
99 | + log.info(pageSource); | |
100 | +// WebElement element = webDriver.findElement(By.xpath("//div[@class='form-row']/from/div[@id='jsEnabled']/input")); | |
101 | + webDriver.findElement(By.xpath("//a[@class='ot-sdk-show-settings cookie anchor-text']")).click(); | |
102 | + webDriver.findElement(By.xpath("//input[@id='bdd-email']")).getAttribute("1187551704@qq.com"); | |
103 | + webDriver.findElement(By.xpath("//button[@id='bdd-els-searchBtn']")).submit(); | |
104 | +// String text = element.getText(); | |
105 | +// log.info(text); | |
106 | +// } | |
107 | + } else { | |
108 | + log.info("跳转失败"); | |
109 | + } | |
82 | 110 | |
83 | 111 | } |
84 | 112 | try { |
... | ... | @@ -116,7 +144,7 @@ public class SeleniumDownloader extends AbstractDownloader { |
116 | 144 | } finally { |
117 | 145 | if (webDriver != null) { |
118 | 146 | webDriver.close(); |
119 | -// webDriver.quit(); | |
147 | + webDriver.quit(); | |
120 | 148 | webDriver = null; |
121 | 149 | } |
122 | 150 | } | ... | ... |
src/main/java/com/canrd/webmagic/util/FieldUtil.java
0 → 100644
1 | +package com.canrd.webmagic.util; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSON; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import org.springframework.util.StringUtils; | |
6 | + | |
7 | +public class FieldUtil { | |
8 | + public static Boolean isExistField(String field, Object obj) { | |
9 | + if (obj == null || StringUtils.isEmpty(field)) { | |
10 | + return null; | |
11 | + } | |
12 | + Object o = JSON.toJSON(obj); | |
13 | + JSONObject jsonObj = new JSONObject(); | |
14 | + if (o instanceof JSONObject) { | |
15 | + jsonObj = (JSONObject) o; | |
16 | + } | |
17 | + return jsonObj.containsKey(field); | |
18 | + } | |
19 | +} | ... | ... |
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/ScienceDirectController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/SciengineController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAffsListDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienginePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/util/FieldUtil.class
0 → 100644
No preview for this file type