Commit 965b4941c8d3c216a620dcf5f896c9490459a00e
1 parent
432ddd72
Sciengine
Showing
27 changed files
with
516 additions
and
4 deletions
Too many changes to show.
To preserve performance only 27 of 33 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
@@ -4,6 +4,11 @@ | @@ -4,6 +4,11 @@ | ||
4 | <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true"> | 4 | <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true"> |
5 | <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" /> | 5 | <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" /> |
6 | </inspection_tool> | 6 | </inspection_tool> |
7 | + <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true"> | ||
8 | + <Languages> | ||
9 | + <language minSize="757" name="Java" /> | ||
10 | + </Languages> | ||
11 | + </inspection_tool> | ||
7 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> | 12 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
8 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> | 13 | <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> |
9 | </profile> | 14 | </profile> |
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
@@ -41,7 +41,7 @@ public class ChemicalController { | @@ -41,7 +41,7 @@ public class ChemicalController { | ||
41 | // .setScheduler(new RedisScheduler("127.0.0.1")) | 41 | // .setScheduler(new RedisScheduler("127.0.0.1")) |
42 | .addPipeline(articlePipeline) | 42 | .addPipeline(articlePipeline) |
43 | // .setDownloader(seleniumDownloader) | 43 | // .setDownloader(seleniumDownloader) |
44 | - .thread(1).run(); | 44 | + .thread(100).run(); |
45 | return ServerResult.success(); | 45 | return ServerResult.success(); |
46 | } | 46 | } |
47 | } | 47 | } |
src/main/java/com/canrd/webmagic/controller/ScienceDirectController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.ScienceDirectSearchPcoessor; | ||
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Request; | ||
14 | +import us.codecraft.webmagic.Spider; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/sciencedirect/article") | ||
20 | +@Api("Science-Direct") | ||
21 | +public class ScienceDirectController { | ||
22 | + @Resource | ||
23 | + private ScienceDirectSearchPcoessor scienceDirectSearchPcoessor; | ||
24 | + | ||
25 | + @Resource | ||
26 | + private SeleniumDownloader seleniumDownloader; | ||
27 | + | ||
28 | + @Resource | ||
29 | + private ArticlePipeline articlePipeline; | ||
30 | + | ||
31 | + @GetMapping("/start") | ||
32 | + @ApiOperation("start") | ||
33 | + public ServerResult start() { | ||
34 | + Spider.create(scienceDirectSearchPcoessor) | ||
35 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
36 | +// .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery")) | ||
37 | + .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) | ||
38 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | ||
39 | +// .addPipeline(articlePipeline) | ||
40 | + .setDownloader(seleniumDownloader) | ||
41 | + .thread(20).run(); | ||
42 | + return ServerResult.success(); | ||
43 | + } | ||
44 | +} |
src/main/java/com/canrd/webmagic/controller/SciengineController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.ScienginePcoessor; | ||
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Request; | ||
14 | +import us.codecraft.webmagic.Spider; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/sciengine/article") | ||
20 | +@Api("Sciengine") | ||
21 | +public class SciengineController { | ||
22 | + @Resource | ||
23 | + private ScienginePcoessor scienginePcoessor; | ||
24 | + | ||
25 | + @Resource | ||
26 | + private SeleniumDownloader seleniumDownloader; | ||
27 | + | ||
28 | + @Resource | ||
29 | + private ArticlePipeline articlePipeline; | ||
30 | + | ||
31 | + @GetMapping("/start") | ||
32 | + @ApiOperation("start") | ||
33 | + public ServerResult start() { | ||
34 | + Spider.create(scienginePcoessor) | ||
35 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
36 | + .addRequest(new Request("https://www.sciengine.com/plat/search?queryField_a=battery")) | ||
37 | + .addPipeline(articlePipeline) | ||
38 | + .thread(20).run(); | ||
39 | + return ServerResult.success(); | ||
40 | + } | ||
41 | +} |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -25,7 +25,8 @@ public enum ArticleTypeEnum { | @@ -25,7 +25,8 @@ public enum ArticleTypeEnum { | ||
25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), | 25 | NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), |
26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), | 26 | NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), |
27 | NATURE_METHODS("nature-methods","nuture网站-methods"), | 27 | NATURE_METHODS("nature-methods","nuture网站-methods"), |
28 | - Chemical("chemical","chemical网站") | 28 | + Chemical("chemical","chemical网站"), |
29 | + Sciengine("sciengine","sciengine网站") | ||
29 | ; | 30 | ; |
30 | private String type; | 31 | private String type; |
31 | private String desc; | 32 | private String desc; |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAffsListDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | ||
2 | + | ||
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | ||
4 | +import lombok.AllArgsConstructor; | ||
5 | +import lombok.Data; | ||
6 | +import lombok.NoArgsConstructor; | ||
7 | + | ||
8 | +@Data | ||
9 | +@NoArgsConstructor | ||
10 | +@AllArgsConstructor | ||
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | ||
12 | +public class SciengineAffsListDo { | ||
13 | + private boolean isNewRecord; | ||
14 | + private String affText; | ||
15 | + private String labelFlag; | ||
16 | +} |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | ||
2 | + | ||
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | ||
4 | +import lombok.AllArgsConstructor; | ||
5 | +import lombok.Data; | ||
6 | +import lombok.NoArgsConstructor; | ||
7 | + | ||
8 | +import java.util.List; | ||
9 | + | ||
10 | +@Data | ||
11 | +@NoArgsConstructor | ||
12 | +@AllArgsConstructor | ||
13 | +@JsonIgnoreProperties(ignoreUnknown = true) | ||
14 | +public class SciengineAuthorDo { | ||
15 | + private boolean isNewRecord; | ||
16 | + private String surName; | ||
17 | + private String fullName; | ||
18 | + private String givenName; | ||
19 | + private String label; | ||
20 | + private List<SciengineAffsListDo> affsList; | ||
21 | + private List<SciengineAuthorNoteDo> authorNoteList; | ||
22 | +// private SciengineAuthorNoteDo authorNoteList; | ||
23 | +} | ||
0 | \ No newline at end of file | 24 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | ||
2 | + | ||
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | ||
4 | +import lombok.AllArgsConstructor; | ||
5 | +import lombok.Data; | ||
6 | +import lombok.NoArgsConstructor; | ||
7 | + | ||
8 | +@Data | ||
9 | +@NoArgsConstructor | ||
10 | +@AllArgsConstructor | ||
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | ||
12 | +public class SciengineAuthorNoteDo { | ||
13 | + private boolean isNewRecord; | ||
14 | + private String labelFlag; | ||
15 | + private String note; | ||
16 | + private String email; | ||
17 | + | ||
18 | + // 构造函数、getter和setter方法 | ||
19 | +} | ||
0 | \ No newline at end of file | 20 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.java
0 → 100644
1 | +package com.canrd.webmagic.domain.dto; | ||
2 | + | ||
3 | +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | ||
4 | +import lombok.AllArgsConstructor; | ||
5 | +import lombok.Data; | ||
6 | +import lombok.NoArgsConstructor; | ||
7 | + | ||
8 | +@Data | ||
9 | +@NoArgsConstructor | ||
10 | +@AllArgsConstructor | ||
11 | +@JsonIgnoreProperties(ignoreUnknown = true) | ||
12 | +public class SciengineReferenceListDo { | ||
13 | + private String id; | ||
14 | + private boolean isNewRecord; | ||
15 | + private String refArticleId; | ||
16 | + private String title; | ||
17 | + private String flay; | ||
18 | + private int sort; | ||
19 | + private String doi; | ||
20 | +} |
src/main/java/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import lombok.extern.slf4j.Slf4j; | ||
4 | +import org.springframework.stereotype.Component; | ||
5 | +import us.codecraft.webmagic.Page; | ||
6 | +import us.codecraft.webmagic.Site; | ||
7 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
8 | +import us.codecraft.webmagic.selector.Html; | ||
9 | + | ||
10 | +@Slf4j | ||
11 | +@Component | ||
12 | +public class ScienceDirectArticlePcoessor implements PageProcessor { | ||
13 | + | ||
14 | + @Override | ||
15 | + public void process(Page page) { | ||
16 | + String url = page.getUrl().get(); | ||
17 | + if (url.equals("https://www.sciencedirect.com/journal/nano-today")){ | ||
18 | + findSearch(page); | ||
19 | + }else if (url.equals("https://www.sciencedirect.com/search?qs=battery")){ | ||
20 | + getMaxPage(page); | ||
21 | + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){ | ||
22 | + everyPage(page); | ||
23 | + } | ||
24 | + } | ||
25 | + | ||
26 | + @Override | ||
27 | + public Site getSite() { | ||
28 | + return PageProcessor.super.getSite(); | ||
29 | + } | ||
30 | + | ||
31 | + void findSearch(Page page){ | ||
32 | + Html html = page.getHtml(); | ||
33 | + page.putField("html",html); | ||
34 | + } | ||
35 | + | ||
36 | + void getMaxPage(Page page){ | ||
37 | + Html html = page.getHtml(); | ||
38 | +// move-right | ||
39 | + page.putField("html",html); | ||
40 | + } | ||
41 | + | ||
42 | + void everyPage(Page page){ | ||
43 | + Html html = page.getHtml(); | ||
44 | + page.putField("html",html); | ||
45 | + } | ||
46 | +} |
src/main/java/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import lombok.extern.slf4j.Slf4j; | ||
4 | +import org.springframework.stereotype.Component; | ||
5 | +import us.codecraft.webmagic.Page; | ||
6 | +import us.codecraft.webmagic.Site; | ||
7 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
8 | +import us.codecraft.webmagic.selector.Html; | ||
9 | + | ||
10 | +@Slf4j | ||
11 | +@Component | ||
12 | +public class ScienceDirectSearchPcoessor implements PageProcessor { | ||
13 | + | ||
14 | + @Override | ||
15 | + public void process(Page page) { | ||
16 | + String url = page.getUrl().get(); | ||
17 | + if (url.equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")){ | ||
18 | + getMaxPage(page); | ||
19 | + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){ | ||
20 | + everyPage(page); | ||
21 | + } else if (url.contains("https://www.sciencedirect.com/science/article/abs/pii")) { | ||
22 | + getPageDetail(page); | ||
23 | + }else if (url.equals("https://www.sciencedirect.com/journal/nano-today")){ | ||
24 | + | ||
25 | + } | ||
26 | + } | ||
27 | + | ||
28 | + @Override | ||
29 | + public Site getSite() { | ||
30 | + return PageProcessor.super.getSite(); | ||
31 | + } | ||
32 | + | ||
33 | + void getMaxPage(Page page){ | ||
34 | + Html html = page.getHtml(); | ||
35 | +// move-right | ||
36 | + page.putField("html",html); | ||
37 | + } | ||
38 | + | ||
39 | + void everyPage(Page page){ | ||
40 | + | ||
41 | + } | ||
42 | + | ||
43 | + void getPageDetail(Page page){ | ||
44 | + | ||
45 | + } | ||
46 | +} |
src/main/java/com/canrd/webmagic/processor/ScienginePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSON; | ||
4 | +import com.alibaba.fastjson.JSONArray; | ||
5 | +import com.alibaba.fastjson.JSONObject; | ||
6 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
7 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
8 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
9 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
10 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
11 | +import com.canrd.webmagic.domain.dto.SciengineAffsListDo; | ||
12 | +import com.canrd.webmagic.domain.dto.SciengineAuthorDo; | ||
13 | +import com.canrd.webmagic.domain.dto.SciengineReferenceListDo; | ||
14 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
15 | +import com.fasterxml.jackson.core.JsonProcessingException; | ||
16 | +import com.fasterxml.jackson.databind.ObjectMapper; | ||
17 | +import com.jayway.jsonpath.JsonPath; | ||
18 | +import lombok.extern.slf4j.Slf4j; | ||
19 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
20 | +import org.springframework.stereotype.Component; | ||
21 | +import us.codecraft.webmagic.Page; | ||
22 | +import us.codecraft.webmagic.Request; | ||
23 | +import us.codecraft.webmagic.Site; | ||
24 | +import us.codecraft.webmagic.Spider; | ||
25 | +import us.codecraft.webmagic.model.HttpRequestBody; | ||
26 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
27 | +import us.codecraft.webmagic.selector.Html; | ||
28 | +import us.codecraft.webmagic.utils.HttpConstant; | ||
29 | + | ||
30 | +import java.text.ParseException; | ||
31 | +import java.text.SimpleDateFormat; | ||
32 | +import java.util.*; | ||
33 | +import java.util.concurrent.ConcurrentHashMap; | ||
34 | + | ||
35 | +@Slf4j | ||
36 | +@Component | ||
37 | +public class ScienginePcoessor implements PageProcessor { | ||
38 | + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>(); | ||
39 | + | ||
40 | + private HttpRequestBody httpRequestBody; | ||
41 | + | ||
42 | + private final Site site = Site.me().setTimeOut(30000); | ||
43 | + | ||
44 | + @Override | ||
45 | + public void process(Page page) { | ||
46 | + String url = page.getUrl().get(); | ||
47 | + if (url.equals("https://www.sciengine.com/plat/search?queryField_a=battery")) { | ||
48 | + getMaxPage(page); | ||
49 | + } else if (url.equals("https://www.sciengine.com/SciSearch/searchNew")) { | ||
50 | + everyPage(page); | ||
51 | + } else if (url.contains("https://www.sciengine.com/restData/initArticle?")) { | ||
52 | + try { | ||
53 | + getPageDetail(page); | ||
54 | + } catch (JsonProcessingException e) { | ||
55 | + throw new RuntimeException(e); | ||
56 | + } | ||
57 | + } | ||
58 | + } | ||
59 | + | ||
60 | + @Override | ||
61 | + public Site getSite() { | ||
62 | + return site; | ||
63 | + } | ||
64 | + | ||
65 | + void getMaxPage(Page page) { | ||
66 | + for (int i = 1; i <=490 ; i++) { | ||
67 | + String baseUrl = "https://www.sciengine.com/SciSearch/searchNew"; | ||
68 | + map.put("queryField_a", "battery"); | ||
69 | + map.put("pageCount",10); | ||
70 | + map.put("curpage",i); | ||
71 | + httpRequestBody = HttpRequestBody.form(map, "UTF-8"); | ||
72 | + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST) | ||
73 | + .addHeader("Content-Type", "application/x-www-form-urlencoded") | ||
74 | + .addHeader("Connection", "keep-alive") | ||
75 | + .addHeader("Cookie","_ga=GA1.1.12362349.1718065158; SHAREJSESSIONID=35fad62b-37db-455a-af7b-9e9eaac5e5bf; Hm_lvt_633c662645ea15827301cdfaf39e48a1=1718171741; retrievalHistory=%5B%7B%22title%22%3A%22battery%22%7D%5D; Hm_lpvt_633c662645ea15827301cdfaf39e48a1=1718172306; _ga_SB5SCK5F77=GS1.1.1718170247.7.1.1718172335.0.0.0") | ||
76 | + .addHeader("Host","www.sciengine.com") | ||
77 | + .addHeader("Accept-Encoding", "gzip, deflate, br") | ||
78 | + .addHeader("Accept", "*/*") | ||
79 | + .addHeader("Origin","https://www.sciengine.com") | ||
80 | + .addHeader("Referer","https://www.sciengine.com/plat/search?queryField_a=battery") | ||
81 | + .setCharset("UTF-8"); | ||
82 | + request.setRequestBody(httpRequestBody); | ||
83 | + page.addTargetRequest(request); | ||
84 | + } | ||
85 | + } | ||
86 | + | ||
87 | + void everyPage(Page page) { | ||
88 | + String rawText = page.getRawText(); | ||
89 | + List<String> BaseIdList = JsonPath.read(rawText, "$.list[*].id"); | ||
90 | + List<String> doiList = JsonPath.read(rawText, "$.list[*].doi"); | ||
91 | + if (BaseIdList.size() == doiList.size()) { | ||
92 | + for (int i = 0; i < BaseIdList.size(); i++) { | ||
93 | + String baseId = BaseIdList.get(i); | ||
94 | + String doi = doiList.get(i); | ||
95 | + log.info("baseId:"+baseId+",doi:"+doi); | ||
96 | + page.addTargetRequest("https://www.sciengine.com/restData/initArticle?doi="+doi+"&articleBaseId="+baseId); | ||
97 | + } | ||
98 | + } else { | ||
99 | + throw new RuntimeException("匹配不成功"); | ||
100 | + } | ||
101 | + } | ||
102 | + | ||
103 | + void getPageDetail(Page page) throws JsonProcessingException { | ||
104 | + String rawText = page.getRawText(); | ||
105 | + //文章链接 | ||
106 | + String articleCode = page.getUrl().get(); | ||
107 | + | ||
108 | + //文章标题 | ||
109 | + String title = JsonPath.read(rawText, "$.article.title"); | ||
110 | + | ||
111 | + //文章内容 | ||
112 | + String articleDesc = JsonPath.read(rawText, "$.article.intro"); | ||
113 | + | ||
114 | + //时间 | ||
115 | + Date publishTimeDateTime = null; | ||
116 | + String publishTime = JsonPath.read(rawText, "$.article.pubDateStr"); | ||
117 | + SimpleDateFormat formatter = new SimpleDateFormat("MMMM dd,yyyy", Locale.ENGLISH); | ||
118 | + try { | ||
119 | + publishTimeDateTime = formatter.parse(publishTime); | ||
120 | + } catch (ParseException e) { | ||
121 | + e.printStackTrace(); | ||
122 | + } | ||
123 | + | ||
124 | + //作者名字 | ||
125 | + List<String> authors = JsonPath.read(rawText, "$.authorList[*].fullName"); | ||
126 | + StringBuffer authorName = new StringBuffer(); | ||
127 | + authors.forEach(authorName::append); | ||
128 | + | ||
129 | + | ||
130 | + //邮箱和地址 | ||
131 | + JSONArray authorMail = new JSONArray(); | ||
132 | + ObjectMapper objectMapper = new ObjectMapper(); | ||
133 | + net.minidev.json.JSONArray authorJsonArray = JsonPath.read(rawText, "$.authorList"); | ||
134 | + JSONArray fastJsonArray = JSON.parseArray(authorJsonArray.toJSONString()); | ||
135 | + SciengineAuthorDo[] authorList = objectMapper.readValue(fastJsonArray.toJSONString(), SciengineAuthorDo[].class); | ||
136 | + | ||
137 | + JSONArray authorAddress = new JSONArray(); | ||
138 | + boolean isNotAddress = true; | ||
139 | + for (SciengineAuthorDo author : authorList) { | ||
140 | + JSONObject mailObj = new JSONObject(); | ||
141 | + if (author.getAuthorNoteList() != null) { | ||
142 | + mailObj.put("authorEmailName", author.getFullName()); | ||
143 | + mailObj.put("email", author.getAuthorNoteList().get(0).getEmail()); | ||
144 | + } else { | ||
145 | + } | ||
146 | + if (!mailObj.isEmpty()){ | ||
147 | + authorMail.add(mailObj); | ||
148 | + } | ||
149 | + JSONObject addressObj = new JSONObject(); | ||
150 | + if (author.getAffsList() != null) { | ||
151 | + addressObj.put("address", author.getAffsList().get(0).getAffText()); | ||
152 | + addressObj.put("authorNames", author.getFullName()); | ||
153 | + isNotAddress = false; | ||
154 | + authorAddress.add(addressObj); | ||
155 | + } else { | ||
156 | + } | ||
157 | + } | ||
158 | + | ||
159 | + ObjectMapper affMapper = new ObjectMapper(); | ||
160 | + net.minidev.json.JSONArray affList = JsonPath.read(rawText, "$.affList"); | ||
161 | + JSONArray affListJsonArray = JSON.parseArray(affList.toJSONString()); | ||
162 | + SciengineAffsListDo[] affListArray = affMapper.readValue(affListJsonArray.toJSONString(), SciengineAffsListDo[].class); | ||
163 | + if (isNotAddress && CollectionUtils.isNotEmpty(Arrays.asList(affListArray))) { | ||
164 | + JSONObject addressObj = new JSONObject(); | ||
165 | + addressObj.put("authorNames", authorName); | ||
166 | + List addressList = new ArrayList(); | ||
167 | + for (SciengineAffsListDo sciengineAffsListDo : affListArray) { | ||
168 | + if (sciengineAffsListDo.getAffText() != null) { | ||
169 | + addressList.add(sciengineAffsListDo.getAffText()); | ||
170 | + } | ||
171 | + } | ||
172 | + addressObj.put("address", addressList); | ||
173 | + authorAddress.add(addressObj); | ||
174 | + } | ||
175 | + | ||
176 | + //引用文献 | ||
177 | + JSONArray references = new JSONArray(); | ||
178 | + ObjectMapper referenceMapper = new ObjectMapper(); | ||
179 | + net.minidev.json.JSONArray refListJsonArray = JsonPath.read(rawText, "$.article.referenceList"); | ||
180 | + JSONArray referenceListJsonArray = JSON.parseArray(refListJsonArray.toJSONString()); | ||
181 | + SciengineReferenceListDo[] referenceList = referenceMapper.readValue(referenceListJsonArray.toJSONString(), SciengineReferenceListDo[].class); | ||
182 | + for (SciengineReferenceListDo sciengineReferenceListDo : referenceList) { | ||
183 | + StringBuffer referenceTitle = new StringBuffer(); | ||
184 | + JSONObject referencesObj = new JSONObject(); | ||
185 | + ArrayList<Object> herfList = new ArrayList<>(); | ||
186 | + herfList.add(sciengineReferenceListDo.getTitle()); | ||
187 | + referenceTitle.append("https://www.sciengine.com/JAS/doi/" + sciengineReferenceListDo.getDoi()); | ||
188 | + referencesObj.put("links", herfList); | ||
189 | + referencesObj.put("referenceTitle", referenceTitle); | ||
190 | + references.add(referencesObj); | ||
191 | + } | ||
192 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString()); | ||
193 | + page.putField("article", ArticleDO.builder() | ||
194 | + .articleType(ArticleTypeEnum.Sciengine.getType()) | ||
195 | + .articleCode(articleCode) | ||
196 | + .authorName(authorName.toString()) | ||
197 | + .title(title) | ||
198 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
199 | + .emailInfo(authorMail.toJSONString()) | ||
200 | + .articleDesc(articleDesc) | ||
201 | + .authorAddress(authorAddress.toJSONString()) | ||
202 | + .referenceInfo(references.toJSONString()).build()); | ||
203 | + } | ||
204 | +} |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -17,6 +17,8 @@ import us.codecraft.webmagic.selector.PlainText; | @@ -17,6 +17,8 @@ import us.codecraft.webmagic.selector.PlainText; | ||
17 | 17 | ||
18 | import javax.annotation.Resource; | 18 | import javax.annotation.Resource; |
19 | import java.util.Map; | 19 | import java.util.Map; |
20 | +import java.util.regex.Matcher; | ||
21 | +import java.util.regex.Pattern; | ||
20 | 22 | ||
21 | /** | 23 | /** |
22 | * @author: xms | 24 | * @author: xms |
@@ -43,6 +45,13 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -43,6 +45,13 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
43 | return this; | 45 | return this; |
44 | } | 46 | } |
45 | 47 | ||
48 | + public static boolean checkUrl(String url) { | ||
49 | + String pattern = "https://id.elsevier.com/as/[a-zA-Z0-9]+/resume/as/"; | ||
50 | + Pattern r = Pattern.compile(pattern); | ||
51 | + Matcher m = r.matcher(url); | ||
52 | + return m.find(); | ||
53 | + } | ||
54 | + | ||
46 | @Override | 55 | @Override |
47 | public Page download(Request request, Task task) { | 56 | public Page download(Request request, Task task) { |
48 | Page page = Page.fail(); | 57 | Page page = Page.fail(); |
@@ -78,7 +87,26 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -78,7 +87,26 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
78 | // cloudFlare.click(); | 87 | // cloudFlare.click(); |
79 | // } | 88 | // } |
80 | } | 89 | } |
81 | - if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){ | 90 | + if (request.getUrl().equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) { |
91 | + WebDriverWait wait = new WebDriverWait(webDriver, 60); | ||
92 | + Boolean until = wait.until(ExpectedConditions.urlContains("https://id.elsevier.com/")); | ||
93 | + if (until) { | ||
94 | + log.info(webDriver.getCurrentUrl()); | ||
95 | +// if (checkUrl(webDriver.getCurrentUrl())) { | ||
96 | + String currentUrl = webDriver.getCurrentUrl(); | ||
97 | + log.info("currentUrl=" + currentUrl); | ||
98 | + String pageSource = webDriver.getPageSource(); | ||
99 | + log.info(pageSource); | ||
100 | +// WebElement element = webDriver.findElement(By.xpath("//div[@class='form-row']/from/div[@id='jsEnabled']/input")); | ||
101 | + webDriver.findElement(By.xpath("//a[@class='ot-sdk-show-settings cookie anchor-text']")).click(); | ||
102 | + webDriver.findElement(By.xpath("//input[@id='bdd-email']")).getAttribute("1187551704@qq.com"); | ||
103 | + webDriver.findElement(By.xpath("//button[@id='bdd-els-searchBtn']")).submit(); | ||
104 | +// String text = element.getText(); | ||
105 | +// log.info(text); | ||
106 | +// } | ||
107 | + } else { | ||
108 | + log.info("跳转失败"); | ||
109 | + } | ||
82 | 110 | ||
83 | } | 111 | } |
84 | try { | 112 | try { |
@@ -116,7 +144,7 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -116,7 +144,7 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
116 | } finally { | 144 | } finally { |
117 | if (webDriver != null) { | 145 | if (webDriver != null) { |
118 | webDriver.close(); | 146 | webDriver.close(); |
119 | -// webDriver.quit(); | 147 | + webDriver.quit(); |
120 | webDriver = null; | 148 | webDriver = null; |
121 | } | 149 | } |
122 | } | 150 | } |
src/main/java/com/canrd/webmagic/util/FieldUtil.java
0 → 100644
1 | +package com.canrd.webmagic.util; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSON; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import org.springframework.util.StringUtils; | ||
6 | + | ||
7 | +public class FieldUtil { | ||
8 | + public static Boolean isExistField(String field, Object obj) { | ||
9 | + if (obj == null || StringUtils.isEmpty(field)) { | ||
10 | + return null; | ||
11 | + } | ||
12 | + Object o = JSON.toJSON(obj); | ||
13 | + JSONObject jsonObj = new JSONObject(); | ||
14 | + if (o instanceof JSONObject) { | ||
15 | + jsonObj = (JSONObject) o; | ||
16 | + } | ||
17 | + return jsonObj.containsKey(field); | ||
18 | + } | ||
19 | +} |
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/ScienceDirectController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/SciengineController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAffsListDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienginePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/util/FieldUtil.class
0 → 100644
No preview for this file type