Commit 965b4941c8d3c216a620dcf5f896c9490459a00e

Authored by PurelzMgnead
1 parent 432ddd72

Sciengine

Showing 27 changed files with 516 additions and 4 deletions

Too many changes to show.

To preserve performance only 27 of 33 files are displayed.

.idea/inspectionProfiles/Project_Default.xml
... ... @@ -4,6 +4,11 @@
4 4 <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true">
5 5 <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" />
6 6 </inspection_tool>
  7 + <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
  8 + <Languages>
  9 + <language minSize="757" name="Java" />
  10 + </Languages>
  11 + </inspection_tool>
7 12 <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" />
8 13 <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" />
9 14 </profile>
... ...
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
... ... @@ -41,7 +41,7 @@ public class ChemicalController {
41 41 // .setScheduler(new RedisScheduler("127.0.0.1"))
42 42 .addPipeline(articlePipeline)
43 43 // .setDownloader(seleniumDownloader)
44   - .thread(1).run();
  44 + .thread(100).run();
45 45 return ServerResult.success();
46 46 }
47 47 }
... ...
src/main/java/com/canrd/webmagic/controller/ScienceDirectController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.ScienceDirectSearchPcoessor;
  5 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Request;
  14 +import us.codecraft.webmagic.Spider;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/sciencedirect/article")
  20 +@Api("Science-Direct")
  21 +public class ScienceDirectController {
  22 + @Resource
  23 + private ScienceDirectSearchPcoessor scienceDirectSearchPcoessor;
  24 +
  25 + @Resource
  26 + private SeleniumDownloader seleniumDownloader;
  27 +
  28 + @Resource
  29 + private ArticlePipeline articlePipeline;
  30 +
  31 + @GetMapping("/start")
  32 + @ApiOperation("start")
  33 + public ServerResult start() {
  34 + Spider.create(scienceDirectSearchPcoessor)
  35 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  36 +// .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery"))
  37 + .addRequest(new Request("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494"))
  38 +// .setScheduler(new RedisScheduler("127.0.0.1"))
  39 +// .addPipeline(articlePipeline)
  40 + .setDownloader(seleniumDownloader)
  41 + .thread(20).run();
  42 + return ServerResult.success();
  43 + }
  44 +}
... ...
src/main/java/com/canrd/webmagic/controller/SciengineController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.ScienginePcoessor;
  5 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Request;
  14 +import us.codecraft.webmagic.Spider;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/sciengine/article")
  20 +@Api("Sciengine")
  21 +public class SciengineController {
  22 + @Resource
  23 + private ScienginePcoessor scienginePcoessor;
  24 +
  25 + @Resource
  26 + private SeleniumDownloader seleniumDownloader;
  27 +
  28 + @Resource
  29 + private ArticlePipeline articlePipeline;
  30 +
  31 + @GetMapping("/start")
  32 + @ApiOperation("start")
  33 + public ServerResult start() {
  34 + Spider.create(scienginePcoessor)
  35 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  36 + .addRequest(new Request("https://www.sciengine.com/plat/search?queryField_a=battery"))
  37 + .addPipeline(articlePipeline)
  38 + .thread(20).run();
  39 + return ServerResult.success();
  40 + }
  41 +}
... ...
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... ... @@ -25,7 +25,8 @@ public enum ArticleTypeEnum {
25 25 NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"),
26 26 NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"),
27 27 NATURE_METHODS("nature-methods","nuture网站-methods"),
28   - Chemical("chemical","chemical网站")
  28 + Chemical("chemical","chemical网站"),
  29 + Sciengine("sciengine","sciengine网站")
29 30 ;
30 31 private String type;
31 32 private String desc;
... ...
src/main/java/com/canrd/webmagic/domain/dto/SciengineAffsListDo.java 0 → 100644
  1 +package com.canrd.webmagic.domain.dto;
  2 +
  3 +import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
  4 +import lombok.AllArgsConstructor;
  5 +import lombok.Data;
  6 +import lombok.NoArgsConstructor;
  7 +
  8 +@Data
  9 +@NoArgsConstructor
  10 +@AllArgsConstructor
  11 +@JsonIgnoreProperties(ignoreUnknown = true)
  12 +public class SciengineAffsListDo {
  13 + private boolean isNewRecord;
  14 + private String affText;
  15 + private String labelFlag;
  16 +}
... ...
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorDo.java 0 → 100644
  1 +package com.canrd.webmagic.domain.dto;
  2 +
  3 +import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
  4 +import lombok.AllArgsConstructor;
  5 +import lombok.Data;
  6 +import lombok.NoArgsConstructor;
  7 +
  8 +import java.util.List;
  9 +
  10 +@Data
  11 +@NoArgsConstructor
  12 +@AllArgsConstructor
  13 +@JsonIgnoreProperties(ignoreUnknown = true)
  14 +public class SciengineAuthorDo {
  15 + private boolean isNewRecord;
  16 + private String surName;
  17 + private String fullName;
  18 + private String givenName;
  19 + private String label;
  20 + private List<SciengineAffsListDo> affsList;
  21 + private List<SciengineAuthorNoteDo> authorNoteList;
  22 +// private SciengineAuthorNoteDo authorNoteList;
  23 +}
0 24 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.java 0 → 100644
  1 +package com.canrd.webmagic.domain.dto;
  2 +
  3 +import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
  4 +import lombok.AllArgsConstructor;
  5 +import lombok.Data;
  6 +import lombok.NoArgsConstructor;
  7 +
  8 +@Data
  9 +@NoArgsConstructor
  10 +@AllArgsConstructor
  11 +@JsonIgnoreProperties(ignoreUnknown = true)
  12 +public class SciengineAuthorNoteDo {
  13 + private boolean isNewRecord;
  14 + private String labelFlag;
  15 + private String note;
  16 + private String email;
  17 +
  18 + // 构造函数、getter和setter方法
  19 +}
0 20 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.java 0 → 100644
  1 +package com.canrd.webmagic.domain.dto;
  2 +
  3 +import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
  4 +import lombok.AllArgsConstructor;
  5 +import lombok.Data;
  6 +import lombok.NoArgsConstructor;
  7 +
  8 +@Data
  9 +@NoArgsConstructor
  10 +@AllArgsConstructor
  11 +@JsonIgnoreProperties(ignoreUnknown = true)
  12 +public class SciengineReferenceListDo {
  13 + private String id;
  14 + private boolean isNewRecord;
  15 + private String refArticleId;
  16 + private String title;
  17 + private String flay;
  18 + private int sort;
  19 + private String doi;
  20 +}
... ...
src/main/java/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import lombok.extern.slf4j.Slf4j;
  4 +import org.springframework.stereotype.Component;
  5 +import us.codecraft.webmagic.Page;
  6 +import us.codecraft.webmagic.Site;
  7 +import us.codecraft.webmagic.processor.PageProcessor;
  8 +import us.codecraft.webmagic.selector.Html;
  9 +
  10 +@Slf4j
  11 +@Component
  12 +public class ScienceDirectArticlePcoessor implements PageProcessor {
  13 +
  14 + @Override
  15 + public void process(Page page) {
  16 + String url = page.getUrl().get();
  17 + if (url.equals("https://www.sciencedirect.com/journal/nano-today")){
  18 + findSearch(page);
  19 + }else if (url.equals("https://www.sciencedirect.com/search?qs=battery")){
  20 + getMaxPage(page);
  21 + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){
  22 + everyPage(page);
  23 + }
  24 + }
  25 +
  26 + @Override
  27 + public Site getSite() {
  28 + return PageProcessor.super.getSite();
  29 + }
  30 +
  31 + void findSearch(Page page){
  32 + Html html = page.getHtml();
  33 + page.putField("html",html);
  34 + }
  35 +
  36 + void getMaxPage(Page page){
  37 + Html html = page.getHtml();
  38 +// move-right
  39 + page.putField("html",html);
  40 + }
  41 +
  42 + void everyPage(Page page){
  43 + Html html = page.getHtml();
  44 + page.putField("html",html);
  45 + }
  46 +}
... ...
src/main/java/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import lombok.extern.slf4j.Slf4j;
  4 +import org.springframework.stereotype.Component;
  5 +import us.codecraft.webmagic.Page;
  6 +import us.codecraft.webmagic.Site;
  7 +import us.codecraft.webmagic.processor.PageProcessor;
  8 +import us.codecraft.webmagic.selector.Html;
  9 +
  10 +@Slf4j
  11 +@Component
  12 +public class ScienceDirectSearchPcoessor implements PageProcessor {
  13 +
  14 + @Override
  15 + public void process(Page page) {
  16 + String url = page.getUrl().get();
  17 + if (url.equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")){
  18 + getMaxPage(page);
  19 + }else if (url.contains("https://www.sciencedirect.com/search?qs=battery&show=100&offset=")){
  20 + everyPage(page);
  21 + } else if (url.contains("https://www.sciencedirect.com/science/article/abs/pii")) {
  22 + getPageDetail(page);
  23 + }else if (url.equals("https://www.sciencedirect.com/journal/nano-today")){
  24 +
  25 + }
  26 + }
  27 +
  28 + @Override
  29 + public Site getSite() {
  30 + return PageProcessor.super.getSite();
  31 + }
  32 +
  33 + void getMaxPage(Page page){
  34 + Html html = page.getHtml();
  35 +// move-right
  36 + page.putField("html",html);
  37 + }
  38 +
  39 + void everyPage(Page page){
  40 +
  41 + }
  42 +
  43 + void getPageDetail(Page page){
  44 +
  45 + }
  46 +}
... ...
src/main/java/com/canrd/webmagic/processor/ScienginePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSON;
  4 +import com.alibaba.fastjson.JSONArray;
  5 +import com.alibaba.fastjson.JSONObject;
  6 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  7 +import com.canrd.webmagic.common.constant.ServerResult;
  8 +import com.canrd.webmagic.common.utils.DateUtil;
  9 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  10 +import com.canrd.webmagic.domain.dto.ArticleDO;
  11 +import com.canrd.webmagic.domain.dto.SciengineAffsListDo;
  12 +import com.canrd.webmagic.domain.dto.SciengineAuthorDo;
  13 +import com.canrd.webmagic.domain.dto.SciengineReferenceListDo;
  14 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  15 +import com.fasterxml.jackson.core.JsonProcessingException;
  16 +import com.fasterxml.jackson.databind.ObjectMapper;
  17 +import com.jayway.jsonpath.JsonPath;
  18 +import lombok.extern.slf4j.Slf4j;
  19 +import org.apache.logging.log4j.core.util.UuidUtil;
  20 +import org.springframework.stereotype.Component;
  21 +import us.codecraft.webmagic.Page;
  22 +import us.codecraft.webmagic.Request;
  23 +import us.codecraft.webmagic.Site;
  24 +import us.codecraft.webmagic.Spider;
  25 +import us.codecraft.webmagic.model.HttpRequestBody;
  26 +import us.codecraft.webmagic.processor.PageProcessor;
  27 +import us.codecraft.webmagic.selector.Html;
  28 +import us.codecraft.webmagic.utils.HttpConstant;
  29 +
  30 +import java.text.ParseException;
  31 +import java.text.SimpleDateFormat;
  32 +import java.util.*;
  33 +import java.util.concurrent.ConcurrentHashMap;
  34 +
  35 +@Slf4j
  36 +@Component
  37 +public class ScienginePcoessor implements PageProcessor {
  38 + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>();
  39 +
  40 + private HttpRequestBody httpRequestBody;
  41 +
  42 + private final Site site = Site.me().setTimeOut(30000);
  43 +
  44 + @Override
  45 + public void process(Page page) {
  46 + String url = page.getUrl().get();
  47 + if (url.equals("https://www.sciengine.com/plat/search?queryField_a=battery")) {
  48 + getMaxPage(page);
  49 + } else if (url.equals("https://www.sciengine.com/SciSearch/searchNew")) {
  50 + everyPage(page);
  51 + } else if (url.contains("https://www.sciengine.com/restData/initArticle?")) {
  52 + try {
  53 + getPageDetail(page);
  54 + } catch (JsonProcessingException e) {
  55 + throw new RuntimeException(e);
  56 + }
  57 + }
  58 + }
  59 +
  60 + @Override
  61 + public Site getSite() {
  62 + return site;
  63 + }
  64 +
  65 + void getMaxPage(Page page) {
  66 + for (int i = 1; i <=490 ; i++) {
  67 + String baseUrl = "https://www.sciengine.com/SciSearch/searchNew";
  68 + map.put("queryField_a", "battery");
  69 + map.put("pageCount",10);
  70 + map.put("curpage",i);
  71 + httpRequestBody = HttpRequestBody.form(map, "UTF-8");
  72 + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
  73 + .addHeader("Content-Type", "application/x-www-form-urlencoded")
  74 + .addHeader("Connection", "keep-alive")
  75 + .addHeader("Cookie","_ga=GA1.1.12362349.1718065158; SHAREJSESSIONID=35fad62b-37db-455a-af7b-9e9eaac5e5bf; Hm_lvt_633c662645ea15827301cdfaf39e48a1=1718171741; retrievalHistory=%5B%7B%22title%22%3A%22battery%22%7D%5D; Hm_lpvt_633c662645ea15827301cdfaf39e48a1=1718172306; _ga_SB5SCK5F77=GS1.1.1718170247.7.1.1718172335.0.0.0")
  76 + .addHeader("Host","www.sciengine.com")
  77 + .addHeader("Accept-Encoding", "gzip, deflate, br")
  78 + .addHeader("Accept", "*/*")
  79 + .addHeader("Origin","https://www.sciengine.com")
  80 + .addHeader("Referer","https://www.sciengine.com/plat/search?queryField_a=battery")
  81 + .setCharset("UTF-8");
  82 + request.setRequestBody(httpRequestBody);
  83 + page.addTargetRequest(request);
  84 + }
  85 + }
  86 +
  87 + void everyPage(Page page) {
  88 + String rawText = page.getRawText();
  89 + List<String> BaseIdList = JsonPath.read(rawText, "$.list[*].id");
  90 + List<String> doiList = JsonPath.read(rawText, "$.list[*].doi");
  91 + if (BaseIdList.size() == doiList.size()) {
  92 + for (int i = 0; i < BaseIdList.size(); i++) {
  93 + String baseId = BaseIdList.get(i);
  94 + String doi = doiList.get(i);
  95 + log.info("baseId:"+baseId+",doi:"+doi);
  96 + page.addTargetRequest("https://www.sciengine.com/restData/initArticle?doi="+doi+"&articleBaseId="+baseId);
  97 + }
  98 + } else {
  99 + throw new RuntimeException("匹配不成功");
  100 + }
  101 + }
  102 +
  103 + void getPageDetail(Page page) throws JsonProcessingException {
  104 + String rawText = page.getRawText();
  105 + //文章链接
  106 + String articleCode = page.getUrl().get();
  107 +
  108 + //文章标题
  109 + String title = JsonPath.read(rawText, "$.article.title");
  110 +
  111 + //文章内容
  112 + String articleDesc = JsonPath.read(rawText, "$.article.intro");
  113 +
  114 + //时间
  115 + Date publishTimeDateTime = null;
  116 + String publishTime = JsonPath.read(rawText, "$.article.pubDateStr");
  117 + SimpleDateFormat formatter = new SimpleDateFormat("MMMM dd,yyyy", Locale.ENGLISH);
  118 + try {
  119 + publishTimeDateTime = formatter.parse(publishTime);
  120 + } catch (ParseException e) {
  121 + e.printStackTrace();
  122 + }
  123 +
  124 + //作者名字
  125 + List<String> authors = JsonPath.read(rawText, "$.authorList[*].fullName");
  126 + StringBuffer authorName = new StringBuffer();
  127 + authors.forEach(authorName::append);
  128 +
  129 +
  130 + //邮箱和地址
  131 + JSONArray authorMail = new JSONArray();
  132 + ObjectMapper objectMapper = new ObjectMapper();
  133 + net.minidev.json.JSONArray authorJsonArray = JsonPath.read(rawText, "$.authorList");
  134 + JSONArray fastJsonArray = JSON.parseArray(authorJsonArray.toJSONString());
  135 + SciengineAuthorDo[] authorList = objectMapper.readValue(fastJsonArray.toJSONString(), SciengineAuthorDo[].class);
  136 +
  137 + JSONArray authorAddress = new JSONArray();
  138 + boolean isNotAddress = true;
  139 + for (SciengineAuthorDo author : authorList) {
  140 + JSONObject mailObj = new JSONObject();
  141 + if (author.getAuthorNoteList() != null) {
  142 + mailObj.put("authorEmailName", author.getFullName());
  143 + mailObj.put("email", author.getAuthorNoteList().get(0).getEmail());
  144 + } else {
  145 + }
  146 + if (!mailObj.isEmpty()){
  147 + authorMail.add(mailObj);
  148 + }
  149 + JSONObject addressObj = new JSONObject();
  150 + if (author.getAffsList() != null) {
  151 + addressObj.put("address", author.getAffsList().get(0).getAffText());
  152 + addressObj.put("authorNames", author.getFullName());
  153 + isNotAddress = false;
  154 + authorAddress.add(addressObj);
  155 + } else {
  156 + }
  157 + }
  158 +
  159 + ObjectMapper affMapper = new ObjectMapper();
  160 + net.minidev.json.JSONArray affList = JsonPath.read(rawText, "$.affList");
  161 + JSONArray affListJsonArray = JSON.parseArray(affList.toJSONString());
  162 + SciengineAffsListDo[] affListArray = affMapper.readValue(affListJsonArray.toJSONString(), SciengineAffsListDo[].class);
  163 + if (isNotAddress && CollectionUtils.isNotEmpty(Arrays.asList(affListArray))) {
  164 + JSONObject addressObj = new JSONObject();
  165 + addressObj.put("authorNames", authorName);
  166 + List addressList = new ArrayList();
  167 + for (SciengineAffsListDo sciengineAffsListDo : affListArray) {
  168 + if (sciengineAffsListDo.getAffText() != null) {
  169 + addressList.add(sciengineAffsListDo.getAffText());
  170 + }
  171 + }
  172 + addressObj.put("address", addressList);
  173 + authorAddress.add(addressObj);
  174 + }
  175 +
  176 + //引用文献
  177 + JSONArray references = new JSONArray();
  178 + ObjectMapper referenceMapper = new ObjectMapper();
  179 + net.minidev.json.JSONArray refListJsonArray = JsonPath.read(rawText, "$.article.referenceList");
  180 + JSONArray referenceListJsonArray = JSON.parseArray(refListJsonArray.toJSONString());
  181 + SciengineReferenceListDo[] referenceList = referenceMapper.readValue(referenceListJsonArray.toJSONString(), SciengineReferenceListDo[].class);
  182 + for (SciengineReferenceListDo sciengineReferenceListDo : referenceList) {
  183 + StringBuffer referenceTitle = new StringBuffer();
  184 + JSONObject referencesObj = new JSONObject();
  185 + ArrayList<Object> herfList = new ArrayList<>();
  186 + herfList.add(sciengineReferenceListDo.getTitle());
  187 + referenceTitle.append("https://www.sciengine.com/JAS/doi/" + sciengineReferenceListDo.getDoi());
  188 + referencesObj.put("links", herfList);
  189 + referencesObj.put("referenceTitle", referenceTitle);
  190 + references.add(referencesObj);
  191 + }
  192 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString());
  193 + page.putField("article", ArticleDO.builder()
  194 + .articleType(ArticleTypeEnum.Sciengine.getType())
  195 + .articleCode(articleCode)
  196 + .authorName(authorName.toString())
  197 + .title(title)
  198 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  199 + .emailInfo(authorMail.toJSONString())
  200 + .articleDesc(articleDesc)
  201 + .authorAddress(authorAddress.toJSONString())
  202 + .referenceInfo(references.toJSONString()).build());
  203 + }
  204 +}
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... ... @@ -17,6 +17,8 @@ import us.codecraft.webmagic.selector.PlainText;
17 17  
18 18 import javax.annotation.Resource;
19 19 import java.util.Map;
  20 +import java.util.regex.Matcher;
  21 +import java.util.regex.Pattern;
20 22  
21 23 /**
22 24 * @author: xms
... ... @@ -43,6 +45,13 @@ public class SeleniumDownloader extends AbstractDownloader {
43 45 return this;
44 46 }
45 47  
  48 + public static boolean checkUrl(String url) {
  49 + String pattern = "https://id.elsevier.com/as/[a-zA-Z0-9]+/resume/as/";
  50 + Pattern r = Pattern.compile(pattern);
  51 + Matcher m = r.matcher(url);
  52 + return m.find();
  53 + }
  54 +
46 55 @Override
47 56 public Page download(Request request, Task task) {
48 57 Page page = Page.fail();
... ... @@ -78,7 +87,26 @@ public class SeleniumDownloader extends AbstractDownloader {
78 87 // cloudFlare.click();
79 88 // }
80 89 }
81   - if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){
  90 + if (request.getUrl().equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) {
  91 + WebDriverWait wait = new WebDriverWait(webDriver, 60);
  92 + Boolean until = wait.until(ExpectedConditions.urlContains("https://id.elsevier.com/"));
  93 + if (until) {
  94 + log.info(webDriver.getCurrentUrl());
  95 +// if (checkUrl(webDriver.getCurrentUrl())) {
  96 + String currentUrl = webDriver.getCurrentUrl();
  97 + log.info("currentUrl=" + currentUrl);
  98 + String pageSource = webDriver.getPageSource();
  99 + log.info(pageSource);
  100 +// WebElement element = webDriver.findElement(By.xpath("//div[@class='form-row']/from/div[@id='jsEnabled']/input"));
  101 + webDriver.findElement(By.xpath("//a[@class='ot-sdk-show-settings cookie anchor-text']")).click();
  102 + webDriver.findElement(By.xpath("//input[@id='bdd-email']")).getAttribute("1187551704@qq.com");
  103 + webDriver.findElement(By.xpath("//button[@id='bdd-els-searchBtn']")).submit();
  104 +// String text = element.getText();
  105 +// log.info(text);
  106 +// }
  107 + } else {
  108 + log.info("跳转失败");
  109 + }
82 110  
83 111 }
84 112 try {
... ... @@ -116,7 +144,7 @@ public class SeleniumDownloader extends AbstractDownloader {
116 144 } finally {
117 145 if (webDriver != null) {
118 146 webDriver.close();
119   -// webDriver.quit();
  147 + webDriver.quit();
120 148 webDriver = null;
121 149 }
122 150 }
... ...
src/main/java/com/canrd/webmagic/util/FieldUtil.java 0 → 100644
  1 +package com.canrd.webmagic.util;
  2 +
  3 +import com.alibaba.fastjson.JSON;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import org.springframework.util.StringUtils;
  6 +
  7 +public class FieldUtil {
  8 + public static Boolean isExistField(String field, Object obj) {
  9 + if (obj == null || StringUtils.isEmpty(field)) {
  10 + return null;
  11 + }
  12 + Object o = JSON.toJSON(obj);
  13 + JSONObject jsonObj = new JSONObject();
  14 + if (o instanceof JSONObject) {
  15 + jsonObj = (JSONObject) o;
  16 + }
  17 + return jsonObj.containsKey(field);
  18 + }
  19 +}
... ...
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/ScienceDirectController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/SciengineController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAffsListDo.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorDo.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineAuthorNoteDo.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/dto/SciengineReferenceListDo.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectArticlePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienceDirectSearchPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ScienginePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/util/FieldUtil.class 0 → 100644
No preview for this file type