Commit 432ddd722c077bdb191dad3449658f81be91780f

Authored by PurelzMgnead
1 parent 9f5b8d9b

添加其他网站的爬取方法

Too many changes to show.

To preserve performance only 16 of 21 files are displayed.

.idea/inspectionProfiles/Project_Default.xml
1 1 <component name="InspectionProjectProfileManager">
2 2 <profile version="1.0">
3 3 <option name="myName" value="Project Default" />
  4 + <inspection_tool class="AutoCloseableResource" enabled="true" level="WARNING" enabled_by_default="true">
  5 + <option name="METHOD_MATCHER_CONFIG" value="java.util.Formatter,format,java.io.Writer,append,com.google.common.base.Preconditions,checkNotNull,org.hibernate.Session,close,java.io.PrintWriter,printf,java.io.PrintStream,printf,okhttp3.Call,execute" />
  6 + </inspection_tool>
4 7 <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" />
5 8 <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" />
6 9 </profile>
... ...
... ... @@ -77,6 +77,12 @@
77 77 <version>2.1.8</version>
78 78 </dependency>
79 79  
  80 + <dependency>
  81 + <groupId>com.squareup.okhttp3</groupId>
  82 + <artifactId>okhttp</artifactId>
  83 + <version>3.8.1</version>
  84 + </dependency>
  85 +
80 86 <!-- webmagic核心库 -->
81 87 <dependency>
82 88 <groupId>us.codecraft</groupId>
... ...
src/main/java/com/canrd/webmagic/controller/ChemicalController.java
1 1 package com.canrd.webmagic.controller;
2 2  
3 3  
  4 +import com.canrd.webmagic.common.constant.ServerResult;
  5 +import com.canrd.webmagic.processor.ChemicalPcoessor;
  6 +import com.canrd.webmagic.processor.ChemicalsciencePcoessor;
  7 +import com.canrd.webmagic.processor.MatterPagePcoessor;
  8 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  9 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
4 10 import io.swagger.annotations.Api;
  11 +import io.swagger.annotations.ApiOperation;
  12 +import org.apache.logging.log4j.core.util.UuidUtil;
  13 +import org.springframework.web.bind.annotation.GetMapping;
5 14 import org.springframework.web.bind.annotation.RequestMapping;
6 15 import org.springframework.web.bind.annotation.RestController;
  16 +import us.codecraft.webmagic.Request;
  17 +import us.codecraft.webmagic.Spider;
  18 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  19 +
  20 +import javax.annotation.Resource;
7 21  
8 22 @RestController
9   -@RequestMapping("/nature/article")
10   -@Api("Nature")
  23 +@RequestMapping("/chemical/article")
  24 +@Api("Chemical")
11 25 public class ChemicalController {
  26 + @Resource
  27 + private ChemicalPcoessor chemicalPcoessor;
  28 +
  29 + @Resource
  30 + private SeleniumDownloader seleniumDownloader;
  31 +
  32 + @Resource
  33 + private ArticlePipeline articlePipeline;
12 34  
  35 + @GetMapping("/start")
  36 + @ApiOperation("start")
  37 + public ServerResult start() {
  38 + Spider.create(chemicalPcoessor)
  39 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  40 + .addRequest(new Request("https://pubs.rsc.org/en/results?searchtext=battery"))
  41 +// .setScheduler(new RedisScheduler("127.0.0.1"))
  42 + .addPipeline(articlePipeline)
  43 +// .setDownloader(seleniumDownloader)
  44 + .thread(1).run();
  45 + return ServerResult.success();
  46 + }
13 47 }
... ...
src/main/java/com/canrd/webmagic/controller/MatterController.java
... ... @@ -26,13 +26,14 @@ public class MatterController {
26 26 @GetMapping("/start")
27 27 @ApiOperation("start")
28 28 public ServerResult start() {
  29 +
29 30 Spider.create(matterPragePcoessor)
30 31 // 添加这个Spider要爬取的网页地址
31 32 .addUrl("https://www.cell.com/matter/home")
32 33 .setUUID(UuidUtil.getTimeBasedUuid().toString())
33 34 .setDownloader(seleniumDownloader)
34 35 // 开启5个线程执行,并开始爬取
35   - .thread(5).run();
  36 + .thread(60).start();
36 37 return ServerResult.success();
37 38 }
38 39 }
... ...
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... ... @@ -25,6 +25,7 @@ public enum ArticleTypeEnum {
25 25 NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"),
26 26 NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"),
27 27 NATURE_METHODS("nature-methods","nuture网站-methods"),
  28 + Chemical("chemical","chemical网站")
28 29 ;
29 30 private String type;
30 31 private String desc;
... ...
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java
... ... @@ -4,10 +4,14 @@ import us.codecraft.webmagic.Page;
4 4 import us.codecraft.webmagic.Site;
5 5 import us.codecraft.webmagic.processor.PageProcessor;
6 6  
  7 +import java.util.ArrayList;
  8 +import java.util.List;
  9 +
7 10 public class AdvancedEnergyMaterialPcoessor implements PageProcessor {
8 11 @Override
9 12 public void process(Page page) {
10   -
  13 + String url = page.getUrl().get();
  14 + if (url.equals("https://techxplore.com/journals/advanced-energy-materials/")){}
11 15 }
12 16  
13 17 @Override
... ... @@ -15,5 +19,5 @@ public class AdvancedEnergyMaterialPcoessor implements PageProcessor {
15 19 return PageProcessor.super.getSite();
16 20 }
17 21  
18   -
  22 +
19 23 }
... ...
src/main/java/com/canrd/webmagic/processor/ChemicalPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import com.google.common.net.MediaType;
  12 +import lombok.extern.slf4j.Slf4j;
  13 +import okhttp3.MultipartBody;
  14 +import okhttp3.OkHttpClient;
  15 +import okhttp3.Response;
  16 +import okhttp3.ResponseBody;
  17 +import org.springframework.stereotype.Component;
  18 +import org.springframework.web.bind.annotation.RequestBody;
  19 +import us.codecraft.webmagic.Page;
  20 +import us.codecraft.webmagic.Request;
  21 +import us.codecraft.webmagic.Site;
  22 +import us.codecraft.webmagic.Spider;
  23 +import us.codecraft.webmagic.model.HttpRequestBody;
  24 +import us.codecraft.webmagic.processor.PageProcessor;
  25 +import us.codecraft.webmagic.selector.Html;
  26 +import us.codecraft.webmagic.selector.Selectable;
  27 +import us.codecraft.webmagic.utils.HttpConstant;
  28 +
  29 +import java.io.IOException;
  30 +import java.text.ParseException;
  31 +import java.text.SimpleDateFormat;
  32 +import java.util.*;
  33 +import java.util.concurrent.ConcurrentHashMap;
  34 +import java.util.concurrent.atomic.AtomicInteger;
  35 +import java.util.stream.Collectors;
  36 +
  37 +@Slf4j
  38 +@Component
  39 +public class ChemicalPcoessor implements PageProcessor {
  40 + private final Site request = Site.me().setTimeOut(300000);
  41 +
  42 + private int index;
  43 + private String substring;
  44 +
  45 + private HttpRequestBody httpRequestBody;
  46 +
  47 + private ConcurrentHashMap<String, Object> map = new ConcurrentHashMap<>();
  48 +
  49 + @Override
  50 + public void process(Page page) {
  51 +// synchronized (this) {
  52 + String url = page.getUrl().get();
  53 + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) {
  54 + try {
  55 + getMaxPages(page);
  56 + } catch (IOException e) {
  57 + throw new RuntimeException(e);
  58 + }
  59 +// }else if (url.equals("https://pubs.rsc.org/en/search/journalresult")&&index==0) {
  60 +// getResultMax(page);
  61 + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) {
  62 + eveyPage(page);
  63 + } else if (url.contains("https://pubs.rsc.org/en/content")) {
  64 + getDetil(page);
  65 + }
  66 +// }
  67 + }
  68 +
  69 + void getMaxPages(Page page) throws IOException {
  70 + index = 0;
  71 + Html html = page.getHtml();
  72 + String script = html.css("script").regex(".*var searchResultCounts = (.*?);").get();
  73 +// log.info(script);
  74 +// String replace = script.replace("[", "");
  75 +// String replace1 = replace.replace("]", "");
  76 +// String[] split = replace1.split(",");
  77 +// int i1 = split[1].indexOf(":");
  78 +// int i2 = split[1].indexOf("}");
  79 +// substring = split[1].substring(i1 + 1, i1);
  80 +// if (!StringUtils.isEmpty(substring)) {
  81 + for (int i = 1; i <= 2118; i++) {
  82 + String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
  83 + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
  84 + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
  85 + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
  86 + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
  87 + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
  88 + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
  89 + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
  90 + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
  91 + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
  92 + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
  93 + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
  94 + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
  95 + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
  96 + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
  97 + "o8L1NlYXJjaFRlcm0+");
  98 + map.put("resultcount", 52942);
  99 + map.put("pageno", i);
  100 + httpRequestBody = HttpRequestBody.form(map, "UTF-8");
  101 + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
  102 + .addHeader("Content-Type", "application/x-www-form-urlencoded")
  103 + .addHeader("Connection", "keep-alive")
  104 + .addHeader("Host", "pubs.rsc.org")
  105 + .addHeader("Accept-Encoding", "gzip, deflate, br")
  106 + .addHeader("Accept", "*/*")
  107 + .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true")
  108 + .setCharset("UTF-8");
  109 +// log.info(map.toString());
  110 + request.setRequestBody(httpRequestBody);
  111 + page.addTargetRequest(request);
  112 +
  113 +// OkHttpClient client = new OkHttpClient().newBuilder()
  114 +// .build();
  115 +// MediaType mediaType = MediaType.parse("text/plain");
  116 +// for (int i = 1; i <= 3; i++) {
  117 +// MultipartBody body = new MultipartBody.Builder().setType(MultipartBody.FORM)
  118 +// .addFormDataPart("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bWxuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3RhbmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBlPg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbWU+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogIDxGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQo8L1NlYXJjaFRlcm0+")
  119 +// .addFormDataPart("resultcount", "25")
  120 +//// .addFormDataPart("category", "journal")
  121 +// .addFormDataPart("pageno", String.valueOf(i))
  122 +// .build();
  123 +// okhttp3.Request okrequest = new okhttp3.Request.Builder()
  124 +// .url("https://pubs.rsc.org/en/search/journalresult")
  125 +// .method("POST", body)
  126 +// .addHeader("Cookie", "X-Mapping-hhmaobcf=3C9897E0250B6117CDBA3EC8F724E9A9;_PubsBFCleared=1;ASP.NET_SessionId=bang00oiyj1p1v45jkuk2cgt;ShowEUCookieLawBanner=true")
  127 +// .build();
  128 +// Response response = client.newCall(okrequest).execute();
  129 +// String responseBody = response.body().string();
  130 +// log.info(String.valueOf(responseBody));
  131 + }
  132 + }
  133 +// }else{
  134 +// throw new RuntimeException();
  135 +// }
  136 +// }
  137 +
  138 + void getResultMax(Page page) {
  139 + index = 1;
  140 +// .xpath("//span[@class='paging--label']/text()")
  141 + page.putField("html", page.getHtml());
  142 + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get();
  143 + int total = Integer.parseInt(substring);
  144 + int everReq = Integer.parseInt(ever.split(" ")[0]);
  145 + int pageNo = total / everReq;
  146 + if ((pageNo * everReq) % total < everReq && (pageNo * everReq) % total == 0) {
  147 + pageNo = pageNo + 1;
  148 + }
  149 + log.info(String.valueOf(pageNo));
  150 +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get();
  151 +// String[] pageTotal = s.split(" ");
  152 +// int now = Integer.parseInt(pageTotal[4]);
  153 +// int total = Integer.parseInt(pageTotal[6]);
  154 + for (int i = 1; i <= total; i++) {
  155 + String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
  156 + Map<String, Object> map = new HashMap<>();
  157 + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
  158 + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
  159 + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
  160 + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
  161 + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
  162 + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
  163 + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
  164 + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
  165 + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
  166 + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
  167 + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
  168 + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
  169 + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
  170 + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
  171 + "o8L1NlYXJjaFRlcm0+");
  172 + map.put("resultcount", 25);
  173 + map.put("pageno", i);
  174 + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8");
  175 + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
  176 + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
  177 + .addHeader("Connection", "keep-alive")
  178 + .addHeader("Host", "pubs.rsc.org")
  179 + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
  180 + .addHeader("Accept", "text/html, */*; q=0.01")
  181 + .setCharset("UTF-8");
  182 + request.setRequestBody(httpRequestBody);
  183 + page.addTargetRequest(request);
  184 + }
  185 + }
  186 +
  187 + void eveyPage(Page page) {
  188 +// <span class="paging--label"> - Showing page 2 of 4000</span>
  189 +// log.info(page.getHtml().get());
  190 + List<String> hrefList = page.getHtml().xpath("//a[@class='capsule__action']/@href").all();
  191 + for (String herf : hrefList) {
  192 + log.info("https://pubs.rsc.org" + herf);
  193 + String url = "https://pubs.rsc.org" + herf;
  194 + page.addTargetRequest(url);
  195 + }
  196 + }
  197 +
  198 + void getDetil(Page page) {
  199 + Html html = page.getHtml();
  200 + //文章链接
  201 + String articleCode = page.getUrl().get();
  202 + //文章标题
  203 + String title = html.xpath("//div[@class='article__title']/h2/font/text()").get();
  204 + if (StringUtils.isBlank(title)) {
  205 + title = html.xpath("//div[@class='article__title']/h2/text()").get();
  206 + }
  207 +
  208 + //文章内容
  209 + String articleDesc = html.xpath("//div[@class='capsule__text']/p/text()").get();
  210 +
  211 + //时间
  212 + String publishTime;
  213 + Date publishTimeDateTime = null;
  214 + List<String> all = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__10']").all();
  215 + AtomicInteger timeIndex = new AtomicInteger(0);
  216 + all.stream().filter(s -> {
  217 + timeIndex.getAndIncrement();
  218 + return s.equals("test2");
  219 + }).findFirst();
  220 + publishTime = html.xpath("//div[@class='c fixpadt--l']/dd[@class='c__14']/text()").all().get(timeIndex.get());
  221 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  222 + try {
  223 + publishTimeDateTime = formatter.parse(publishTime);
  224 + } catch (ParseException e) {
  225 + e.printStackTrace();
  226 + }
  227 +
  228 + //作者名字
  229 + List<String> authorList = html.xpath("//div[@class='article__authors']/span[@class='article__author-link']/a/text()").all();
  230 + List<String> collect = authorList.stream().filter(a -> !a.equals("†")).collect(Collectors.toList());
  231 +
  232 +// collect.forEach(log::info);
  233 + StringBuffer stringBuffer = new StringBuffer();
  234 + collect.forEach(stringBuffer::append);
  235 +
  236 + StringBuffer authorName = new StringBuffer();
  237 + authorList.forEach(authorName::append);
  238 + HashMap<Object, Object> authorHashMap = new HashMap<>();
  239 + List<String> cupList = html.xpath("//div[@class='article__authors']/span/span/sup/i/text()").all();
  240 + if (collect.size() == cupList.size()) {
  241 + for (int i = 0; i < collect.size(); i++) {
  242 + authorHashMap.put(collect.get(i), cupList.get(i));
  243 + }
  244 + }
  245 +// 单位地址 邮箱
  246 + JSONArray authorAddress = new JSONArray();
  247 + JSONArray authorMail = new JSONArray();
  248 + List<Selectable> addressMailList = html.xpath("//p[@class='article__author-affiliation']").nodes();
  249 + HashMap<Object, Object> addressMap = new HashMap<>();
  250 + HashMap<Object, Object> mailMap = new HashMap<>();
  251 + if (CollectionUtils.isNotEmpty(addressMailList)) {
  252 + for (Selectable selectable : addressMailList) {
  253 + List<Selectable> nodes = selectable.xpath("//span").nodes();
  254 + if (CollectionUtils.isNotEmpty(nodes)) {
  255 + Selectable keyXpath = nodes.get(0);
  256 + Selectable valueXpath = nodes.get(1);
  257 + String key = keyXpath.xpath("//sup/text()").get();
  258 + String address = valueXpath.xpath("//span/text()").get();
  259 + String mail = valueXpath.xpath("//span/a/text()").get();
  260 + if (!StringUtils.isBlank(key)) {
  261 + if (!StringUtils.isBlank(address)) {
  262 + addressMap.put(key, address);
  263 + } else {
  264 + addressMap.put(key, null);
  265 + }
  266 + if (!StringUtils.isBlank(mail)) {
  267 + mailMap.put(key, mail);
  268 + } else {
  269 + mailMap.put(key, null);
  270 + }
  271 + } else {
  272 + if (!StringUtils.isBlank(address)) {
  273 + addressMap.put("*", address);
  274 + } else {
  275 + addressMap.put("*", null);
  276 + }
  277 + if (!StringUtils.isBlank(mail)) {
  278 + mailMap.put("*", mail);
  279 + } else {
  280 + mailMap.put("*", null);
  281 + }
  282 + }
  283 + }
  284 + }
  285 + Object[] objects = authorHashMap.keySet().stream().toArray();
  286 +// log.info(Arrays.toString(objects));
  287 + for (int i = 0; i < objects.length; i++) {
  288 + JSONObject addressObj = new JSONObject();
  289 + JSONObject mailObj = new JSONObject();
  290 + Object point = authorHashMap.get(String.valueOf(objects[i]));
  291 + Object address = addressMap.get(point);
  292 + Object mail = mailMap.get(point);
  293 + addressObj.put("address", address);
  294 + addressObj.put("authorNames", objects[i]);
  295 + mailObj.put("authorEmailName", objects[i]);
  296 + mailObj.put("email", mail);
  297 +
  298 + authorAddress.add(addressObj);
  299 + authorMail.add(mailObj);
  300 + }
  301 + }
  302 +
  303 + JSONArray references = new JSONArray();
  304 + List<Selectable> referenceNodeList = html.xpath("//div[@class='ref-list']/ol/li").nodes();
  305 + if (CollectionUtils.isNotEmpty(referenceNodeList)) {
  306 + for (Selectable reference : referenceNodeList) {
  307 + List<String> herfOpList = reference.xpath("//a/@href").all();
  308 + ArrayList<Object> herfList = new ArrayList<>();
  309 + for (String herf : herfOpList) {
  310 + if (herf.startsWith("https") || herf.startsWith("http")) {
  311 + String before = "https://pubs.rsc.org";
  312 + herfList.add(before + herf);
  313 + } else {
  314 + herfList.add(herf);
  315 + }
  316 + }
  317 + StringBuffer referenceTitle = new StringBuffer();
  318 + List<String> spanList = reference.xpath("//span/text()").all();
  319 + if (CollectionUtils.isNotEmpty(spanList)) {
  320 + for (int i = 0; i < spanList.size(); i++) {
  321 + referenceTitle.append(spanList.get(i));
  322 + if (i < spanList.size() - 1) {
  323 + referenceTitle.append(",");
  324 + }
  325 + }
  326 + }
  327 + String em = reference.xpath("//em/text()").get();
  328 + String strong = reference.xpath("//strong/text()").get();
  329 + referenceTitle.append(em);
  330 + referenceTitle.append(strong);
  331 + JSONObject referencesObj = new JSONObject();
  332 + referencesObj.put("links", herfList);
  333 + referencesObj.put("referenceTitle", referenceTitle);
  334 + references.add(referencesObj);
  335 + }
  336 + }
  337 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorMail.toJSONString());
  338 + page.putField("article", ArticleDO.builder()
  339 + .articleType(ArticleTypeEnum.Chemical.getType())
  340 + .articleCode(articleCode)
  341 + .authorName(authorName.toString())
  342 + .title(title)
  343 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  344 + .emailInfo(authorMail.toJSONString())
  345 + .articleDesc(articleDesc)
  346 + .authorAddress(authorAddress.toJSONString())
  347 + .referenceInfo(references.toJSONString()).build());
  348 + }
  349 +
  350 + @Override
  351 + public Site getSite() {
  352 + return request;
  353 + }
  354 +
  355 + public static void main(String[] args) {
  356 + Spider.create(new ChemicalPcoessor())
  357 + .addUrl("https://pubs.rsc.org/en/search/journalresult")
  358 + .addPipeline(new ArticlePipeline())
  359 + .thread(5).run();
  360 + }
  361 +}
... ...
src/main/java/com/canrd/webmagic/processor/ChemicalsciencePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.canrd.webmagic.common.utils.StringUtils;
  4 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  5 +import lombok.extern.slf4j.Slf4j;
  6 +import org.apache.logging.log4j.core.util.UuidUtil;
  7 +import org.springframework.beans.factory.annotation.Autowired;
  8 +import org.springframework.stereotype.Component;
  9 +import us.codecraft.webmagic.Page;
  10 +import us.codecraft.webmagic.Request;
  11 +import us.codecraft.webmagic.Site;
  12 +import us.codecraft.webmagic.Spider;
  13 +import us.codecraft.webmagic.model.HttpRequestBody;
  14 +import us.codecraft.webmagic.processor.PageProcessor;
  15 +import us.codecraft.webmagic.selector.*;
  16 +import us.codecraft.webmagic.utils.HttpConstant;
  17 +
  18 +import java.util.*;
  19 +
  20 +@Slf4j
  21 +@Component
  22 +public class ChemicalsciencePcoessor implements PageProcessor {
  23 +
  24 + @Autowired
  25 + private ChemicalPcoessor journalResultPcoessor;
  26 + private final Site base = Site.me().setDomain("base").setTimeOut(200000);
  27 +
  28 + @Override
  29 + public Site getSite() {
  30 + return base;
  31 + }
  32 +
  33 + private String substring;
  34 +
  35 + @Override
  36 + public void process(Page page) {
  37 + String url = page.getUrl().get();
  38 + if (url.equals("https://pubs.rsc.org/en/results?searchtext=battery")) {
  39 + getMaxPages(page);
  40 + } else if (url.equals("https://pubs.rsc.org/en/search/journalresult")) {
  41 + getResultMax(page);
  42 + }
  43 + }
  44 +
  45 + void getMaxPages(Page page) {
  46 + Html html = page.getHtml();
  47 + String script = page.getHtml().css("script").regex(".*var searchResultCounts = (.*?);").get();
  48 + String replace = script.replace("[", "");
  49 + String replace1 = replace.replace("]", "");
  50 + String[] split = replace1.split(",");
  51 + int i = split[1].indexOf(":");
  52 + int i1 = split[1].indexOf("}");
  53 + substring = split[1].substring(i + 1, i1);
  54 + if (!StringUtils.isEmpty(substring)) {
  55 + String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
  56 + Map<String, Object> map = new HashMap<>();
  57 + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
  58 + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
  59 + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
  60 + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
  61 + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
  62 + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
  63 + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
  64 + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
  65 + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
  66 + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
  67 + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
  68 + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
  69 + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
  70 + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
  71 + "o8L1NlYXJjaFRlcm0+");
  72 + log.info(substring);
  73 + map.put("resultcount",25);
  74 + map.put("pageno",2);
  75 + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8");
  76 + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
  77 + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
  78 + .addHeader("Connection", "keep-alive")
  79 + .addHeader("Host", "pubs.rsc.org")
  80 + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
  81 + .addHeader("Accept", "text/html, */*; q=0.01")
  82 + .setCharset("UTF-8");
  83 + request.setRequestBody(httpRequestBody);
  84 + page.addTargetRequest(request);
  85 + } else {
  86 + throw new RuntimeException();
  87 + }
  88 + }
  89 +
  90 + void getResultMax(Page page) {
  91 +// .xpath("//span[@class='paging--label']/text()")
  92 + page.putField("html", page.getHtml());
  93 + String ever = page.getHtml().xpath("//div[@class='fixpadv--l']/strong/text()").get();
  94 + int total = Integer.parseInt(substring);
  95 + int everReq = Integer.parseInt(ever.split(" ")[0]);
  96 + int pageNo=total/everReq;
  97 + if ((pageNo*everReq)%total<everReq&&(pageNo*everReq)%total==0){
  98 + pageNo=pageNo+1;
  99 + }
  100 + log.info(String.valueOf(pageNo));
  101 +// String s = page.getHtml().xpath("//span[@class='paging--label']/text()").get();
  102 +// String[] pageTotal = s.split(" ");
  103 +// int now = Integer.parseInt(pageTotal[4]);
  104 +// int total = Integer.parseInt(pageTotal[6]);
  105 + for (int i = 1; i <= total; i++) {
  106 + String baseUrl = "https://pubs.rsc.org/en/search/journalresult";
  107 + Map<String, Object> map = new HashMap<>();
  108 + map.put("searchterm", "PD94bWwgdmVyc2lvbj0iMS4wIj8+DQo8U2VhcmNoVGVybSB4" +
  109 + "bWxuczp4c2Q9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hIiB4bW" +
  110 + "xuczp4c2k9Imh0dHA6Ly93d3cudzMub3JnLzIwMDEvWE1MU2NoZW1hLWluc3Rh" +
  111 + "bmNlIj4NCiAgPENhdGVnb3J5PmpvdXJuYWw8L0NhdGVnb3J5Pg0KICA8U3ViQ2" +
  112 + "F0ZWdvcnkgLz4NCiAgPENvbnRlbnRUeXBlPmpvdXJuYWw8L0NvbnRlbnRUeXBl" +
  113 + "Pg0KICA8Q3JpdGVyaWFzPg0KICAgIDxOYW1lVmFsdWU+DQogICAgICA8TmFtZT" +
  114 + "5mcmVldGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5PC9WYWx1ZT4N" +
  115 + "CiAgICA8L05hbWVWYWx1ZT4NCiAgICA8TmFtZVZhbHVlPg0KICAgICAgPE5hbW" +
  116 + "U+T3JpZ2luYWxGcmVlVGV4dDwvTmFtZT4NCiAgICAgIDxWYWx1ZT5iYXR0ZXJ5" +
  117 + "PC9WYWx1ZT4NCiAgICA8L05hbWVWYWx1ZT4NCiAgPC9Dcml0ZXJpYXM+DQogID" +
  118 + "xGYWNldHMgLz4NCiAgPFJlcXVlc3RUaW1lPjAwMDEtMDEtMDFUMDA6MDA6MDA8" +
  119 + "L1JlcXVlc3RUaW1lPg0KICA8QXV0aG9yQ3JpdGVyaWEgLz4NCiAgPFB1YmxpY2" +
  120 + "F0aW9uRGF0ZT4NCiAgICA8SXNTZWxlY3RlZERhdGU+ZmFsc2U8L0lzU2VsZWN0" +
  121 + "ZWREYXRlPg0KICA8L1B1YmxpY2F0aW9uRGF0ZT4NCiAgPEV4Y2x1ZGVzIC8+DQ" +
  122 + "o8L1NlYXJjaFRlcm0+");
  123 + map.put("resultcount",25);
  124 + map.put("pageno",i);
  125 +// map.put("pageno",1);
  126 + HttpRequestBody httpRequestBody = HttpRequestBody.form(map, "UTF-8");
  127 + Request request = new Request(baseUrl).setMethod(HttpConstant.Method.POST)
  128 + .addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
  129 + .addHeader("Connection", "keep-alive")
  130 + .addHeader("Host", "pubs.rsc.org")
  131 + .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
  132 + .addHeader("Accept", "text/html, */*; q=0.01")
  133 + .setCharset("UTF-8");
  134 + request.setRequestBody(httpRequestBody);
  135 +// page.addTargetRequest(request);
  136 + Spider.create(journalResultPcoessor)
  137 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  138 + .addRequest(request)
  139 + .addPipeline(new ArticlePipeline())
  140 + .thread(1).start();
  141 + }
  142 + }
  143 +}
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... ... @@ -71,6 +71,7 @@ public class SeleniumDownloader extends AbstractDownloader {
71 71 element.submit();
72 72 WebDriverWait wait = new WebDriverWait(webDriver, 30);
73 73 wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?"));
  74 +
74 75 // wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']")));
75 76 // WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input"));
76 77 // if (cloudFlare!=null){
... ...
target/classes/com/canrd/webmagic/controller/ChemicalController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalsciencePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type