Commit 9f5b8d9b5b36694b8e7f1b1c1cba1d4a742e8449

Authored by 凌世锦
1 parent 1680e181

提交

Showing 52 changed files with 1561 additions and 29 deletions

Too many changes to show.

To preserve performance only 52 of 55 files are displayed.

.idea/inspectionProfiles/Project_Default.xml
... ... @@ -2,5 +2,6 @@
2 2 <profile version="1.0">
3 3 <option name="myName" value="Project Default" />
4 4 <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" />
  5 + <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" />
5 6 </profile>
6 7 </component>
7 8 \ No newline at end of file
... ...
... ... @@ -70,6 +70,12 @@
70 70 <!-- <version>${browsermob.version}</version>-->
71 71 <!-- </dependency>-->
72 72  
  73 +<!-- DNS-->
  74 + <dependency>
  75 + <groupId>dnsjava</groupId>
  76 + <artifactId>dnsjava</artifactId>
  77 + <version>2.1.8</version>
  78 + </dependency>
73 79  
74 80 <!-- webmagic核心库 -->
75 81 <dependency>
... ...
src/main/java/com/canrd/webmagic/DNS/DnsResolver.java 0 → 100644
  1 +package com.canrd.webmagic.DNS;
  2 +
  3 +import org.xbill.DNS.*;
  4 +
  5 +public class DnsResolver {
  6 + public static String resolve(String domain) {
  7 + try {
  8 + Record[] records = new Lookup(domain, Type.A).run();
  9 + if (records != null && records.length > 0) {
  10 + ARecord aRecord = (ARecord) records[0];
  11 + return aRecord.getAddress().getHostAddress();
  12 + }
  13 + } catch (TextParseException e) {
  14 + e.printStackTrace();
  15 + }
  16 + return null;
  17 + }
  18 +}
0 19 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/controller/MatterController.java
1 1 package com.canrd.webmagic.controller;
2 2  
3 3 import com.canrd.webmagic.common.constant.ServerResult;
4   -import com.canrd.webmagic.processor.MatterPragePcoessor;
  4 +import com.canrd.webmagic.processor.MatterPagePcoessor;
5 5 import com.canrd.webmagic.processor.download.SeleniumDownloader;
6 6 import io.swagger.annotations.Api;
7 7 import io.swagger.annotations.ApiOperation;
... ... @@ -18,7 +18,7 @@ import javax.annotation.Resource;
18 18 @Api("Matter")
19 19 public class MatterController {
20 20 @Resource
21   - private MatterPragePcoessor matterPragePcoessor;
  21 + private MatterPagePcoessor matterPragePcoessor;
22 22  
23 23 @Resource
24 24 private SeleniumDownloader seleniumDownloader;
... ... @@ -26,11 +26,11 @@ public class MatterController {
26 26 @GetMapping("/start")
27 27 @ApiOperation("start")
28 28 public ServerResult start() {
29   - Spider.create(new MatterPragePcoessor())
  29 + Spider.create(matterPragePcoessor)
30 30 // 添加这个Spider要爬取的网页地址
31   - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  31 + .addUrl("https://www.cell.com/matter/home")
32 32 .setUUID(UuidUtil.getTimeBasedUuid().toString())
33   - .setDownloader(seleniumDownloader.setSleepTime(30000))
  33 + .setDownloader(seleniumDownloader)
34 34 // 开启5个线程执行,并开始爬取
35 35 .thread(5).run();
36 36 return ServerResult.success();
... ...
src/main/java/com/canrd/webmagic/controller/NatureCommunicatiosController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureCommunicatiosPcoessor;
  5 +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +@RestController
  18 +@RequestMapping("/nature-communicatios/article")
  19 +@Api("Nature")
  20 +public class NatureCommunicatiosController {
  21 + @Resource
  22 + private NatureCommunicatiosPcoessor natureCommunicatiosPcoessor;
  23 + @Resource
  24 + private ArticlePipeline articlePipeline;
  25 +
  26 + @GetMapping("/start")
  27 + @ApiOperation("start")
  28 + public ServerResult start() {
  29 + Spider.create(natureCommunicatiosPcoessor)
  30 + .addUrl("https://www.nature.com/ncomms/articles?type=editorial")
  31 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  32 + .addPipeline(articlePipeline)
  33 +// .setScheduler(new RedisScheduler("127.0.0.1"))
  34 + .thread(20).run();
  35 + return ServerResult.success();
  36 + }
  37 +}
... ...
src/main/java/com/canrd/webmagic/controller/NatureComputationalController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureComputationalPcoessor;
  5 +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/nature-computational/article")
  20 +@Api("Nature")
  21 +public class NatureComputationalController {
  22 + @Resource
  23 + private NatureComputationalPcoessor natureComputationalPcoessor;
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + @GetMapping("/start")
  28 + @ApiOperation("start")
  29 + public ServerResult start() {
  30 + Spider.create(natureComputationalPcoessor)
  31 + .addUrl("https://www.nature.com/natcomputsci")
  32 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  33 + .addPipeline(articlePipeline)
  34 + .setScheduler(new RedisScheduler("127.0.0.1"))
  35 + .thread(20).run();
  36 + return ServerResult.success();
  37 + }
  38 +}
... ...
src/main/java/com/canrd/webmagic/controller/NatureEnergyController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor;
  5 +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/nature-energy/article")
  20 +@Api("Nature")
  21 +public class NatureEnergyController {
  22 + @Resource
  23 + private NatureEnergyPagePcoessor natureEnergyPagePcoessor;
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + @GetMapping("/start")
  28 + @ApiOperation("start")
  29 + public ServerResult start() {
  30 + Spider.create(natureEnergyPagePcoessor)
  31 + .addUrl("https://www.nature.com/nenergy/research-articles")
  32 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  33 + .addPipeline(articlePipeline)
  34 + .setScheduler(new RedisScheduler("127.0.0.1"))
  35 + .thread(20).run();
  36 + return ServerResult.success();
  37 + }
  38 +}
... ...
src/main/java/com/canrd/webmagic/controller/NatureMaterialController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.MatterPagePcoessor;
  5 +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor;
  6 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  7 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  8 +import io.swagger.annotations.Api;
  9 +import io.swagger.annotations.ApiOperation;
  10 +import org.apache.logging.log4j.core.util.UuidUtil;
  11 +import org.springframework.web.bind.annotation.GetMapping;
  12 +import org.springframework.web.bind.annotation.RequestMapping;
  13 +import org.springframework.web.bind.annotation.RestController;
  14 +import us.codecraft.webmagic.Spider;
  15 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  16 +
  17 +import javax.annotation.Resource;
  18 +
  19 +@RestController
  20 +@RequestMapping("/nature-material/article")
  21 +@Api("Nature")
  22 +public class NatureMaterialController {
  23 + @Resource
  24 + private NatureMaterialPagePcoessor natureMaterialPagePcoessor;
  25 + @Resource
  26 + private ArticlePipeline articlePipeline;
  27 +
  28 + @GetMapping("/start")
  29 + @ApiOperation("start")
  30 + public ServerResult start() {
  31 + Spider.create(natureMaterialPagePcoessor)
  32 + // 添加这个Spider要爬取的网页地址
  33 + .addUrl("https://www.nature.com/nmat/articles")
  34 + .addUrl("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")
  35 + .addUrl("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")
  36 + .addUrl("https://www.nature.com/search?q=battery")
  37 + .addUrl("https://www.nature.com/nature/research-articles")
  38 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  39 + .addPipeline(articlePipeline)
  40 + .setScheduler(new RedisScheduler("127.0.0.1"))
  41 + // 开启5个线程执行,并开始爬取
  42 + .thread(60).run();
  43 + return ServerResult.success();
  44 + }
  45 +}
... ...
src/main/java/com/canrd/webmagic/controller/NatureMethodsController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureMethodsPcoessor;
  5 +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/nature-methods/article")
  20 +@Api("Nature")
  21 +public class NatureMethodsController {
  22 + @Resource
  23 + private NatureMethodsPcoessor natureMethodsPcoessor;
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + @GetMapping("/start")
  28 + @ApiOperation("start")
  29 + public ServerResult start() {
  30 + Spider.create(natureMethodsPcoessor)
  31 + // 添加这个Spider要爬取的网页地址
  32 + .addUrl("https://www.nature.com/nmeth/")
  33 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  34 + .addPipeline(articlePipeline)
  35 + .setScheduler(new RedisScheduler("127.0.0.1"))
  36 + // 开启5个线程执行,并开始爬取
  37 + .thread(20).run();
  38 + return ServerResult.success();
  39 + }
  40 +}
... ...
src/main/java/com/canrd/webmagic/controller/NatureNanotechnologyController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor;
  5 +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +
  18 +@RestController
  19 +@RequestMapping("/nature-nanotechnology/article")
  20 +@Api("Nature")
  21 +public class NatureNanotechnologyController {
  22 + @Resource
  23 + private NatureNanotechnologyProcessor natureNanotechnologyProcessor;
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + @GetMapping("/start")
  28 + @ApiOperation("start")
  29 + public ServerResult start() {
  30 + Spider.create(natureNanotechnologyProcessor)
  31 + // 添加这个Spider要爬取的网页地址
  32 + .addUrl("https://www.nature.com/nnano/")
  33 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  34 + .addPipeline(articlePipeline)
  35 + .setScheduler(new RedisScheduler("127.0.0.1"))
  36 + // 开启5个线程执行,并开始爬取
  37 + .thread(60).run();
  38 + return ServerResult.success();
  39 + }
  40 +}
... ...
src/main/java/com/canrd/webmagic/controller/NaturePhysicsController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor;
  5 +import com.canrd.webmagic.processor.NaturePhysicsProcessor;
  6 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  7 +import io.swagger.annotations.Api;
  8 +import io.swagger.annotations.ApiOperation;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
  10 +import org.springframework.web.bind.annotation.GetMapping;
  11 +import org.springframework.web.bind.annotation.RequestMapping;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +import us.codecraft.webmagic.scheduler.RedisScheduler;
  15 +
  16 +import javax.annotation.Resource;
  17 +@RestController
  18 +@RequestMapping("/nature-physics/article")
  19 +@Api("Nature")
  20 +public class NaturePhysicsController {
  21 + @Resource
  22 + private NaturePhysicsProcessor naturePhysicsProcessor;
  23 + @Resource
  24 + private ArticlePipeline articlePipeline;
  25 +
  26 + @GetMapping("/start")
  27 + @ApiOperation("start")
  28 + public ServerResult start() {
  29 + Spider.create(naturePhysicsProcessor)
  30 + .addUrl("https://www.nature.com/nphys/")
  31 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  32 + .addPipeline(articlePipeline)
  33 +// .setScheduler(new RedisScheduler("127.0.0.1"))
  34 + .thread(20).run();
  35 + return ServerResult.success();
  36 + }
  37 +}
0 38 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... ... @@ -18,6 +18,13 @@ public enum ArticleTypeEnum {
18 18 SCIENCE("science", "science网址"),
19 19 SCIENCE_SPJ("science-spj", "science网址-spj"),
20 20 UNIVIE_PHYSNANO("univie-physnano", "univie网址-physnano"),
  21 + NATURE_MATERIAL("nuture-material","nuture网站-material"),
  22 + NATURE_NANOTECHNOLOGY("nature-nanotechnology","nuture网站-nanotechnology"),
  23 + NATURE_PHYSICS("nature-physics","nuture网站-physics"),
  24 + NATURE_ENERGY("nature-energy","nuture网站-energy"),
  25 + NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"),
  26 + NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"),
  27 + NATURE_METHODS("nature-methods","nuture网站-methods"),
21 28 ;
22 29 private String type;
23 30 private String desc;
... ...
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import us.codecraft.webmagic.Page;
  4 +import us.codecraft.webmagic.Site;
  5 +import us.codecraft.webmagic.processor.PageProcessor;
  6 +
  7 +public class AdvancedEnergyMaterialPcoessor implements PageProcessor {
  8 + @Override
  9 + public void process(Page page) {
  10 +
  11 + }
  12 +
  13 + @Override
  14 + public Site getSite() {
  15 + return PageProcessor.super.getSite();
  16 + }
  17 +
  18 +
  19 +}
... ...
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 1 package com.canrd.webmagic.processor;
2 2  
  3 +import com.canrd.webmagic.DNS.DnsResolver;
3 4 import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  5 +import com.gargoylesoftware.htmlunit.AbstractPage;
4 6 import us.codecraft.webmagic.Page;
  7 +import us.codecraft.webmagic.Request;
5 8 import us.codecraft.webmagic.Site;
6 9 import us.codecraft.webmagic.Spider;
7 10 import us.codecraft.webmagic.processor.PageProcessor;
8 11 import us.codecraft.webmagic.selector.Html;
  12 +import us.codecraft.webmagic.selector.PlainText;
9 13  
10 14 public class ChemicalPagePcoessor implements PageProcessor {
11   - private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
  15 +
  16 + private Request request;
  17 + String domain = new PlainText(request.getUrl()).regex("//(.*?)/").get();
  18 +
  19 + String ip = DnsResolver.resolve(domain);
  20 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setDomain(domain).addCookie("ip", ip);;
12 21 @Override
13 22 public void process(Page page) {
14 23  
... ...
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java renamed to src/main/java/com/canrd/webmagic/processor/MatterPagePcoessor.java
1 1 package com.canrd.webmagic.processor;
2 2  
  3 +import com.canrd.webmagic.processor.config.Agent;
3 4 import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
4 5 import org.springframework.stereotype.Component;
  6 +import lombok.extern.slf4j.Slf4j;
5 7 import us.codecraft.webmagic.Page;
6 8 import us.codecraft.webmagic.Site;
7 9 import us.codecraft.webmagic.Spider;
8 10 import us.codecraft.webmagic.processor.PageProcessor;
9 11 import us.codecraft.webmagic.selector.Html;
  12 +
10 13 @Component
11   -public class MatterPragePcoessor implements PageProcessor {
12   - private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
  14 +@Slf4j
  15 +public class MatterPagePcoessor implements PageProcessor {
  16 +
  17 + private Site site = Site.me()
  18 + .setRetryTimes(3)
  19 + .setSleepTime(5)
  20 + .setUserAgent(Agent.getRandom());
  21 +
13 22 @Override
14 23 public void process(Page page) {
  24 + //首页
  25 + if (page.getUrl().get().equals("https://www.cell.com/matter/home")){
  26 +
  27 + }
  28 + //搜索页
  29 + else if (page.getUrl().get().contains("https://www.cell.com/action/doSearch?")) {
15 30  
  31 + }
  32 + //详情页
  33 + else if (page.getUrl().get().contains("https://www.cell.com/matter/fulltext/")) {
  34 + doArticleContent(page);
  35 + }
16 36 }
17 37  
18 38 @Override
19 39 public Site getSite() {
20   - return PageProcessor.super.getSite();
  40 + return this.site;
21 41 }
22 42  
23   - public void doArticleContent(Page page){
  43 + public void doArticleContent(Page page) {
24 44 Html html = page.getHtml();
  45 + log.info(String.valueOf(html));
25 46 String articleCode = page.getUrl().get();
26 47 // html.xpath()
27 48 }
28 49  
29 50 public static void main(String[] args) {
30 51 // 创建一个Spider,并把我们的处理器放进去
31   - Spider.create(new MatterPragePcoessor())
  52 + Spider.create(new MatterPagePcoessor())
32 53 // 添加这个Spider要爬取的网页地址
33   - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  54 + .addUrl("https://www.cell.com/matter/home")
34 55 .addPipeline(new ArticlePipeline())
35 56 // 开启5个线程执行,并开始爬取
36   - .thread(5).run();
  57 + .thread(1).run();
37 58 }
38 59 }
... ...
src/main/java/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import lombok.extern.slf4j.Slf4j;
  12 +import org.springframework.stereotype.Component;
  13 +import us.codecraft.webmagic.Page;
  14 +import us.codecraft.webmagic.Site;
  15 +import us.codecraft.webmagic.Spider;
  16 +import us.codecraft.webmagic.processor.PageProcessor;
  17 +import us.codecraft.webmagic.selector.Html;
  18 +import us.codecraft.webmagic.selector.Selectable;
  19 +import us.codecraft.webmagic.selector.XpathSelector;
  20 +
  21 +import java.text.ParseException;
  22 +import java.text.SimpleDateFormat;
  23 +import java.util.*;
  24 +import java.util.stream.Collectors;
  25 +
  26 +@Slf4j
  27 +@Component
  28 +public class NatureCommunicatiosPcoessor implements PageProcessor {
  29 +
  30 + @Override
  31 + public void process(Page page) {
  32 + if (page.getUrl().get().equals("https://www.nature.com/ncomms/articles?type=editorial")){
  33 + getIndex(page);
  34 + } else if (page.getUrl().get().contains("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=")) {
  35 + everyPage(page);
  36 + }else if (page.getUrl().get().contains("https://www.nature.com/articles/s41467-022-29269-6")){
  37 + doArticleContent(page);
  38 + }
  39 + }
  40 +
  41 + @Override
  42 + public Site getSite() {
  43 + return PageProcessor.super.getSite();
  44 + }
  45 +
  46 + public void getIndex(Page page){
  47 + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[5]/a/text()").get();
  48 + log.info(maxIndex);
  49 + String trim = maxIndex.trim();
  50 + int number = Integer.parseInt(trim);
  51 + System.out.printf("", number);
  52 + for (int i = 1; i <= number; i++) {
  53 + log.info(String.valueOf(i));
  54 + page.addTargetRequest("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page="+i);
  55 + }
  56 + }
  57 +
  58 + public void everyPage(Page page){
  59 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  60 + for (int i = 1; i <= all.size(); i++) {
  61 + log.info(String.valueOf(i));
  62 + page.addTargetRequest("https://www.nature.com"+all.get(i));
  63 + }
  64 + }
  65 +
  66 + private void doArticleContent(Page page) {
  67 + Html html = page.getHtml();
  68 + String articleCode = page.getUrl().get();
  69 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  70 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  71 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  72 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  73 +
  74 + String title = headSelectable.xpath("//div/h1/text()").get();
  75 + if (StringUtils.isBlank(title)) {
  76 + title = headSelectable.xpath("//h1/text()").get();
  77 + }
  78 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  79 + String publishTime;
  80 + Date publishTimeDateTime = null;
  81 + try {
  82 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  83 + } catch (Exception e) {
  84 + try {
  85 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  86 + } catch (Exception e1) {
  87 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  88 + }
  89 + }
  90 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  91 +
  92 + try {
  93 + publishTimeDateTime = formatter.parse(publishTime);
  94 + } catch (ParseException e) {
  95 + e.printStackTrace();
  96 + }
  97 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  98 + List<Selectable> authorNodes = authorSelectable.nodes();
  99 + StringBuffer authorName = new StringBuffer();
  100 + for (Selectable node : authorNodes) {
  101 + authorName.append(node.xpath("//a/text()"));
  102 + }
  103 +
  104 + JSONArray authorAddress = new JSONArray();
  105 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  106 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  107 + for (Selectable selectable : authorAddressList) {
  108 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  109 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  110 + JSONObject object = new JSONObject();
  111 + object.put("address", address);
  112 + object.put("authorNames", authorNames);
  113 + authorAddress.add(object);
  114 + }
  115 + }
  116 +
  117 + JSONArray references = new JSONArray();
  118 + List<Selectable> referenceList = referencesSelectable.nodes();
  119 + if (CollectionUtils.isNotEmpty(referenceList)) {
  120 + for (Selectable reference : referenceList) {
  121 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  122 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  123 + List<String> links = new ArrayList<>();
  124 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  125 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  126 + }
  127 + JSONObject object = new JSONObject();
  128 + object.put("referenceTitle", referenceTitle);
  129 + object.put("links", links);
  130 +// if (CollectionUtils.isNotEmpty(links)) {
  131 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  132 +// }
  133 + references.add(object);
  134 + }
  135 + }
  136 +
  137 + JSONArray authorEmail = new JSONArray();
  138 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  139 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  140 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  141 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  142 + JSONObject jsonObject = new JSONObject();
  143 + jsonObject.put("authorEmailName", authorEmailName);
  144 + jsonObject.put("email", email);
  145 + authorEmail.add(jsonObject);
  146 + }
  147 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  148 +
  149 + page.putField("article", ArticleDO.builder()
  150 + .articleType(ArticleTypeEnum.NATURE_COMMUNICATIONS.getType())
  151 + .articleCode(articleCode)
  152 + .authorName(authorName.toString())
  153 + .title(title)
  154 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  155 + .emailInfo(authorEmail.toJSONString())
  156 + .articleDesc(articleDesc)
  157 + .authorAddress(authorAddress.toJSONString())
  158 + .referenceInfo(references.toJSONString()).build());
  159 + }
  160 +
  161 + public static void main(String[] args) {
  162 + Spider.create(new MatterPagePcoessor())
  163 + .addUrl("https://www.nature.com/nenergy/research-articles")
  164 + .addPipeline(new ArticlePipeline())
  165 + .thread(1).run();
  166 + }
  167 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureComputationalPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import lombok.extern.slf4j.Slf4j;
  12 +import org.springframework.stereotype.Component;
  13 +import us.codecraft.webmagic.Page;
  14 +import us.codecraft.webmagic.Site;
  15 +import us.codecraft.webmagic.Spider;
  16 +import us.codecraft.webmagic.processor.PageProcessor;
  17 +import us.codecraft.webmagic.selector.Html;
  18 +import us.codecraft.webmagic.selector.Selectable;
  19 +import us.codecraft.webmagic.selector.XpathSelector;
  20 +
  21 +import java.text.ParseException;
  22 +import java.text.SimpleDateFormat;
  23 +import java.util.*;
  24 +import java.util.stream.Collectors;
  25 +
  26 +@Slf4j
  27 +@Component
  28 +public class NatureComputationalPcoessor implements PageProcessor{
  29 + @Override
  30 + public void process(Page page) {
  31 + String url = page.getUrl().get();
  32 + if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){
  33 + everyPage(page);
  34 + } else if (url.contains("https://www.nature.com/")){
  35 + doArticleContent(page);
  36 + }
  37 + }
  38 +
  39 + @Override
  40 + public Site getSite() {
  41 + return PageProcessor.super.getSite();
  42 + }
  43 +
  44 +
  45 + public void everyPage(Page page){
  46 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  47 + for (int i = 0; i < all.size(); i++) {
  48 + page.addTargetRequest("https://www.nature.com/"+all.get(i));
  49 + }
  50 + }
  51 +
  52 + private void doArticleContent(Page page) {
  53 + Html html = page.getHtml();
  54 + String articleCode = page.getUrl().get();
  55 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  56 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  57 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  58 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  59 +
  60 + String title = headSelectable.xpath("//div/h1/text()").get();
  61 + if (StringUtils.isBlank(title)) {
  62 + title = headSelectable.xpath("//h1/text()").get();
  63 + }
  64 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  65 + String publishTime;
  66 + Date publishTimeDateTime = null;
  67 + try {
  68 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  69 + } catch (Exception e) {
  70 + try {
  71 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  72 + } catch (Exception e1) {
  73 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  74 + }
  75 + }
  76 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  77 +
  78 + try {
  79 + publishTimeDateTime = formatter.parse(publishTime);
  80 + } catch (ParseException e) {
  81 + e.printStackTrace();
  82 + }
  83 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  84 + List<Selectable> authorNodes = authorSelectable.nodes();
  85 + StringBuffer authorName = new StringBuffer();
  86 + for (Selectable node : authorNodes) {
  87 + authorName.append(node.xpath("//a/text()"));
  88 + }
  89 +
  90 + JSONArray authorAddress = new JSONArray();
  91 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  92 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  93 + for (Selectable selectable : authorAddressList) {
  94 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  95 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  96 + JSONObject object = new JSONObject();
  97 + object.put("address", address);
  98 + object.put("authorNames", authorNames);
  99 + authorAddress.add(object);
  100 + }
  101 + }
  102 +
  103 + JSONArray references = new JSONArray();
  104 + List<Selectable> referenceList = referencesSelectable.nodes();
  105 + if (CollectionUtils.isNotEmpty(referenceList)) {
  106 + for (Selectable reference : referenceList) {
  107 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  108 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  109 + List<String> links = new ArrayList<>();
  110 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  111 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  112 + }
  113 + JSONObject object = new JSONObject();
  114 + object.put("referenceTitle", referenceTitle);
  115 + object.put("links", links);
  116 +// if (CollectionUtils.isNotEmpty(links)) {
  117 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  118 +// }
  119 + references.add(object);
  120 + }
  121 + }
  122 +
  123 + JSONArray authorEmail = new JSONArray();
  124 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  125 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  126 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  127 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  128 + JSONObject jsonObject = new JSONObject();
  129 + jsonObject.put("authorEmailName", authorEmailName);
  130 + jsonObject.put("email", email);
  131 + authorEmail.add(jsonObject);
  132 + }
  133 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  134 +
  135 + page.putField("article", ArticleDO.builder()
  136 + .articleType(ArticleTypeEnum.NATURE_COMPUTATIONAL_SCIENCE.getType())
  137 + .articleCode(articleCode)
  138 + .authorName(authorName.toString())
  139 + .title(title)
  140 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  141 + .emailInfo(authorEmail.toJSONString())
  142 + .articleDesc(articleDesc)
  143 + .authorAddress(authorAddress.toJSONString())
  144 + .referenceInfo(references.toJSONString()).build());
  145 + }
  146 +
  147 + public static void main(String[] args) {
  148 + Spider.create(new MatterPagePcoessor())
  149 + .addUrl("https://www.nature.com/nenergy/research-articles")
  150 + .addPipeline(new ArticlePipeline())
  151 + .thread(1).run();
  152 + }
  153 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.MatterPagePcoessor;
  11 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  12 +import lombok.extern.slf4j.Slf4j;
  13 +import org.springframework.stereotype.Component;
  14 +import us.codecraft.webmagic.Page;
  15 +import us.codecraft.webmagic.Site;
  16 +import us.codecraft.webmagic.Spider;
  17 +import us.codecraft.webmagic.processor.PageProcessor;
  18 +import us.codecraft.webmagic.selector.Html;
  19 +import us.codecraft.webmagic.selector.Selectable;
  20 +import us.codecraft.webmagic.selector.XpathSelector;
  21 +
  22 +import java.text.ParseException;
  23 +import java.text.SimpleDateFormat;
  24 +import java.util.*;
  25 +import java.util.stream.Collectors;
  26 +
  27 +@Component
  28 +@Slf4j
  29 +public class NatureEnergyPagePcoessor implements PageProcessor {
  30 + @Override
  31 + public void process(Page page) {
  32 + String url = page.getUrl().get();
  33 + if (url.equals("https://www.nature.com/nenergy/research-articles")){
  34 + getIndex(page);
  35 + } else if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){
  36 + everyPage(page);
  37 + } else if (url.contains("https://www.nature.com/")){
  38 + doArticleContent(page);
  39 + }
  40 + }
  41 +
  42 + @Override
  43 + public Site getSite() {
  44 + return PageProcessor.super.getSite();
  45 + }
  46 +
  47 + public void getIndex(Page page){
  48 + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get();
  49 + log.info(maxIndex);
  50 + String trim = maxIndex.trim();
  51 + int number = Integer.parseInt(trim);
  52 + System.out.printf("", number);
  53 + for (int i = 0; i < number; i++) {
  54 + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i);
  55 + }
  56 + }
  57 +
  58 + public void everyPage(Page page){
  59 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  60 + for (int i = 0; i < all.size(); i++) {
  61 + page.addTargetRequest("https://www.nature.com/"+all.get(i));
  62 + }
  63 + }
  64 +
  65 + private void doArticleContent(Page page) {
  66 + Html html = page.getHtml();
  67 + String articleCode = page.getUrl().get();
  68 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  69 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  70 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  71 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  72 +
  73 + String title = headSelectable.xpath("//div/h1/text()").get();
  74 + if (StringUtils.isBlank(title)) {
  75 + title = headSelectable.xpath("//h1/text()").get();
  76 + }
  77 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  78 + String publishTime;
  79 + Date publishTimeDateTime = null;
  80 + try {
  81 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  82 + } catch (Exception e) {
  83 + try {
  84 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  85 + } catch (Exception e1) {
  86 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  87 + }
  88 + }
  89 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  90 +
  91 + try {
  92 + publishTimeDateTime = formatter.parse(publishTime);
  93 + } catch (ParseException e) {
  94 + e.printStackTrace();
  95 + }
  96 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  97 + List<Selectable> authorNodes = authorSelectable.nodes();
  98 + StringBuffer authorName = new StringBuffer();
  99 + for (Selectable node : authorNodes) {
  100 + authorName.append(node.xpath("//a/text()"));
  101 + }
  102 +
  103 + JSONArray authorAddress = new JSONArray();
  104 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  105 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  106 + for (Selectable selectable : authorAddressList) {
  107 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  108 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  109 + JSONObject object = new JSONObject();
  110 + object.put("address", address);
  111 + object.put("authorNames", authorNames);
  112 + authorAddress.add(object);
  113 + }
  114 + }
  115 +
  116 + JSONArray references = new JSONArray();
  117 + List<Selectable> referenceList = referencesSelectable.nodes();
  118 + if (CollectionUtils.isNotEmpty(referenceList)) {
  119 + for (Selectable reference : referenceList) {
  120 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  121 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  122 + List<String> links = new ArrayList<>();
  123 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  124 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  125 + }
  126 + JSONObject object = new JSONObject();
  127 + object.put("referenceTitle", referenceTitle);
  128 + object.put("links", links);
  129 +// if (CollectionUtils.isNotEmpty(links)) {
  130 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  131 +// }
  132 + references.add(object);
  133 + }
  134 + }
  135 +
  136 + JSONArray authorEmail = new JSONArray();
  137 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  138 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  139 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  140 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  141 + JSONObject jsonObject = new JSONObject();
  142 + jsonObject.put("authorEmailName", authorEmailName);
  143 + jsonObject.put("email", email);
  144 + authorEmail.add(jsonObject);
  145 + }
  146 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  147 +
  148 + page.putField("article", ArticleDO.builder()
  149 + .articleType(ArticleTypeEnum.NATURE_ENERGY.getType())
  150 + .articleCode(articleCode)
  151 + .authorName(authorName.toString())
  152 + .title(title)
  153 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  154 + .emailInfo(authorEmail.toJSONString())
  155 + .articleDesc(articleDesc)
  156 + .authorAddress(authorAddress.toJSONString())
  157 + .referenceInfo(references.toJSONString()).build());
  158 + }
  159 +
  160 + public static void main(String[] args) {
  161 + Spider.create(new MatterPagePcoessor())
  162 + .addUrl("https://www.nature.com/nenergy/research-articles")
  163 + .addPipeline(new ArticlePipeline())
  164 + .thread(1).run();
  165 + }
  166 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import lombok.extern.slf4j.Slf4j;
  12 +import org.springframework.stereotype.Component;
  13 +import us.codecraft.webmagic.Page;
  14 +import us.codecraft.webmagic.Site;
  15 +import us.codecraft.webmagic.Spider;
  16 +import us.codecraft.webmagic.processor.PageProcessor;
  17 +import us.codecraft.webmagic.selector.Html;
  18 +import us.codecraft.webmagic.selector.Selectable;
  19 +import us.codecraft.webmagic.selector.XpathSelector;
  20 +
  21 +import java.text.ParseException;
  22 +import java.text.SimpleDateFormat;
  23 +import java.util.*;
  24 +import java.util.stream.Collectors;
  25 +
  26 +@Component
  27 +@Slf4j
  28 +public class NatureMaterialPagePcoessor implements PageProcessor {
  29 + @Override
  30 + public void process(Page page) {
  31 + String url = page.getUrl().get();
  32 + if (url.equals("https://www.nature.com/nmat/articles")){
  33 + getIndex(page);
  34 + } else if (url.contains("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=")){
  35 + everyPage(page);
  36 + } else if (url.equals("https://www.nature.com/search?q=battery")) {
  37 + getIndex(page);
  38 + } else if (url.contains("https://www.nature.com/search?q=battery&page=")) {
  39 + everyPage(page);
  40 + } else if (url.contains("https://www.nature.com/articles")){
  41 + doArticleContent(page);
  42 + } else if (url.equals("https://www.nature.com/nature/research-articles")) {
  43 + getIndex(page);
  44 + } else if (url.contains("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=")) {
  45 + everyPage(page);
  46 + } else if (url.equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")) {
  47 + getIndex(page);
  48 + }else if (url.equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")) {
  49 + getIndex(page);
  50 + }else if (url.contains("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page=")) {
  51 + everyPage(page);
  52 + }else if (url.contains("https://www.nature.com/search?q=batteries&journal=nmat&page=")) {
  53 + everyPage(page);
  54 + }
  55 + }
  56 +
  57 + @Override
  58 + public Site getSite() {
  59 + return PageProcessor.super.getSite().setRetryTimes(3).setSleepTime(100);
  60 + }
  61 +
  62 + public void getIndex(Page page){
  63 + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get();
  64 + log.info(maxIndex);
  65 + String trim = maxIndex.trim();
  66 + int number = Integer.parseInt(trim);
  67 + if (page.getUrl().get().equals("https://www.nature.com/nmat/articles")){
  68 + for (int i = 1; i <= number; i++) {
  69 + page.addTargetRequest("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page="+i);
  70 + }
  71 + }
  72 + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery")){
  73 + for (int i = 1; i <= number; i++) {
  74 + page.addTargetRequest("https://www.nature.com/search?q=battery&page="+i);
  75 + }
  76 + }
  77 + if (page.getUrl().get().equals("https://www.nature.com/nature/research-articles")){
  78 + for (int i = 1; i <= number; i++) {
  79 + page.addTargetRequest("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page="+i);
  80 + }
  81 + }
  82 + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")){
  83 + for (int i = 1; i <= number; i++) {
  84 + page.addTargetRequest("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page="+i);
  85 + }
  86 + }
  87 + if (page.getUrl().get().equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")){
  88 + for (int i = 1; i <= number; i++) {
  89 + page.addTargetRequest("https://www.nature.com/search?q=batteries&journal=nmat&page="+i);
  90 + }
  91 + }
  92 + }
  93 +
  94 + public void everyPage(Page page){
  95 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  96 + for (int i = 0; i < all.size(); i++) {
  97 +// log.info(all.get(i));
  98 + page.addTargetRequest("https://www.nature.com"+all.get(i));
  99 + }
  100 + }
  101 +
  102 + private void doArticleContent(Page page) {
  103 + if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) {
  104 + return;
  105 + }
  106 + //解析页面
  107 + Html html = page.getHtml();
  108 + String articleCode = page.getUrl().get();
  109 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  110 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  111 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  112 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  113 +
  114 + String title = headSelectable.xpath("//div/h1/text()").get();
  115 + if (StringUtils.isBlank(title)) {
  116 + title = headSelectable.xpath("//h1/text()").get();
  117 + }
  118 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  119 + String publishTime;
  120 + Date publishTimeDateTime = null;
  121 + try {
  122 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  123 + } catch (Exception e) {
  124 + try {
  125 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  126 + } catch (Exception e1) {
  127 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  128 + }
  129 + }
  130 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  131 +
  132 + try {
  133 + publishTimeDateTime = formatter.parse(publishTime);
  134 + } catch (ParseException e) {
  135 + e.printStackTrace();
  136 + }
  137 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  138 + List<Selectable> authorNodes = authorSelectable.nodes();
  139 + StringBuffer authorName = new StringBuffer();
  140 + for (Selectable node : authorNodes) {
  141 + authorName.append(node.xpath("//a/text()"));
  142 + }
  143 +
  144 + JSONArray authorAddress = new JSONArray();
  145 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  146 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  147 + for (Selectable selectable : authorAddressList) {
  148 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  149 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  150 + JSONObject object = new JSONObject();
  151 + object.put("address", address);
  152 + object.put("authorNames", authorNames);
  153 + authorAddress.add(object);
  154 + }
  155 + }
  156 +
  157 + JSONArray references = new JSONArray();
  158 + List<Selectable> referenceList = referencesSelectable.nodes();
  159 + if (CollectionUtils.isNotEmpty(referenceList)) {
  160 + for (Selectable reference : referenceList) {
  161 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  162 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  163 + List<String> links = new ArrayList<>();
  164 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  165 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  166 + }
  167 + JSONObject object = new JSONObject();
  168 + object.put("referenceTitle", referenceTitle);
  169 + object.put("links", links);
  170 +// if (CollectionUtils.isNotEmpty(links)) {
  171 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  172 +// }
  173 + references.add(object);
  174 + }
  175 + }
  176 +
  177 + JSONArray authorEmail = new JSONArray();
  178 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  179 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  180 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  181 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  182 + JSONObject jsonObject = new JSONObject();
  183 + jsonObject.put("authorEmailName", authorEmailName);
  184 + jsonObject.put("email", email);
  185 + authorEmail.add(jsonObject);
  186 + }
  187 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  188 +
  189 + page.putField("article", ArticleDO.builder()
  190 + .articleType(ArticleTypeEnum.NATURE_MATERIAL.getType())
  191 + .articleCode(articleCode)
  192 + .authorName(authorName.toString())
  193 + .title(title)
  194 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  195 + .emailInfo(authorEmail.toJSONString())
  196 + .articleDesc(articleDesc)
  197 + .authorAddress(authorAddress.toJSONString())
  198 + .referenceInfo(references.toJSONString()).build());
  199 + }
  200 +
  201 + public static void main(String[] args) {
  202 + Spider.create(new MatterPagePcoessor())
  203 + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=1")
  204 + .addPipeline(new ArticlePipeline())
  205 + .thread(1).run();
  206 + }
  207 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureMethodsPcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import lombok.extern.slf4j.Slf4j;
  12 +import org.springframework.stereotype.Component;
  13 +import us.codecraft.webmagic.Page;
  14 +import us.codecraft.webmagic.Site;
  15 +import us.codecraft.webmagic.Spider;
  16 +import us.codecraft.webmagic.processor.PageProcessor;
  17 +import us.codecraft.webmagic.selector.Html;
  18 +import us.codecraft.webmagic.selector.Selectable;
  19 +import us.codecraft.webmagic.selector.XpathSelector;
  20 +
  21 +import java.text.ParseException;
  22 +import java.text.SimpleDateFormat;
  23 +import java.util.*;
  24 +import java.util.stream.Collectors;
  25 +
  26 +@Slf4j
  27 +@Component
  28 +public class NatureMethodsPcoessor implements PageProcessor {
  29 + @Override
  30 + public void process(Page page) {
  31 + String url = page.getUrl().get();
  32 + if (url.equals("https://www.nature.com/nmeth/")) {
  33 + everyPage(page);
  34 + }else if (url.contains("https://www.nature.com/")){
  35 + doArticleContent(page);
  36 + }
  37 + }
  38 +
  39 + @Override
  40 + public Site getSite() {
  41 + return PageProcessor.super.getSite();
  42 + }
  43 + public void everyPage(Page page){
  44 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  45 + for (int i = 0; i < all.size(); i++) {
  46 + page.addTargetRequest("https://www.nature.com/"+all.get(i));
  47 + }
  48 + }
  49 +
  50 + private void doArticleContent(Page page) {
  51 + Html html = page.getHtml();
  52 + String articleCode = page.getUrl().get();
  53 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  54 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  55 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  56 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  57 +
  58 + String title = headSelectable.xpath("//div/h1/text()").get();
  59 + if (StringUtils.isBlank(title)) {
  60 + title = headSelectable.xpath("//h1/text()").get();
  61 + }
  62 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  63 + String publishTime;
  64 + Date publishTimeDateTime = null;
  65 + try {
  66 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  67 + } catch (Exception e) {
  68 + try {
  69 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  70 + } catch (Exception e1) {
  71 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  72 + }
  73 + }
  74 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  75 +
  76 + try {
  77 + publishTimeDateTime = formatter.parse(publishTime);
  78 + } catch (ParseException e) {
  79 + e.printStackTrace();
  80 + }
  81 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  82 + List<Selectable> authorNodes = authorSelectable.nodes();
  83 + StringBuffer authorName = new StringBuffer();
  84 + for (Selectable node : authorNodes) {
  85 + authorName.append(node.xpath("//a/text()"));
  86 + }
  87 +
  88 + JSONArray authorAddress = new JSONArray();
  89 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  90 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  91 + for (Selectable selectable : authorAddressList) {
  92 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  93 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  94 + JSONObject object = new JSONObject();
  95 + object.put("address", address);
  96 + object.put("authorNames", authorNames);
  97 + authorAddress.add(object);
  98 + }
  99 + }
  100 +
  101 + JSONArray references = new JSONArray();
  102 + List<Selectable> referenceList = referencesSelectable.nodes();
  103 + if (CollectionUtils.isNotEmpty(referenceList)) {
  104 + for (Selectable reference : referenceList) {
  105 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  106 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  107 + List<String> links = new ArrayList<>();
  108 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  109 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  110 + }
  111 + JSONObject object = new JSONObject();
  112 + object.put("referenceTitle", referenceTitle);
  113 + object.put("links", links);
  114 +// if (CollectionUtils.isNotEmpty(links)) {
  115 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  116 +// }
  117 + references.add(object);
  118 + }
  119 + }
  120 +
  121 + JSONArray authorEmail = new JSONArray();
  122 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  123 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  124 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  125 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  126 + JSONObject jsonObject = new JSONObject();
  127 + jsonObject.put("authorEmailName", authorEmailName);
  128 + jsonObject.put("email", email);
  129 + authorEmail.add(jsonObject);
  130 + }
  131 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  132 +
  133 + page.putField("article", ArticleDO.builder()
  134 + .articleType(ArticleTypeEnum.NATURE_METHODS.getType())
  135 + .articleCode(articleCode)
  136 + .authorName(authorName.toString())
  137 + .title(title)
  138 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  139 + .emailInfo(authorEmail.toJSONString())
  140 + .articleDesc(articleDesc)
  141 + .authorAddress(authorAddress.toJSONString())
  142 + .referenceInfo(references.toJSONString()).build());
  143 + }
  144 +
  145 + public static void main(String[] args) {
  146 + Spider.create(new MatterPagePcoessor())
  147 + .addUrl("https://www.nature.com/nenergy/research-articles")
  148 + .addPipeline(new ArticlePipeline())
  149 + .thread(1).run();
  150 + }
  151 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import lombok.extern.slf4j.Slf4j;
  11 +import org.springframework.stereotype.Component;
  12 +import us.codecraft.webmagic.Page;
  13 +import us.codecraft.webmagic.Site;
  14 +import us.codecraft.webmagic.processor.PageProcessor;
  15 +import us.codecraft.webmagic.selector.Html;
  16 +import us.codecraft.webmagic.selector.Selectable;
  17 +import us.codecraft.webmagic.selector.XpathSelector;
  18 +
  19 +import java.text.ParseException;
  20 +import java.text.SimpleDateFormat;
  21 +import java.util.*;
  22 +import java.util.stream.Collectors;
  23 +
  24 +@Component
  25 +@Slf4j
  26 +public class NatureNanotechnologyProcessor implements PageProcessor {
  27 +
  28 + //目前只有一页
  29 + @Override
  30 + public void process(Page page) {
  31 + String url = page.getUrl().get();
  32 + if (url.equals("https://www.nature.com/nnano/")){
  33 + everyPage(page);
  34 + } else if (url.contains("https://www.nature.com/")){
  35 + doArticleContent(page);
  36 + }
  37 + }
  38 +
  39 + @Override
  40 + public Site getSite() {
  41 + return PageProcessor.super.getSite();
  42 + }
  43 +
  44 + public void getIndex(Page page){
  45 + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get();
  46 + log.info(maxIndex);
  47 + String trim = maxIndex.trim();
  48 + int number = Integer.parseInt(trim);
  49 + System.out.printf("", number);
  50 + for (int i = 0; i < number; i++) {
  51 + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i);
  52 + }
  53 + }
  54 +
  55 + public void everyPage(Page page){
  56 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  57 + for (int i = 0; i < all.size(); i++) {
  58 + page.addTargetRequest("https://www.nature.com/"+all.get(i));
  59 + }
  60 + }
  61 +
  62 + private void doArticleContent(Page page) {
  63 + Html html = page.getHtml();
  64 + String articleCode = page.getUrl().get();
  65 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  66 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  67 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  68 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  69 +
  70 + String title = headSelectable.xpath("//div/h1/text()").get();
  71 + if (StringUtils.isBlank(title)) {
  72 + title = headSelectable.xpath("//h1/text()").get();
  73 + }
  74 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  75 + String publishTime;
  76 + Date publishTimeDateTime = null;
  77 + try {
  78 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  79 + } catch (Exception e) {
  80 + try {
  81 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  82 + } catch (Exception e1) {
  83 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  84 + }
  85 + }
  86 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  87 +
  88 + try {
  89 + publishTimeDateTime = formatter.parse(publishTime);
  90 + } catch (ParseException e) {
  91 + e.printStackTrace();
  92 + }
  93 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  94 + List<Selectable> authorNodes = authorSelectable.nodes();
  95 + StringBuffer authorName = new StringBuffer();
  96 + for (Selectable node : authorNodes) {
  97 + authorName.append(node.xpath("//a/text()"));
  98 + }
  99 +
  100 + JSONArray authorAddress = new JSONArray();
  101 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  102 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  103 + for (Selectable selectable : authorAddressList) {
  104 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  105 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  106 + JSONObject object = new JSONObject();
  107 + object.put("address", address);
  108 + object.put("authorNames", authorNames);
  109 + authorAddress.add(object);
  110 + }
  111 + }
  112 +
  113 + JSONArray references = new JSONArray();
  114 + List<Selectable> referenceList = referencesSelectable.nodes();
  115 + if (CollectionUtils.isNotEmpty(referenceList)) {
  116 + for (Selectable reference : referenceList) {
  117 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  118 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  119 + List<String> links = new ArrayList<>();
  120 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  121 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  122 + }
  123 + JSONObject object = new JSONObject();
  124 + object.put("referenceTitle", referenceTitle);
  125 + object.put("links", links);
  126 +// if (CollectionUtils.isNotEmpty(links)) {
  127 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  128 +// }
  129 + references.add(object);
  130 + }
  131 + }
  132 +
  133 + JSONArray authorEmail = new JSONArray();
  134 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  135 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  136 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  137 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  138 + JSONObject jsonObject = new JSONObject();
  139 + jsonObject.put("authorEmailName", authorEmailName);
  140 + jsonObject.put("email", email);
  141 + authorEmail.add(jsonObject);
  142 + }
  143 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  144 +
  145 + page.putField("article", ArticleDO.builder()
  146 + .articleType(ArticleTypeEnum.NATURE_NANOTECHNOLOGY.getType())
  147 + .articleCode(articleCode)
  148 + .authorName(authorName.toString())
  149 + .title(title)
  150 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  151 + .emailInfo(authorEmail.toJSONString())
  152 + .articleDesc(articleDesc)
  153 + .authorAddress(authorAddress.toJSONString())
  154 + .referenceInfo(references.toJSONString()).build());
  155 + }
  156 +}
... ...
src/main/java/com/canrd/webmagic/processor/NaturePhysicsProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
  7 +import com.canrd.webmagic.common.utils.StringUtils;
  8 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  9 +import com.canrd.webmagic.domain.dto.ArticleDO;
  10 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  11 +import lombok.extern.slf4j.Slf4j;
  12 +import org.springframework.stereotype.Component;
  13 +import us.codecraft.webmagic.Page;
  14 +import us.codecraft.webmagic.Site;
  15 +import us.codecraft.webmagic.Spider;
  16 +import us.codecraft.webmagic.processor.PageProcessor;
  17 +import us.codecraft.webmagic.selector.Html;
  18 +import us.codecraft.webmagic.selector.Selectable;
  19 +import us.codecraft.webmagic.selector.XpathSelector;
  20 +
  21 +import java.text.ParseException;
  22 +import java.text.SimpleDateFormat;
  23 +import java.util.*;
  24 +import java.util.stream.Collectors;
  25 +
  26 +@Slf4j
  27 +@Component
  28 +public class NaturePhysicsProcessor implements PageProcessor{
  29 + @Override
  30 + public void process(Page page) {
  31 + String url = page.getUrl().get();
  32 + if (url.equals("https://www.nature.com/nphys/")) {
  33 + everyPage(page);
  34 + }else if (url.contains("https://www.nature.com/")){
  35 + doArticleContent(page);
  36 + }
  37 + }
  38 +
  39 + @Override
  40 + public Site getSite() {
  41 + return PageProcessor.super.getSite();
  42 + }
  43 + public void everyPage(Page page){
  44 + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all();
  45 + for (int i = 0; i < all.size(); i++) {
  46 + page.addTargetRequest("https://www.nature.com/"+all.get(i));
  47 + }
  48 + }
  49 +
  50 + private void doArticleContent(Page page) {
  51 + Html html = page.getHtml();
  52 + String articleCode = page.getUrl().get();
  53 + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
  54 + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
  55 + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']"));
  56 + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li"));
  57 +
  58 + String title = headSelectable.xpath("//div/h1/text()").get();
  59 + if (StringUtils.isBlank(title)) {
  60 + title = headSelectable.xpath("//h1/text()").get();
  61 + }
  62 + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
  63 + String publishTime;
  64 + Date publishTimeDateTime = null;
  65 + try {
  66 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
  67 + } catch (Exception e) {
  68 + try {
  69 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get();
  70 + } catch (Exception e1) {
  71 + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
  72 + }
  73 + }
  74 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  75 +
  76 + try {
  77 + publishTimeDateTime = formatter.parse(publishTime);
  78 + } catch (ParseException e) {
  79 + e.printStackTrace();
  80 + }
  81 + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
  82 + List<Selectable> authorNodes = authorSelectable.nodes();
  83 + StringBuffer authorName = new StringBuffer();
  84 + for (Selectable node : authorNodes) {
  85 + authorName.append(node.xpath("//a/text()"));
  86 + }
  87 +
  88 + JSONArray authorAddress = new JSONArray();
  89 + List<Selectable> authorAddressList = authorAddressSelectable.nodes();
  90 + if (CollectionUtils.isNotEmpty(authorAddressList)) {
  91 + for (Selectable selectable : authorAddressList) {
  92 + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get();
  93 + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get();
  94 + JSONObject object = new JSONObject();
  95 + object.put("address", address);
  96 + object.put("authorNames", authorNames);
  97 + authorAddress.add(object);
  98 + }
  99 + }
  100 +
  101 + JSONArray references = new JSONArray();
  102 + List<Selectable> referenceList = referencesSelectable.nodes();
  103 + if (CollectionUtils.isNotEmpty(referenceList)) {
  104 + for (Selectable reference : referenceList) {
  105 + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get();
  106 + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes();
  107 + List<String> links = new ArrayList<>();
  108 + if (CollectionUtils.isNotEmpty(referenceLinks)) {
  109 + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList());
  110 + }
  111 + JSONObject object = new JSONObject();
  112 + object.put("referenceTitle", referenceTitle);
  113 + object.put("links", links);
  114 +// if (CollectionUtils.isNotEmpty(links)) {
  115 +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList()));
  116 +// }
  117 + references.add(object);
  118 + }
  119 + }
  120 +
  121 + JSONArray authorEmail = new JSONArray();
  122 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  123 + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":");
  124 + String email = Objects.isNull(split) ? "" : split[split.length - 1];
  125 + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
  126 + JSONObject jsonObject = new JSONObject();
  127 + jsonObject.put("authorEmailName", authorEmailName);
  128 + jsonObject.put("email", email);
  129 + authorEmail.add(jsonObject);
  130 + }
  131 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  132 +
  133 + page.putField("article", ArticleDO.builder()
  134 + .articleType(ArticleTypeEnum.NATURE_PHYSICS.getType())
  135 + .articleCode(articleCode)
  136 + .authorName(authorName.toString())
  137 + .title(title)
  138 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  139 + .emailInfo(authorEmail.toJSONString())
  140 + .articleDesc(articleDesc)
  141 + .authorAddress(authorAddress.toJSONString())
  142 + .referenceInfo(references.toJSONString()).build());
  143 + }
  144 +
  145 + public static void main(String[] args) {
  146 + Spider.create(new MatterPagePcoessor())
  147 + .addUrl("https://www.nature.com/nenergy/research-articles")
  148 + .addPipeline(new ArticlePipeline())
  149 + .thread(1).run();
  150 + }
  151 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -55,7 +55,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
55 55 }else {
56 56 doArticleContent(page);
57 57 }
58   -
59 58 }
60 59  
61 60 /**
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
... ... @@ -140,7 +140,6 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
140 140  
141 141 }
142 142 }
143   -
144 143 }
145 144  
146 145 @Override
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... ... @@ -3,10 +3,9 @@ package com.canrd.webmagic.processor.download;
3 3 import com.canrd.webmagic.config.SeleniumConfig;
4 4 import com.canrd.webmagic.processor.config.Agent;
5 5 import lombok.extern.slf4j.Slf4j;
6   -import org.openqa.selenium.By;
7   -import org.openqa.selenium.Cookie;
8   -import org.openqa.selenium.WebDriver;
9   -import org.openqa.selenium.WebElement;
  6 +import org.openqa.selenium.*;
  7 +import org.openqa.selenium.support.ui.ExpectedConditions;
  8 +import org.openqa.selenium.support.ui.WebDriverWait;
10 9 import org.springframework.stereotype.Component;
11 10 import us.codecraft.webmagic.Page;
12 11 import us.codecraft.webmagic.Request;
... ... @@ -28,7 +27,7 @@ import java.util.Map;
28 27 @Slf4j
29 28 @Component
30 29 public class SeleniumDownloader extends AbstractDownloader {
31   - private int sleepTime = 3000;
  30 + private int sleepTime = 3000000;
32 31  
33 32 @Resource
34 33 private SeleniumConfig config;
... ... @@ -64,8 +63,23 @@ public class SeleniumDownloader extends AbstractDownloader {
64 63 }
65 64  
66 65 log.info("downloading page " + request.getUrl());
67   -
68 66 webDriver.get(request.getUrl());
  67 + if (request.getUrl().equals("https://www.cell.com/matter/home")) {
  68 + WebElement searchText = webDriver.findElement(By.id("searchText"));
  69 + searchText.sendKeys("Aluminum foil");
  70 + WebElement element = webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button"));
  71 + element.submit();
  72 + WebDriverWait wait = new WebDriverWait(webDriver, 30);
  73 + wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?"));
  74 +// wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']")));
  75 +// WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input"));
  76 +// if (cloudFlare!=null){
  77 +// cloudFlare.click();
  78 +// }
  79 + }
  80 + if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){
  81 +
  82 + }
69 83 try {
70 84 if (sleepTime > 0) {
71 85 //休眠3秒就是为了动态的数据渲染完成后在进行获取
... ... @@ -75,6 +89,18 @@ public class SeleniumDownloader extends AbstractDownloader {
75 89 e.printStackTrace();
76 90 }
77 91  
  92 +// WebElement targetElement;
  93 +// do {
  94 +// try {
  95 +// targetElement = webDriver.findElement(By.xpath("//h2[@class=\"h2\"]"));
  96 +// log.info(String.valueOf(targetElement));
  97 +// log.info("等待验证中");
  98 +// Thread.sleep(sleepTime); // 等待一段时间后再检查
  99 +// } catch (NoSuchElementException e) {
  100 +// targetElement = null; // 如果找不到特定元素,则退出循环
  101 +// }
  102 +// } while (targetElement != null);
  103 +
78 104 WebElement webElement = webDriver.findElement(By.xpath("/html"));
79 105 String content = webElement.getAttribute("outerHTML");
80 106 page.setDownloadSuccess(true);
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
... ... @@ -59,13 +59,14 @@ public class MyChromeDriver implements BrowserDriver{
59 59 options.addArguments("blink-settings=imagesEnabled=false");
60 60 // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
61 61 options.addArguments("--headless");
  62 + //进入指定地址
  63 +// options.setExperimentalOption("debuggerAddress", "127.0.0.1:9222");
62 64 //禁用 blink 特征
63 65 options.addArguments("disable-blink-features=AutomationControlled");
64 66 options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
65 67 options.setExperimentalOption("useAutomationExtension", false);
66 68 options.addArguments("--remote-allow-origins=*");
67 69  
68   -
69 70 String os_name = System.getProperty("os.name");
70 71 // 判断是否是windows系统
71 72 if (os_name.toLowerCase().startsWith("win")) {
... ...
src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
... ... @@ -27,11 +27,11 @@ public class ArticlePipeline implements Pipeline {
27 27 public void process(ResultItems resultItems, Task task) {
28 28 ArticleDO articleDO = resultItems.get("article");
29 29 if (Objects.nonNull(articleDO)) {
30   - List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode()));
31   - if (CollectionUtils.isNotEmpty(natureArticleDO)) {
32   - return;
33   - }
34   - articleService.save(articleDO);
  30 + List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode()));
  31 + if (CollectionUtils.isNotEmpty(natureArticleDO)) {
  32 + return;
  33 + }
  34 + articleService.save(articleDO);
35 35 }
36 36 }
37 37 }
... ...
src/main/resources/application-test.yml
... ... @@ -59,7 +59,7 @@ spring:
59 59 testOnReturn: true
60 60 password: 123456
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
... ...
target/classes/application-test.yml
... ... @@ -59,7 +59,7 @@ spring:
59 59 testOnReturn: true
60 60 password: 123456
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
... ...
target/classes/com/canrd/webmagic/DNS/DnsResolver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureCommunicatiosController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureComputationalController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureEnergyController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMaterialController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMethodsController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureNanotechnologyController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NaturePhysicsController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class renamed to target/classes/com/canrd/webmagic/processor/MatterPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureComputationalPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMethodsPcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NaturePhysicsProcessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class
No preview for this file type