Commit 9f5b8d9b5b36694b8e7f1b1c1cba1d4a742e8449
1 parent
1680e181
提交
Showing
52 changed files
with
1561 additions
and
29 deletions
Too many changes to show.
To preserve performance only 52 of 55 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
@@ -2,5 +2,6 @@ | @@ -2,5 +2,6 @@ | ||
2 | <profile version="1.0"> | 2 | <profile version="1.0"> |
3 | <option name="myName" value="Project Default" /> | 3 | <option name="myName" value="Project Default" /> |
4 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> | 4 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
5 | + <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> | ||
5 | </profile> | 6 | </profile> |
6 | </component> | 7 | </component> |
7 | \ No newline at end of file | 8 | \ No newline at end of file |
pom.xml
@@ -70,6 +70,12 @@ | @@ -70,6 +70,12 @@ | ||
70 | <!-- <version>${browsermob.version}</version>--> | 70 | <!-- <version>${browsermob.version}</version>--> |
71 | <!-- </dependency>--> | 71 | <!-- </dependency>--> |
72 | 72 | ||
73 | +<!-- DNS--> | ||
74 | + <dependency> | ||
75 | + <groupId>dnsjava</groupId> | ||
76 | + <artifactId>dnsjava</artifactId> | ||
77 | + <version>2.1.8</version> | ||
78 | + </dependency> | ||
73 | 79 | ||
74 | <!-- webmagic核心库 --> | 80 | <!-- webmagic核心库 --> |
75 | <dependency> | 81 | <dependency> |
src/main/java/com/canrd/webmagic/DNS/DnsResolver.java
0 → 100644
1 | +package com.canrd.webmagic.DNS; | ||
2 | + | ||
3 | +import org.xbill.DNS.*; | ||
4 | + | ||
5 | +public class DnsResolver { | ||
6 | + public static String resolve(String domain) { | ||
7 | + try { | ||
8 | + Record[] records = new Lookup(domain, Type.A).run(); | ||
9 | + if (records != null && records.length > 0) { | ||
10 | + ARecord aRecord = (ARecord) records[0]; | ||
11 | + return aRecord.getAddress().getHostAddress(); | ||
12 | + } | ||
13 | + } catch (TextParseException e) { | ||
14 | + e.printStackTrace(); | ||
15 | + } | ||
16 | + return null; | ||
17 | + } | ||
18 | +} | ||
0 | \ No newline at end of file | 19 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/controller/MatterController.java
1 | package com.canrd.webmagic.controller; | 1 | package com.canrd.webmagic.controller; |
2 | 2 | ||
3 | import com.canrd.webmagic.common.constant.ServerResult; | 3 | import com.canrd.webmagic.common.constant.ServerResult; |
4 | -import com.canrd.webmagic.processor.MatterPragePcoessor; | 4 | +import com.canrd.webmagic.processor.MatterPagePcoessor; |
5 | import com.canrd.webmagic.processor.download.SeleniumDownloader; | 5 | import com.canrd.webmagic.processor.download.SeleniumDownloader; |
6 | import io.swagger.annotations.Api; | 6 | import io.swagger.annotations.Api; |
7 | import io.swagger.annotations.ApiOperation; | 7 | import io.swagger.annotations.ApiOperation; |
@@ -18,7 +18,7 @@ import javax.annotation.Resource; | @@ -18,7 +18,7 @@ import javax.annotation.Resource; | ||
18 | @Api("Matter") | 18 | @Api("Matter") |
19 | public class MatterController { | 19 | public class MatterController { |
20 | @Resource | 20 | @Resource |
21 | - private MatterPragePcoessor matterPragePcoessor; | 21 | + private MatterPagePcoessor matterPragePcoessor; |
22 | 22 | ||
23 | @Resource | 23 | @Resource |
24 | private SeleniumDownloader seleniumDownloader; | 24 | private SeleniumDownloader seleniumDownloader; |
@@ -26,11 +26,11 @@ public class MatterController { | @@ -26,11 +26,11 @@ public class MatterController { | ||
26 | @GetMapping("/start") | 26 | @GetMapping("/start") |
27 | @ApiOperation("start") | 27 | @ApiOperation("start") |
28 | public ServerResult start() { | 28 | public ServerResult start() { |
29 | - Spider.create(new MatterPragePcoessor()) | 29 | + Spider.create(matterPragePcoessor) |
30 | // 添加这个Spider要爬取的网页地址 | 30 | // 添加这个Spider要爬取的网页地址 |
31 | - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | 31 | + .addUrl("https://www.cell.com/matter/home") |
32 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) | 32 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) |
33 | - .setDownloader(seleniumDownloader.setSleepTime(30000)) | 33 | + .setDownloader(seleniumDownloader) |
34 | // 开启5个线程执行,并开始爬取 | 34 | // 开启5个线程执行,并开始爬取 |
35 | .thread(5).run(); | 35 | .thread(5).run(); |
36 | return ServerResult.success(); | 36 | return ServerResult.success(); |
src/main/java/com/canrd/webmagic/controller/NatureCommunicatiosController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureCommunicatiosPcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | +@RestController | ||
18 | +@RequestMapping("/nature-communicatios/article") | ||
19 | +@Api("Nature") | ||
20 | +public class NatureCommunicatiosController { | ||
21 | + @Resource | ||
22 | + private NatureCommunicatiosPcoessor natureCommunicatiosPcoessor; | ||
23 | + @Resource | ||
24 | + private ArticlePipeline articlePipeline; | ||
25 | + /** GET /nature-communicatios/article/start: builds a Spider seeded with the ncomms editorial listing, runs it with 20 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
26 | + @GetMapping("/start") | ||
27 | + @ApiOperation("start") | ||
28 | + public ServerResult start() { | ||
29 | + Spider.create(natureCommunicatiosPcoessor) | ||
30 | + .addUrl("https://www.nature.com/ncomms/articles?type=editorial") | ||
31 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
32 | + .addPipeline(articlePipeline) | ||
33 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | ||
34 | + .thread(20).run(); | ||
35 | + return ServerResult.success(); | ||
36 | + } | ||
37 | +} |
src/main/java/com/canrd/webmagic/controller/NatureComputationalController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureComputationalPcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/nature-computational/article") | ||
20 | +@Api("Nature") | ||
21 | +public class NatureComputationalController { | ||
22 | + @Resource | ||
23 | + private NatureComputationalPcoessor natureComputationalPcoessor; | ||
24 | + @Resource | ||
25 | + private ArticlePipeline articlePipeline; | ||
26 | + /** GET /nature-computational/article/start: builds a Spider seeded with the natcomputsci home page, runs it with 20 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
27 | + @GetMapping("/start") | ||
28 | + @ApiOperation("start") | ||
29 | + public ServerResult start() { | ||
30 | + Spider.create(natureComputationalPcoessor) | ||
31 | + .addUrl("https://www.nature.com/natcomputsci") | ||
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
33 | + .addPipeline(articlePipeline) | ||
34 | + .setScheduler(new RedisScheduler("127.0.0.1")) /* scheduler host is hard-coded to the local machine */ | ||
35 | + .thread(20).run(); | ||
36 | + return ServerResult.success(); | ||
37 | + } | ||
38 | +} |
src/main/java/com/canrd/webmagic/controller/NatureEnergyController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/nature-energy/article") | ||
20 | +@Api("Nature") | ||
21 | +public class NatureEnergyController { | ||
22 | + @Resource | ||
23 | + private NatureEnergyPagePcoessor natureEnergyPagePcoessor; | ||
24 | + @Resource | ||
25 | + private ArticlePipeline articlePipeline; | ||
26 | + /** GET /nature-energy/article/start: builds a Spider seeded with the nenergy research-articles listing, runs it with 20 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
27 | + @GetMapping("/start") | ||
28 | + @ApiOperation("start") | ||
29 | + public ServerResult start() { | ||
30 | + Spider.create(natureEnergyPagePcoessor) | ||
31 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
33 | + .addPipeline(articlePipeline) | ||
34 | + .setScheduler(new RedisScheduler("127.0.0.1")) /* scheduler host is hard-coded to the local machine */ | ||
35 | + .thread(20).run(); | ||
36 | + return ServerResult.success(); | ||
37 | + } | ||
38 | +} |
src/main/java/com/canrd/webmagic/controller/NatureMaterialController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | ||
6 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
7 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
8 | +import io.swagger.annotations.Api; | ||
9 | +import io.swagger.annotations.ApiOperation; | ||
10 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
11 | +import org.springframework.web.bind.annotation.GetMapping; | ||
12 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
13 | +import org.springframework.web.bind.annotation.RestController; | ||
14 | +import us.codecraft.webmagic.Spider; | ||
15 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
16 | + | ||
17 | +import javax.annotation.Resource; | ||
18 | + | ||
19 | +@RestController | ||
20 | +@RequestMapping("/nature-material/article") | ||
21 | +@Api("Nature") | ||
22 | +public class NatureMaterialController { | ||
23 | + @Resource | ||
24 | + private NatureMaterialPagePcoessor natureMaterialPagePcoessor; | ||
25 | + @Resource | ||
26 | + private ArticlePipeline articlePipeline; | ||
27 | + /** GET /nature-material/article/start: builds a Spider with five seed URLs (nmat listing, three battery searches, nature research articles), runs it with 60 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
28 | + @GetMapping("/start") | ||
29 | + @ApiOperation("start") | ||
30 | + public ServerResult start() { | ||
31 | + Spider.create(natureMaterialPagePcoessor) | ||
32 | + // Seed URLs this spider starts crawling from | ||
33 | + .addUrl("https://www.nature.com/nmat/articles") | ||
34 | + .addUrl("https://www.nature.com/search?q=battery&journal=nmat&order=relevance") | ||
35 | + .addUrl("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance") | ||
36 | + .addUrl("https://www.nature.com/search?q=battery") | ||
37 | + .addUrl("https://www.nature.com/nature/research-articles") | ||
38 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
39 | + .addPipeline(articlePipeline) | ||
40 | + .setScheduler(new RedisScheduler("127.0.0.1")) | ||
41 | + // Run with 60 threads and start crawling | ||
42 | + .thread(60).run(); | ||
43 | + return ServerResult.success(); | ||
44 | + } | ||
45 | +} |
src/main/java/com/canrd/webmagic/controller/NatureMethodsController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureMethodsPcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/nature-methods/article") | ||
20 | +@Api("Nature") | ||
21 | +public class NatureMethodsController { | ||
22 | + @Resource | ||
23 | + private NatureMethodsPcoessor natureMethodsPcoessor; | ||
24 | + @Resource | ||
25 | + private ArticlePipeline articlePipeline; | ||
26 | + /** GET /nature-methods/article/start: builds a Spider seeded with the nmeth home page, runs it with 20 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
27 | + @GetMapping("/start") | ||
28 | + @ApiOperation("start") | ||
29 | + public ServerResult start() { | ||
30 | + Spider.create(natureMethodsPcoessor) | ||
31 | + // Seed URL this spider starts crawling from | ||
32 | + .addUrl("https://www.nature.com/nmeth/") | ||
33 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
34 | + .addPipeline(articlePipeline) | ||
35 | + .setScheduler(new RedisScheduler("127.0.0.1")) | ||
36 | + // Run with 20 threads and start crawling | ||
37 | + .thread(20).run(); | ||
38 | + return ServerResult.success(); | ||
39 | + } | ||
40 | +} |
src/main/java/com/canrd/webmagic/controller/NatureNanotechnologyController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | ||
5 | +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | + | ||
18 | +@RestController | ||
19 | +@RequestMapping("/nature-nanotechnology/article") | ||
20 | +@Api("Nature") | ||
21 | +public class NatureNanotechnologyController { | ||
22 | + @Resource | ||
23 | + private NatureNanotechnologyProcessor natureNanotechnologyProcessor; | ||
24 | + @Resource | ||
25 | + private ArticlePipeline articlePipeline; | ||
26 | + /** GET /nature-nanotechnology/article/start: builds a Spider seeded with the nnano home page, runs it with 60 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
27 | + @GetMapping("/start") | ||
28 | + @ApiOperation("start") | ||
29 | + public ServerResult start() { | ||
30 | + Spider.create(natureNanotechnologyProcessor) | ||
31 | + // Seed URL this spider starts crawling from | ||
32 | + .addUrl("https://www.nature.com/nnano/") | ||
33 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
34 | + .addPipeline(articlePipeline) | ||
35 | + .setScheduler(new RedisScheduler("127.0.0.1")) | ||
36 | + // Run with 60 threads and start crawling | ||
37 | + .thread(60).run(); | ||
38 | + return ServerResult.success(); | ||
39 | + } | ||
40 | +} |
src/main/java/com/canrd/webmagic/controller/NaturePhysicsController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | ||
5 | +import com.canrd.webmagic.processor.NaturePhysicsProcessor; | ||
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
7 | +import io.swagger.annotations.Api; | ||
8 | +import io.swagger.annotations.ApiOperation; | ||
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
10 | +import org.springframework.web.bind.annotation.GetMapping; | ||
11 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
12 | +import org.springframework.web.bind.annotation.RestController; | ||
13 | +import us.codecraft.webmagic.Spider; | ||
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | ||
15 | + | ||
16 | +import javax.annotation.Resource; | ||
17 | +@RestController | ||
18 | +@RequestMapping("/nature-physics/article") | ||
19 | +@Api("Nature") | ||
20 | +public class NaturePhysicsController { | ||
21 | + @Resource | ||
22 | + private NaturePhysicsProcessor naturePhysicsProcessor; | ||
23 | + @Resource | ||
24 | + private ArticlePipeline articlePipeline; | ||
25 | + /** GET /nature-physics/article/start: builds a Spider seeded with the nphys home page, runs it with 20 threads and then returns success. NOTE(review): run() is invoked inline before the response is returned - confirm that is intended. */ | ||
26 | + @GetMapping("/start") | ||
27 | + @ApiOperation("start") | ||
28 | + public ServerResult start() { | ||
29 | + Spider.create(naturePhysicsProcessor) | ||
30 | + .addUrl("https://www.nature.com/nphys/") | ||
31 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
32 | + .addPipeline(articlePipeline) | ||
33 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | ||
34 | + .thread(20).run(); | ||
35 | + return ServerResult.success(); | ||
36 | + } | ||
37 | +} | ||
0 | \ No newline at end of file | 38 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -18,6 +18,13 @@ public enum ArticleTypeEnum { | @@ -18,6 +18,13 @@ public enum ArticleTypeEnum { | ||
18 | SCIENCE("science", "science网址"), | 18 | SCIENCE("science", "science网址"), |
19 | SCIENCE_SPJ("science-spj", "science网址-spj"), | 19 | SCIENCE_SPJ("science-spj", "science网址-spj"), |
20 | UNIVIE_PHYSNANO("univie-physnano", "univie网址-physnano"), | 20 | UNIVIE_PHYSNANO("univie-physnano", "univie网址-physnano"), |
21 | + NATURE_MATERIAL("nuture-material","nuture网站-material"), | ||
22 | + NATURE_NANOTECHNOLOGY("nature-nanotechnology","nuture网站-nanotechnology"), | ||
23 | + NATURE_PHYSICS("nature-physics","nuture网站-physics"), | ||
24 | + NATURE_ENERGY("nature-energy","nuture网站-energy"), | ||
25 | + NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), | ||
26 | + NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), | ||
27 | + NATURE_METHODS("nature-methods","nuture网站-methods"), | ||
21 | ; | 28 | ; |
22 | private String type; | 29 | private String type; |
23 | private String desc; | 30 | private String desc; |
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import us.codecraft.webmagic.Page; | ||
4 | +import us.codecraft.webmagic.Site; | ||
5 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
6 | + /** Placeholder page processor: process() has an empty body and getSite() defers to the PageProcessor default. */ | ||
7 | +public class AdvancedEnergyMaterialPcoessor implements PageProcessor { | ||
8 | + @Override | ||
9 | + public void process(Page page) { | ||
10 | + /* No extraction logic is implemented here. */ | ||
11 | + } | ||
12 | + | ||
13 | + @Override | ||
14 | + public Site getSite() { | ||
15 | + return PageProcessor.super.getSite(); | ||
16 | + } | ||
17 | + | ||
18 | + | ||
19 | +} |
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 | package com.canrd.webmagic.processor; | 1 | package com.canrd.webmagic.processor; |
2 | 2 | ||
3 | +import com.canrd.webmagic.DNS.DnsResolver; | ||
3 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | 4 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
5 | +import com.gargoylesoftware.htmlunit.AbstractPage; | ||
4 | import us.codecraft.webmagic.Page; | 6 | import us.codecraft.webmagic.Page; |
7 | +import us.codecraft.webmagic.Request; | ||
5 | import us.codecraft.webmagic.Site; | 8 | import us.codecraft.webmagic.Site; |
6 | import us.codecraft.webmagic.Spider; | 9 | import us.codecraft.webmagic.Spider; |
7 | import us.codecraft.webmagic.processor.PageProcessor; | 10 | import us.codecraft.webmagic.processor.PageProcessor; |
8 | import us.codecraft.webmagic.selector.Html; | 11 | import us.codecraft.webmagic.selector.Html; |
12 | +import us.codecraft.webmagic.selector.PlainText; | ||
9 | 13 | ||
10 | public class ChemicalPagePcoessor implements PageProcessor { | 14 | public class ChemicalPagePcoessor implements PageProcessor { |
11 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | 15 | + |
16 | + private Request request; | ||
17 | + String domain = new PlainText(request.getUrl()).regex("//(.*?)/").get(); | ||
18 | + | ||
19 | + String ip = DnsResolver.resolve(domain); | ||
20 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setDomain(domain).addCookie("ip", ip);; | ||
12 | @Override | 21 | @Override |
13 | public void process(Page page) { | 22 | public void process(Page page) { |
14 | 23 |
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java renamed to src/main/java/com/canrd/webmagic/processor/MatterPagePcoessor.java
1 | package com.canrd.webmagic.processor; | 1 | package com.canrd.webmagic.processor; |
2 | 2 | ||
3 | +import com.canrd.webmagic.processor.config.Agent; | ||
3 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | 4 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
4 | import org.springframework.stereotype.Component; | 5 | import org.springframework.stereotype.Component; |
6 | +import lombok.extern.slf4j.Slf4j; | ||
5 | import us.codecraft.webmagic.Page; | 7 | import us.codecraft.webmagic.Page; |
6 | import us.codecraft.webmagic.Site; | 8 | import us.codecraft.webmagic.Site; |
7 | import us.codecraft.webmagic.Spider; | 9 | import us.codecraft.webmagic.Spider; |
8 | import us.codecraft.webmagic.processor.PageProcessor; | 10 | import us.codecraft.webmagic.processor.PageProcessor; |
9 | import us.codecraft.webmagic.selector.Html; | 11 | import us.codecraft.webmagic.selector.Html; |
12 | + | ||
10 | @Component | 13 | @Component |
11 | -public class MatterPragePcoessor implements PageProcessor { | ||
12 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | 14 | +@Slf4j |
15 | +public class MatterPagePcoessor implements PageProcessor { | ||
16 | + | ||
17 | + private Site site = Site.me() | ||
18 | + .setRetryTimes(3) | ||
19 | + .setSleepTime(5) | ||
20 | + .setUserAgent(Agent.getRandom()); | ||
21 | + | ||
13 | @Override | 22 | @Override |
14 | public void process(Page page) { | 23 | public void process(Page page) { |
24 | + //首页 | ||
25 | + if (page.getUrl().get().equals("https://www.cell.com/matter/home")){ | ||
26 | + | ||
27 | + } | ||
28 | + //搜索页 | ||
29 | + else if (page.getUrl().get().contains("https://www.cell.com/action/doSearch?")) { | ||
15 | 30 | ||
31 | + } | ||
32 | + //详情页 | ||
33 | + else if (page.getUrl().get().contains("https://www.cell.com/matter/fulltext/")) { | ||
34 | + doArticleContent(page); | ||
35 | + } | ||
16 | } | 36 | } |
17 | 37 | ||
18 | @Override | 38 | @Override |
19 | public Site getSite() { | 39 | public Site getSite() { |
20 | - return PageProcessor.super.getSite(); | 40 | + return this.site; |
21 | } | 41 | } |
22 | 42 | ||
23 | - public void doArticleContent(Page page){ | 43 | + public void doArticleContent(Page page) { |
24 | Html html = page.getHtml(); | 44 | Html html = page.getHtml(); |
45 | + log.info(String.valueOf(html)); | ||
25 | String articleCode = page.getUrl().get(); | 46 | String articleCode = page.getUrl().get(); |
26 | // html.xpath() | 47 | // html.xpath() |
27 | } | 48 | } |
28 | 49 | ||
29 | public static void main(String[] args) { | 50 | public static void main(String[] args) { |
30 | // 创建一个Spider,并把我们的处理器放进去 | 51 | // 创建一个Spider,并把我们的处理器放进去 |
31 | - Spider.create(new MatterPragePcoessor()) | 52 | + Spider.create(new MatterPagePcoessor()) |
32 | // 添加这个Spider要爬取的网页地址 | 53 | // 添加这个Spider要爬取的网页地址 |
33 | - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | 54 | + .addUrl("https://www.cell.com/matter/home") |
34 | .addPipeline(new ArticlePipeline()) | 55 | .addPipeline(new ArticlePipeline()) |
35 | // 开启5个线程执行,并开始爬取 | 56 | // 开启5个线程执行,并开始爬取 |
36 | - .thread(5).run(); | 57 | + .thread(1).run(); |
37 | } | 58 | } |
38 | } | 59 | } |
src/main/java/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import lombok.extern.slf4j.Slf4j; | ||
12 | +import org.springframework.stereotype.Component; | ||
13 | +import us.codecraft.webmagic.Page; | ||
14 | +import us.codecraft.webmagic.Site; | ||
15 | +import us.codecraft.webmagic.Spider; | ||
16 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
17 | +import us.codecraft.webmagic.selector.Html; | ||
18 | +import us.codecraft.webmagic.selector.Selectable; | ||
19 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
20 | + | ||
21 | +import java.text.ParseException; | ||
22 | +import java.text.SimpleDateFormat; | ||
23 | +import java.util.*; | ||
24 | +import java.util.stream.Collectors; | ||
25 | + | ||
26 | +@Slf4j | ||
27 | +@Component | ||
28 | +public class NatureCommunicatiosPcoessor implements PageProcessor { | ||
29 | + /** Routes a downloaded page by URL: the seed listing to getIndex, paginated listings to everyPage, and any /articles/ page to doArticleContent. */ | ||
30 | + @Override | ||
31 | + public void process(Page page) { | ||
32 | + if (page.getUrl().get().equals("https://www.nature.com/ncomms/articles?type=editorial")){ | ||
33 | + getIndex(page); | ||
34 | + } else if (page.getUrl().get().contains("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=")) { | ||
35 | + everyPage(page); | ||
36 | + }else if (page.getUrl().get().contains("https://www.nature.com/articles/")){ /* was pinned to one article id, so no other queued article was ever parsed */ | ||
37 | + doArticleContent(page); | ||
38 | + } | ||
39 | + } | ||
40 | + /** Returns the PageProcessor default Site; this class declares no Site configuration of its own. */ | ||
41 | + @Override | ||
42 | + public Site getSite() { | ||
43 | + return PageProcessor.super.getSite(); | ||
44 | + } | ||
45 | + /** Reads the page count from the fifth pager entry of the seed listing and queues listing pages 1..count; queues nothing when the pager is missing or not numeric. */ | ||
46 | + public void getIndex(Page page){ | ||
47 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[5]/a/text()").get(); | ||
48 | + log.info(maxIndex); | ||
49 | + if (maxIndex == null || !maxIndex.trim().matches("\\d{1,9}")) { return; } /* avoids NullPointerException / NumberFormatException on an unexpected pager */ | ||
50 | + int number = Integer.parseInt(maxIndex.trim()); | ||
51 | + /* The listing URL takes the page number as its trailing "page" query parameter. */ | ||
52 | + for (int i = 1; i <= number; i++) { | ||
53 | + log.info(String.valueOf(i)); | ||
54 | + page.addTargetRequest("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page="+i); | ||
55 | + } | ||
56 | + } | ||
57 | + /** Queues every article link found on a listing page, prefixing each href with the nature.com origin. */ | ||
58 | + public void everyPage(Page page){ | ||
59 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
60 | + for (int i = 0; i < all.size(); i++) { /* zero-based: the old 1..size loop skipped the first link and overran the list */ | ||
61 | + log.info(String.valueOf(i)); | ||
62 | + page.addTargetRequest("https://www.nature.com"+all.get(i)); | ||
63 | + } | ||
64 | + } | ||
65 | + /** Extracts title, first abstract paragraph, publish date, authors, affiliations, references and corresponding-author e-mails from an article page and stores them as an ArticleDO under the "article" field. */ | ||
66 | + private void doArticleContent(Page page) { | ||
67 | + Html html = page.getHtml(); | ||
68 | + String articleCode = page.getUrl().get(); | ||
69 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
70 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
71 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
72 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
73 | + /* Title: try the nested div/h1 first, then any h1 inside the header. */ | ||
74 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
75 | + if (StringUtils.isBlank(title)) { | ||
76 | + title = headSelectable.xpath("//h1/text()").get(); | ||
77 | + } | ||
78 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
79 | + String publishTime; | ||
80 | + Date publishTimeDateTime = null; | ||
81 | + try { | ||
82 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
83 | + } catch (Exception e) { | ||
84 | + try { | ||
85 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
86 | + } catch (Exception e1) { | ||
87 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
88 | + } | ||
89 | + } | ||
90 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
91 | + /* A missing <time> node yields null; parse(null) would throw NullPointerException, which the catch below does not cover. */ | ||
92 | + try { | ||
93 | + if (publishTime != null) { publishTimeDateTime = formatter.parse(publishTime); } | ||
94 | + } catch (ParseException e) { | ||
95 | + log.warn("unparseable publish time: {}", publishTime, e); | ||
96 | + } | ||
97 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
98 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
99 | + StringBuffer authorName = new StringBuffer(); | ||
100 | + for (Selectable node : authorNodes) { | ||
101 | + authorName.append(node.xpath("//a/text()")); | ||
102 | + } | ||
103 | + /* Affiliations: one {address, authorNames} object per list item. */ | ||
104 | + JSONArray authorAddress = new JSONArray(); | ||
105 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
106 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
107 | + for (Selectable selectable : authorAddressList) { | ||
108 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
109 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
110 | + JSONObject object = new JSONObject(); | ||
111 | + object.put("address", address); | ||
112 | + object.put("authorNames", authorNames); | ||
113 | + authorAddress.add(object); | ||
114 | + } | ||
115 | + } | ||
116 | + /* References: one {referenceTitle, links} object per reference item. */ | ||
117 | + JSONArray references = new JSONArray(); | ||
118 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
119 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
120 | + for (Selectable reference : referenceList) { | ||
121 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
122 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
123 | + List<String> links = new ArrayList<>(); | ||
124 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
125 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
126 | + } | ||
127 | + JSONObject object = new JSONObject(); | ||
128 | + object.put("referenceTitle", referenceTitle); | ||
129 | + object.put("links", links); | ||
130 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
131 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
132 | +// } | ||
133 | + references.add(object); | ||
134 | + } | ||
135 | + } | ||
136 | + /* Corresponding authors: the e-mail is whatever follows the last ':' of the anchor href (e.g. a mailto: link). */ | ||
137 | + JSONArray authorEmail = new JSONArray(); | ||
138 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
139 | + String href = authorEmailSelectable.xpath("//a").links().get(); | ||
140 | + String email = Objects.isNull(href) ? "" : href.substring(href.lastIndexOf(':') + 1); /* an anchor without href no longer throws */ | ||
141 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
142 | + JSONObject jsonObject = new JSONObject(); | ||
143 | + jsonObject.put("authorEmailName", authorEmailName); | ||
144 | + jsonObject.put("email", email); | ||
145 | + authorEmail.add(jsonObject); | ||
146 | + } | ||
147 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString()); | ||
148 | + | ||
149 | + page.putField("article", ArticleDO.builder() | ||
150 | + .articleType(ArticleTypeEnum.NATURE_COMMUNICATIONS.getType()) | ||
151 | + .articleCode(articleCode) | ||
152 | + .authorName(authorName.toString()) | ||
153 | + .title(title) | ||
154 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
155 | + .emailInfo(authorEmail.toJSONString()) | ||
156 | + .articleDesc(articleDesc) | ||
157 | + .authorAddress(authorAddress.toJSONString()) | ||
158 | + .referenceInfo(references.toJSONString()).build()); | ||
159 | + } | ||
160 | + | ||
161 | + public static void main(String[] args) { | ||
162 | + Spider.create(new MatterPagePcoessor()) | ||
163 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
164 | + .addPipeline(new ArticlePipeline()) | ||
165 | + .thread(1).run(); | ||
166 | + } | ||
167 | +} |
src/main/java/com/canrd/webmagic/processor/NatureComputationalPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import lombok.extern.slf4j.Slf4j; | ||
12 | +import org.springframework.stereotype.Component; | ||
13 | +import us.codecraft.webmagic.Page; | ||
14 | +import us.codecraft.webmagic.Site; | ||
15 | +import us.codecraft.webmagic.Spider; | ||
16 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
17 | +import us.codecraft.webmagic.selector.Html; | ||
18 | +import us.codecraft.webmagic.selector.Selectable; | ||
19 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
20 | + | ||
21 | +import java.text.ParseException; | ||
22 | +import java.text.SimpleDateFormat; | ||
23 | +import java.util.*; | ||
24 | +import java.util.stream.Collectors; | ||
25 | + | ||
26 | +@Slf4j | ||
27 | +@Component | ||
28 | +public class NatureComputationalPcoessor implements PageProcessor{ | ||
29 | + @Override | ||
30 | + public void process(Page page) { | ||
31 | + String url = page.getUrl().get(); | ||
32 | + if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){ | ||
33 | + everyPage(page); | ||
34 | + } else if (url.contains("https://www.nature.com/")){ | ||
35 | + doArticleContent(page); | ||
36 | + } | ||
37 | + } | ||
38 | + | ||
39 | + @Override | ||
40 | + public Site getSite() { | ||
41 | + return PageProcessor.super.getSite(); | ||
42 | + } | ||
43 | + | ||
44 | + | ||
45 | + public void everyPage(Page page){ | ||
46 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
47 | + for (int i = 0; i < all.size(); i++) { | ||
48 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | ||
49 | + } | ||
50 | + } | ||
51 | + | ||
52 | + private void doArticleContent(Page page) { | ||
53 | + Html html = page.getHtml(); | ||
54 | + String articleCode = page.getUrl().get(); | ||
55 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
56 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
57 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
58 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
59 | + | ||
60 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
61 | + if (StringUtils.isBlank(title)) { | ||
62 | + title = headSelectable.xpath("//h1/text()").get(); | ||
63 | + } | ||
64 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
65 | + String publishTime; | ||
66 | + Date publishTimeDateTime = null; | ||
67 | + try { | ||
68 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
69 | + } catch (Exception e) { | ||
70 | + try { | ||
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
72 | + } catch (Exception e1) { | ||
73 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
74 | + } | ||
75 | + } | ||
76 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
77 | + | ||
78 | + try { | ||
79 | + publishTimeDateTime = formatter.parse(publishTime); | ||
80 | + } catch (ParseException e) { | ||
81 | + e.printStackTrace(); | ||
82 | + } | ||
83 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
84 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
85 | + StringBuffer authorName = new StringBuffer(); | ||
86 | + for (Selectable node : authorNodes) { | ||
87 | + authorName.append(node.xpath("//a/text()")); | ||
88 | + } | ||
89 | + | ||
90 | + JSONArray authorAddress = new JSONArray(); | ||
91 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
92 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
93 | + for (Selectable selectable : authorAddressList) { | ||
94 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
95 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
96 | + JSONObject object = new JSONObject(); | ||
97 | + object.put("address", address); | ||
98 | + object.put("authorNames", authorNames); | ||
99 | + authorAddress.add(object); | ||
100 | + } | ||
101 | + } | ||
102 | + | ||
103 | + JSONArray references = new JSONArray(); | ||
104 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
105 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
106 | + for (Selectable reference : referenceList) { | ||
107 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
108 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
109 | + List<String> links = new ArrayList<>(); | ||
110 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
111 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
112 | + } | ||
113 | + JSONObject object = new JSONObject(); | ||
114 | + object.put("referenceTitle", referenceTitle); | ||
115 | + object.put("links", links); | ||
116 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
117 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
118 | +// } | ||
119 | + references.add(object); | ||
120 | + } | ||
121 | + } | ||
122 | + | ||
123 | + JSONArray authorEmail = new JSONArray(); | ||
124 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
125 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
126 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
127 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
128 | + JSONObject jsonObject = new JSONObject(); | ||
129 | + jsonObject.put("authorEmailName", authorEmailName); | ||
130 | + jsonObject.put("email", email); | ||
131 | + authorEmail.add(jsonObject); | ||
132 | + } | ||
133 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
134 | + | ||
135 | + page.putField("article", ArticleDO.builder() | ||
136 | + .articleType(ArticleTypeEnum.NATURE_COMPUTATIONAL_SCIENCE.getType()) | ||
137 | + .articleCode(articleCode) | ||
138 | + .authorName(authorName.toString()) | ||
139 | + .title(title) | ||
140 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
141 | + .emailInfo(authorEmail.toJSONString()) | ||
142 | + .articleDesc(articleDesc) | ||
143 | + .authorAddress(authorAddress.toJSONString()) | ||
144 | + .referenceInfo(references.toJSONString()).build()); | ||
145 | + } | ||
146 | + | ||
147 | + public static void main(String[] args) { | ||
148 | + Spider.create(new MatterPagePcoessor()) | ||
149 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
150 | + .addPipeline(new ArticlePipeline()) | ||
151 | + .thread(1).run(); | ||
152 | + } | ||
153 | +} |
src/main/java/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | ||
11 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
12 | +import lombok.extern.slf4j.Slf4j; | ||
13 | +import org.springframework.stereotype.Component; | ||
14 | +import us.codecraft.webmagic.Page; | ||
15 | +import us.codecraft.webmagic.Site; | ||
16 | +import us.codecraft.webmagic.Spider; | ||
17 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
18 | +import us.codecraft.webmagic.selector.Html; | ||
19 | +import us.codecraft.webmagic.selector.Selectable; | ||
20 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
21 | + | ||
22 | +import java.text.ParseException; | ||
23 | +import java.text.SimpleDateFormat; | ||
24 | +import java.util.*; | ||
25 | +import java.util.stream.Collectors; | ||
26 | + | ||
27 | +@Component | ||
28 | +@Slf4j | ||
29 | +public class NatureEnergyPagePcoessor implements PageProcessor { | ||
30 | + @Override | ||
31 | + public void process(Page page) { | ||
32 | + String url = page.getUrl().get(); | ||
33 | + if (url.equals("https://www.nature.com/nenergy/research-articles")){ | ||
34 | + getIndex(page); | ||
35 | + } else if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){ | ||
36 | + everyPage(page); | ||
37 | + } else if (url.contains("https://www.nature.com/")){ | ||
38 | + doArticleContent(page); | ||
39 | + } | ||
40 | + } | ||
41 | + | ||
42 | + @Override | ||
43 | + public Site getSite() { | ||
44 | + return PageProcessor.super.getSite(); | ||
45 | + } | ||
46 | + | ||
47 | + public void getIndex(Page page){ | ||
48 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | ||
49 | + log.info(maxIndex); | ||
50 | + String trim = maxIndex.trim(); | ||
51 | + int number = Integer.parseInt(trim); | ||
52 | + System.out.printf("", number); | ||
53 | + for (int i = 0; i < number; i++) { | ||
54 | + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | ||
55 | + } | ||
56 | + } | ||
57 | + | ||
58 | + public void everyPage(Page page){ | ||
59 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
60 | + for (int i = 0; i < all.size(); i++) { | ||
61 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | ||
62 | + } | ||
63 | + } | ||
64 | + | ||
65 | + private void doArticleContent(Page page) { | ||
66 | + Html html = page.getHtml(); | ||
67 | + String articleCode = page.getUrl().get(); | ||
68 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
69 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
70 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
71 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
72 | + | ||
73 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
74 | + if (StringUtils.isBlank(title)) { | ||
75 | + title = headSelectable.xpath("//h1/text()").get(); | ||
76 | + } | ||
77 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
78 | + String publishTime; | ||
79 | + Date publishTimeDateTime = null; | ||
80 | + try { | ||
81 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
82 | + } catch (Exception e) { | ||
83 | + try { | ||
84 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
85 | + } catch (Exception e1) { | ||
86 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
87 | + } | ||
88 | + } | ||
89 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
90 | + | ||
91 | + try { | ||
92 | + publishTimeDateTime = formatter.parse(publishTime); | ||
93 | + } catch (ParseException e) { | ||
94 | + e.printStackTrace(); | ||
95 | + } | ||
96 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
97 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
98 | + StringBuffer authorName = new StringBuffer(); | ||
99 | + for (Selectable node : authorNodes) { | ||
100 | + authorName.append(node.xpath("//a/text()")); | ||
101 | + } | ||
102 | + | ||
103 | + JSONArray authorAddress = new JSONArray(); | ||
104 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
105 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
106 | + for (Selectable selectable : authorAddressList) { | ||
107 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
108 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
109 | + JSONObject object = new JSONObject(); | ||
110 | + object.put("address", address); | ||
111 | + object.put("authorNames", authorNames); | ||
112 | + authorAddress.add(object); | ||
113 | + } | ||
114 | + } | ||
115 | + | ||
116 | + JSONArray references = new JSONArray(); | ||
117 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
118 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
119 | + for (Selectable reference : referenceList) { | ||
120 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
121 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
122 | + List<String> links = new ArrayList<>(); | ||
123 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
124 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
125 | + } | ||
126 | + JSONObject object = new JSONObject(); | ||
127 | + object.put("referenceTitle", referenceTitle); | ||
128 | + object.put("links", links); | ||
129 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
130 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
131 | +// } | ||
132 | + references.add(object); | ||
133 | + } | ||
134 | + } | ||
135 | + | ||
136 | + JSONArray authorEmail = new JSONArray(); | ||
137 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
138 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
139 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
140 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
141 | + JSONObject jsonObject = new JSONObject(); | ||
142 | + jsonObject.put("authorEmailName", authorEmailName); | ||
143 | + jsonObject.put("email", email); | ||
144 | + authorEmail.add(jsonObject); | ||
145 | + } | ||
146 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
147 | + | ||
148 | + page.putField("article", ArticleDO.builder() | ||
149 | + .articleType(ArticleTypeEnum.NATURE_ENERGY.getType()) | ||
150 | + .articleCode(articleCode) | ||
151 | + .authorName(authorName.toString()) | ||
152 | + .title(title) | ||
153 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
154 | + .emailInfo(authorEmail.toJSONString()) | ||
155 | + .articleDesc(articleDesc) | ||
156 | + .authorAddress(authorAddress.toJSONString()) | ||
157 | + .referenceInfo(references.toJSONString()).build()); | ||
158 | + } | ||
159 | + | ||
160 | + public static void main(String[] args) { | ||
161 | + Spider.create(new MatterPagePcoessor()) | ||
162 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
163 | + .addPipeline(new ArticlePipeline()) | ||
164 | + .thread(1).run(); | ||
165 | + } | ||
166 | +} |
src/main/java/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import lombok.extern.slf4j.Slf4j; | ||
12 | +import org.springframework.stereotype.Component; | ||
13 | +import us.codecraft.webmagic.Page; | ||
14 | +import us.codecraft.webmagic.Site; | ||
15 | +import us.codecraft.webmagic.Spider; | ||
16 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
17 | +import us.codecraft.webmagic.selector.Html; | ||
18 | +import us.codecraft.webmagic.selector.Selectable; | ||
19 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
20 | + | ||
21 | +import java.text.ParseException; | ||
22 | +import java.text.SimpleDateFormat; | ||
23 | +import java.util.*; | ||
24 | +import java.util.stream.Collectors; | ||
25 | + | ||
26 | +@Component | ||
27 | +@Slf4j | ||
28 | +public class NatureMaterialPagePcoessor implements PageProcessor { | ||
29 | + @Override | ||
30 | + public void process(Page page) { | ||
31 | + String url = page.getUrl().get(); | ||
32 | + if (url.equals("https://www.nature.com/nmat/articles")){ | ||
33 | + getIndex(page); | ||
34 | + } else if (url.contains("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=")){ | ||
35 | + everyPage(page); | ||
36 | + } else if (url.equals("https://www.nature.com/search?q=battery")) { | ||
37 | + getIndex(page); | ||
38 | + } else if (url.contains("https://www.nature.com/search?q=battery&page=")) { | ||
39 | + everyPage(page); | ||
40 | + } else if (url.contains("https://www.nature.com/articles")){ | ||
41 | + doArticleContent(page); | ||
42 | + } else if (url.equals("https://www.nature.com/nature/research-articles")) { | ||
43 | + getIndex(page); | ||
44 | + } else if (url.contains("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=")) { | ||
45 | + everyPage(page); | ||
46 | + } else if (url.equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")) { | ||
47 | + getIndex(page); | ||
48 | + }else if (url.equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")) { | ||
49 | + getIndex(page); | ||
50 | + }else if (url.contains("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page=")) { | ||
51 | + everyPage(page); | ||
52 | + }else if (url.contains("https://www.nature.com/search?q=batteries&journal=nmat&page=")) { | ||
53 | + everyPage(page); | ||
54 | + } | ||
55 | + } | ||
56 | + | ||
57 | + @Override | ||
58 | + public Site getSite() { | ||
59 | + return PageProcessor.super.getSite().setRetryTimes(3).setSleepTime(100); | ||
60 | + } | ||
61 | + | ||
62 | + public void getIndex(Page page){ | ||
63 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | ||
64 | + log.info(maxIndex); | ||
65 | + String trim = maxIndex.trim(); | ||
66 | + int number = Integer.parseInt(trim); | ||
67 | + if (page.getUrl().get().equals("https://www.nature.com/nmat/articles")){ | ||
68 | + for (int i = 1; i <= number; i++) { | ||
69 | + page.addTargetRequest("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page="+i); | ||
70 | + } | ||
71 | + } | ||
72 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery")){ | ||
73 | + for (int i = 1; i <= number; i++) { | ||
74 | + page.addTargetRequest("https://www.nature.com/search?q=battery&page="+i); | ||
75 | + } | ||
76 | + } | ||
77 | + if (page.getUrl().get().equals("https://www.nature.com/nature/research-articles")){ | ||
78 | + for (int i = 1; i <= number; i++) { | ||
79 | + page.addTargetRequest("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | ||
80 | + } | ||
81 | + } | ||
82 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")){ | ||
83 | + for (int i = 1; i <= number; i++) { | ||
84 | + page.addTargetRequest("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page="+i); | ||
85 | + } | ||
86 | + } | ||
87 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")){ | ||
88 | + for (int i = 1; i <= number; i++) { | ||
89 | + page.addTargetRequest("https://www.nature.com/search?q=batteries&journal=nmat&page="+i); | ||
90 | + } | ||
91 | + } | ||
92 | + } | ||
93 | + | ||
94 | + public void everyPage(Page page){ | ||
95 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
96 | + for (int i = 0; i < all.size(); i++) { | ||
97 | +// log.info(all.get(i)); | ||
98 | + page.addTargetRequest("https://www.nature.com"+all.get(i)); | ||
99 | + } | ||
100 | + } | ||
101 | + | ||
102 | + private void doArticleContent(Page page) { | ||
103 | + if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) { | ||
104 | + return; | ||
105 | + } | ||
106 | + //解析页面 | ||
107 | + Html html = page.getHtml(); | ||
108 | + String articleCode = page.getUrl().get(); | ||
109 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
110 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
111 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
112 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
113 | + | ||
114 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
115 | + if (StringUtils.isBlank(title)) { | ||
116 | + title = headSelectable.xpath("//h1/text()").get(); | ||
117 | + } | ||
118 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
119 | + String publishTime; | ||
120 | + Date publishTimeDateTime = null; | ||
121 | + try { | ||
122 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
123 | + } catch (Exception e) { | ||
124 | + try { | ||
125 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
126 | + } catch (Exception e1) { | ||
127 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
128 | + } | ||
129 | + } | ||
130 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
131 | + | ||
132 | + try { | ||
133 | + publishTimeDateTime = formatter.parse(publishTime); | ||
134 | + } catch (ParseException e) { | ||
135 | + e.printStackTrace(); | ||
136 | + } | ||
137 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
138 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
139 | + StringBuffer authorName = new StringBuffer(); | ||
140 | + for (Selectable node : authorNodes) { | ||
141 | + authorName.append(node.xpath("//a/text()")); | ||
142 | + } | ||
143 | + | ||
144 | + JSONArray authorAddress = new JSONArray(); | ||
145 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
146 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
147 | + for (Selectable selectable : authorAddressList) { | ||
148 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
149 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
150 | + JSONObject object = new JSONObject(); | ||
151 | + object.put("address", address); | ||
152 | + object.put("authorNames", authorNames); | ||
153 | + authorAddress.add(object); | ||
154 | + } | ||
155 | + } | ||
156 | + | ||
157 | + JSONArray references = new JSONArray(); | ||
158 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
159 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
160 | + for (Selectable reference : referenceList) { | ||
161 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
162 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
163 | + List<String> links = new ArrayList<>(); | ||
164 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
165 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
166 | + } | ||
167 | + JSONObject object = new JSONObject(); | ||
168 | + object.put("referenceTitle", referenceTitle); | ||
169 | + object.put("links", links); | ||
170 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
171 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
172 | +// } | ||
173 | + references.add(object); | ||
174 | + } | ||
175 | + } | ||
176 | + | ||
177 | + JSONArray authorEmail = new JSONArray(); | ||
178 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
179 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
180 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
181 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
182 | + JSONObject jsonObject = new JSONObject(); | ||
183 | + jsonObject.put("authorEmailName", authorEmailName); | ||
184 | + jsonObject.put("email", email); | ||
185 | + authorEmail.add(jsonObject); | ||
186 | + } | ||
187 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
188 | + | ||
189 | + page.putField("article", ArticleDO.builder() | ||
190 | + .articleType(ArticleTypeEnum.NATURE_MATERIAL.getType()) | ||
191 | + .articleCode(articleCode) | ||
192 | + .authorName(authorName.toString()) | ||
193 | + .title(title) | ||
194 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
195 | + .emailInfo(authorEmail.toJSONString()) | ||
196 | + .articleDesc(articleDesc) | ||
197 | + .authorAddress(authorAddress.toJSONString()) | ||
198 | + .referenceInfo(references.toJSONString()).build()); | ||
199 | + } | ||
200 | + | ||
201 | + public static void main(String[] args) { | ||
202 | + Spider.create(new MatterPagePcoessor()) | ||
203 | + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=1") | ||
204 | + .addPipeline(new ArticlePipeline()) | ||
205 | + .thread(1).run(); | ||
206 | + } | ||
207 | +} |
src/main/java/com/canrd/webmagic/processor/NatureMethodsPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import lombok.extern.slf4j.Slf4j; | ||
12 | +import org.springframework.stereotype.Component; | ||
13 | +import us.codecraft.webmagic.Page; | ||
14 | +import us.codecraft.webmagic.Site; | ||
15 | +import us.codecraft.webmagic.Spider; | ||
16 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
17 | +import us.codecraft.webmagic.selector.Html; | ||
18 | +import us.codecraft.webmagic.selector.Selectable; | ||
19 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
20 | + | ||
21 | +import java.text.ParseException; | ||
22 | +import java.text.SimpleDateFormat; | ||
23 | +import java.util.*; | ||
24 | +import java.util.stream.Collectors; | ||
25 | + | ||
26 | +@Slf4j | ||
27 | +@Component | ||
28 | +public class NatureMethodsPcoessor implements PageProcessor { | ||
29 | + @Override | ||
30 | + public void process(Page page) { | ||
31 | + String url = page.getUrl().get(); | ||
32 | + if (url.equals("https://www.nature.com/nmeth/")) { | ||
33 | + everyPage(page); | ||
34 | + }else if (url.contains("https://www.nature.com/")){ | ||
35 | + doArticleContent(page); | ||
36 | + } | ||
37 | + } | ||
38 | + | ||
39 | + @Override | ||
40 | + public Site getSite() { | ||
41 | + return PageProcessor.super.getSite(); | ||
42 | + } | ||
43 | + public void everyPage(Page page){ | ||
44 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
45 | + for (int i = 0; i < all.size(); i++) { | ||
46 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | ||
47 | + } | ||
48 | + } | ||
49 | + | ||
50 | + private void doArticleContent(Page page) { | ||
51 | + Html html = page.getHtml(); | ||
52 | + String articleCode = page.getUrl().get(); | ||
53 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
54 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
55 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
56 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
57 | + | ||
58 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
59 | + if (StringUtils.isBlank(title)) { | ||
60 | + title = headSelectable.xpath("//h1/text()").get(); | ||
61 | + } | ||
62 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
63 | + String publishTime; | ||
64 | + Date publishTimeDateTime = null; | ||
65 | + try { | ||
66 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
67 | + } catch (Exception e) { | ||
68 | + try { | ||
69 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
70 | + } catch (Exception e1) { | ||
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
72 | + } | ||
73 | + } | ||
74 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
75 | + | ||
76 | + try { | ||
77 | + publishTimeDateTime = formatter.parse(publishTime); | ||
78 | + } catch (ParseException e) { | ||
79 | + e.printStackTrace(); | ||
80 | + } | ||
81 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
82 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
83 | + StringBuffer authorName = new StringBuffer(); | ||
84 | + for (Selectable node : authorNodes) { | ||
85 | + authorName.append(node.xpath("//a/text()")); | ||
86 | + } | ||
87 | + | ||
88 | + JSONArray authorAddress = new JSONArray(); | ||
89 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
90 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
91 | + for (Selectable selectable : authorAddressList) { | ||
92 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
93 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
94 | + JSONObject object = new JSONObject(); | ||
95 | + object.put("address", address); | ||
96 | + object.put("authorNames", authorNames); | ||
97 | + authorAddress.add(object); | ||
98 | + } | ||
99 | + } | ||
100 | + | ||
101 | + JSONArray references = new JSONArray(); | ||
102 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
103 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
104 | + for (Selectable reference : referenceList) { | ||
105 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
106 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
107 | + List<String> links = new ArrayList<>(); | ||
108 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
109 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
110 | + } | ||
111 | + JSONObject object = new JSONObject(); | ||
112 | + object.put("referenceTitle", referenceTitle); | ||
113 | + object.put("links", links); | ||
114 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
115 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
116 | +// } | ||
117 | + references.add(object); | ||
118 | + } | ||
119 | + } | ||
120 | + | ||
121 | + JSONArray authorEmail = new JSONArray(); | ||
122 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
123 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
124 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
125 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
126 | + JSONObject jsonObject = new JSONObject(); | ||
127 | + jsonObject.put("authorEmailName", authorEmailName); | ||
128 | + jsonObject.put("email", email); | ||
129 | + authorEmail.add(jsonObject); | ||
130 | + } | ||
131 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
132 | + | ||
133 | + page.putField("article", ArticleDO.builder() | ||
134 | + .articleType(ArticleTypeEnum.NATURE_METHODS.getType()) | ||
135 | + .articleCode(articleCode) | ||
136 | + .authorName(authorName.toString()) | ||
137 | + .title(title) | ||
138 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
139 | + .emailInfo(authorEmail.toJSONString()) | ||
140 | + .articleDesc(articleDesc) | ||
141 | + .authorAddress(authorAddress.toJSONString()) | ||
142 | + .referenceInfo(references.toJSONString()).build()); | ||
143 | + } | ||
144 | + | ||
145 | + public static void main(String[] args) { | ||
146 | + Spider.create(new MatterPagePcoessor()) | ||
147 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
148 | + .addPipeline(new ArticlePipeline()) | ||
149 | + .thread(1).run(); | ||
150 | + } | ||
151 | +} |
src/main/java/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import lombok.extern.slf4j.Slf4j; | ||
11 | +import org.springframework.stereotype.Component; | ||
12 | +import us.codecraft.webmagic.Page; | ||
13 | +import us.codecraft.webmagic.Site; | ||
14 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
15 | +import us.codecraft.webmagic.selector.Html; | ||
16 | +import us.codecraft.webmagic.selector.Selectable; | ||
17 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
18 | + | ||
19 | +import java.text.ParseException; | ||
20 | +import java.text.SimpleDateFormat; | ||
21 | +import java.util.*; | ||
22 | +import java.util.stream.Collectors; | ||
23 | + | ||
24 | +@Component | ||
25 | +@Slf4j | ||
26 | +public class NatureNanotechnologyProcessor implements PageProcessor { | ||
27 | + | ||
28 | + //目前只有一页 | ||
29 | + @Override | ||
30 | + public void process(Page page) { | ||
31 | + String url = page.getUrl().get(); | ||
32 | + if (url.equals("https://www.nature.com/nnano/")){ | ||
33 | + everyPage(page); | ||
34 | + } else if (url.contains("https://www.nature.com/")){ | ||
35 | + doArticleContent(page); | ||
36 | + } | ||
37 | + } | ||
38 | + | ||
39 | + @Override | ||
40 | + public Site getSite() { | ||
41 | + return PageProcessor.super.getSite(); | ||
42 | + } | ||
43 | + | ||
44 | + public void getIndex(Page page){ | ||
45 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | ||
46 | + log.info(maxIndex); | ||
47 | + String trim = maxIndex.trim(); | ||
48 | + int number = Integer.parseInt(trim); | ||
49 | + System.out.printf("", number); | ||
50 | + for (int i = 0; i < number; i++) { | ||
51 | + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | ||
52 | + } | ||
53 | + } | ||
54 | + | ||
55 | + public void everyPage(Page page){ | ||
56 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
57 | + for (int i = 0; i < all.size(); i++) { | ||
58 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | ||
59 | + } | ||
60 | + } | ||
61 | + | ||
62 | + private void doArticleContent(Page page) { | ||
63 | + Html html = page.getHtml(); | ||
64 | + String articleCode = page.getUrl().get(); | ||
65 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
66 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
67 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
68 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
69 | + | ||
70 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
71 | + if (StringUtils.isBlank(title)) { | ||
72 | + title = headSelectable.xpath("//h1/text()").get(); | ||
73 | + } | ||
74 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
75 | + String publishTime; | ||
76 | + Date publishTimeDateTime = null; | ||
77 | + try { | ||
78 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
79 | + } catch (Exception e) { | ||
80 | + try { | ||
81 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
82 | + } catch (Exception e1) { | ||
83 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
84 | + } | ||
85 | + } | ||
86 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
87 | + | ||
88 | + try { | ||
89 | + publishTimeDateTime = formatter.parse(publishTime); | ||
90 | + } catch (ParseException e) { | ||
91 | + e.printStackTrace(); | ||
92 | + } | ||
93 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
94 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
95 | + StringBuffer authorName = new StringBuffer(); | ||
96 | + for (Selectable node : authorNodes) { | ||
97 | + authorName.append(node.xpath("//a/text()")); | ||
98 | + } | ||
99 | + | ||
100 | + JSONArray authorAddress = new JSONArray(); | ||
101 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
102 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
103 | + for (Selectable selectable : authorAddressList) { | ||
104 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
105 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
106 | + JSONObject object = new JSONObject(); | ||
107 | + object.put("address", address); | ||
108 | + object.put("authorNames", authorNames); | ||
109 | + authorAddress.add(object); | ||
110 | + } | ||
111 | + } | ||
112 | + | ||
113 | + JSONArray references = new JSONArray(); | ||
114 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
115 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
116 | + for (Selectable reference : referenceList) { | ||
117 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
118 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
119 | + List<String> links = new ArrayList<>(); | ||
120 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
121 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
122 | + } | ||
123 | + JSONObject object = new JSONObject(); | ||
124 | + object.put("referenceTitle", referenceTitle); | ||
125 | + object.put("links", links); | ||
126 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
127 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
128 | +// } | ||
129 | + references.add(object); | ||
130 | + } | ||
131 | + } | ||
132 | + | ||
133 | + JSONArray authorEmail = new JSONArray(); | ||
134 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
135 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
136 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
137 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
138 | + JSONObject jsonObject = new JSONObject(); | ||
139 | + jsonObject.put("authorEmailName", authorEmailName); | ||
140 | + jsonObject.put("email", email); | ||
141 | + authorEmail.add(jsonObject); | ||
142 | + } | ||
143 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
144 | + | ||
145 | + page.putField("article", ArticleDO.builder() | ||
146 | + .articleType(ArticleTypeEnum.NATURE_NANOTECHNOLOGY.getType()) | ||
147 | + .articleCode(articleCode) | ||
148 | + .authorName(authorName.toString()) | ||
149 | + .title(title) | ||
150 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
151 | + .emailInfo(authorEmail.toJSONString()) | ||
152 | + .articleDesc(articleDesc) | ||
153 | + .authorAddress(authorAddress.toJSONString()) | ||
154 | + .referenceInfo(references.toJSONString()).build()); | ||
155 | + } | ||
156 | +} |
src/main/java/com/canrd/webmagic/processor/NaturePhysicsProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
7 | +import com.canrd.webmagic.common.utils.StringUtils; | ||
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
11 | +import lombok.extern.slf4j.Slf4j; | ||
12 | +import org.springframework.stereotype.Component; | ||
13 | +import us.codecraft.webmagic.Page; | ||
14 | +import us.codecraft.webmagic.Site; | ||
15 | +import us.codecraft.webmagic.Spider; | ||
16 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
17 | +import us.codecraft.webmagic.selector.Html; | ||
18 | +import us.codecraft.webmagic.selector.Selectable; | ||
19 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
20 | + | ||
21 | +import java.text.ParseException; | ||
22 | +import java.text.SimpleDateFormat; | ||
23 | +import java.util.*; | ||
24 | +import java.util.stream.Collectors; | ||
25 | + | ||
26 | +@Slf4j | ||
27 | +@Component | ||
28 | +public class NaturePhysicsProcessor implements PageProcessor{ | ||
29 | + @Override | ||
30 | + public void process(Page page) { | ||
31 | + String url = page.getUrl().get(); | ||
32 | + if (url.equals("https://www.nature.com/nphys/")) { | ||
33 | + everyPage(page); | ||
34 | + }else if (url.contains("https://www.nature.com/")){ | ||
35 | + doArticleContent(page); | ||
36 | + } | ||
37 | + } | ||
38 | + | ||
39 | + @Override | ||
40 | + public Site getSite() { | ||
41 | + return PageProcessor.super.getSite(); | ||
42 | + } | ||
43 | + public void everyPage(Page page){ | ||
44 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | ||
45 | + for (int i = 0; i < all.size(); i++) { | ||
46 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | ||
47 | + } | ||
48 | + } | ||
49 | + | ||
50 | + private void doArticleContent(Page page) { | ||
51 | + Html html = page.getHtml(); | ||
52 | + String articleCode = page.getUrl().get(); | ||
53 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | ||
54 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | ||
55 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
56 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
57 | + | ||
58 | + String title = headSelectable.xpath("//div/h1/text()").get(); | ||
59 | + if (StringUtils.isBlank(title)) { | ||
60 | + title = headSelectable.xpath("//h1/text()").get(); | ||
61 | + } | ||
62 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
63 | + String publishTime; | ||
64 | + Date publishTimeDateTime = null; | ||
65 | + try { | ||
66 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | ||
67 | + } catch (Exception e) { | ||
68 | + try { | ||
69 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | ||
70 | + } catch (Exception e1) { | ||
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | ||
72 | + } | ||
73 | + } | ||
74 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
75 | + | ||
76 | + try { | ||
77 | + publishTimeDateTime = formatter.parse(publishTime); | ||
78 | + } catch (ParseException e) { | ||
79 | + e.printStackTrace(); | ||
80 | + } | ||
81 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | ||
82 | + List<Selectable> authorNodes = authorSelectable.nodes(); | ||
83 | + StringBuffer authorName = new StringBuffer(); | ||
84 | + for (Selectable node : authorNodes) { | ||
85 | + authorName.append(node.xpath("//a/text()")); | ||
86 | + } | ||
87 | + | ||
88 | + JSONArray authorAddress = new JSONArray(); | ||
89 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
90 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
91 | + for (Selectable selectable : authorAddressList) { | ||
92 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
93 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
94 | + JSONObject object = new JSONObject(); | ||
95 | + object.put("address", address); | ||
96 | + object.put("authorNames", authorNames); | ||
97 | + authorAddress.add(object); | ||
98 | + } | ||
99 | + } | ||
100 | + | ||
101 | + JSONArray references = new JSONArray(); | ||
102 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
103 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
104 | + for (Selectable reference : referenceList) { | ||
105 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
106 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
107 | + List<String> links = new ArrayList<>(); | ||
108 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
109 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
110 | + } | ||
111 | + JSONObject object = new JSONObject(); | ||
112 | + object.put("referenceTitle", referenceTitle); | ||
113 | + object.put("links", links); | ||
114 | +// if (CollectionUtils.isNotEmpty(links)) { | ||
115 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
116 | +// } | ||
117 | + references.add(object); | ||
118 | + } | ||
119 | + } | ||
120 | + | ||
121 | + JSONArray authorEmail = new JSONArray(); | ||
122 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
123 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | ||
124 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | ||
125 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | ||
126 | + JSONObject jsonObject = new JSONObject(); | ||
127 | + jsonObject.put("authorEmailName", authorEmailName); | ||
128 | + jsonObject.put("email", email); | ||
129 | + authorEmail.add(jsonObject); | ||
130 | + } | ||
131 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
132 | + | ||
133 | + page.putField("article", ArticleDO.builder() | ||
134 | + .articleType(ArticleTypeEnum.NATURE_PHYSICS.getType()) | ||
135 | + .articleCode(articleCode) | ||
136 | + .authorName(authorName.toString()) | ||
137 | + .title(title) | ||
138 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
139 | + .emailInfo(authorEmail.toJSONString()) | ||
140 | + .articleDesc(articleDesc) | ||
141 | + .authorAddress(authorAddress.toJSONString()) | ||
142 | + .referenceInfo(references.toJSONString()).build()); | ||
143 | + } | ||
144 | + | ||
145 | + public static void main(String[] args) { | ||
146 | + Spider.create(new MatterPagePcoessor()) | ||
147 | + .addUrl("https://www.nature.com/nenergy/research-articles") | ||
148 | + .addPipeline(new ArticlePipeline()) | ||
149 | + .thread(1).run(); | ||
150 | + } | ||
151 | +} |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -3,10 +3,9 @@ package com.canrd.webmagic.processor.download; | @@ -3,10 +3,9 @@ package com.canrd.webmagic.processor.download; | ||
3 | import com.canrd.webmagic.config.SeleniumConfig; | 3 | import com.canrd.webmagic.config.SeleniumConfig; |
4 | import com.canrd.webmagic.processor.config.Agent; | 4 | import com.canrd.webmagic.processor.config.Agent; |
5 | import lombok.extern.slf4j.Slf4j; | 5 | import lombok.extern.slf4j.Slf4j; |
6 | -import org.openqa.selenium.By; | ||
7 | -import org.openqa.selenium.Cookie; | ||
8 | -import org.openqa.selenium.WebDriver; | ||
9 | -import org.openqa.selenium.WebElement; | 6 | +import org.openqa.selenium.*; |
7 | +import org.openqa.selenium.support.ui.ExpectedConditions; | ||
8 | +import org.openqa.selenium.support.ui.WebDriverWait; | ||
10 | import org.springframework.stereotype.Component; | 9 | import org.springframework.stereotype.Component; |
11 | import us.codecraft.webmagic.Page; | 10 | import us.codecraft.webmagic.Page; |
12 | import us.codecraft.webmagic.Request; | 11 | import us.codecraft.webmagic.Request; |
@@ -28,7 +27,7 @@ import java.util.Map; | @@ -28,7 +27,7 @@ import java.util.Map; | ||
28 | @Slf4j | 27 | @Slf4j |
29 | @Component | 28 | @Component |
30 | public class SeleniumDownloader extends AbstractDownloader { | 29 | public class SeleniumDownloader extends AbstractDownloader { |
31 | - private int sleepTime = 3000; | 30 | + private int sleepTime = 3000000; |
32 | 31 | ||
33 | @Resource | 32 | @Resource |
34 | private SeleniumConfig config; | 33 | private SeleniumConfig config; |
@@ -64,8 +63,23 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -64,8 +63,23 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
64 | } | 63 | } |
65 | 64 | ||
66 | log.info("downloading page " + request.getUrl()); | 65 | log.info("downloading page " + request.getUrl()); |
67 | - | ||
68 | webDriver.get(request.getUrl()); | 66 | webDriver.get(request.getUrl()); |
67 | + if (request.getUrl().equals("https://www.cell.com/matter/home")) { | ||
68 | + WebElement searchText = webDriver.findElement(By.id("searchText")); | ||
69 | + searchText.sendKeys("Aluminum foil"); | ||
70 | + WebElement element = webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button")); | ||
71 | + element.submit(); | ||
72 | + WebDriverWait wait = new WebDriverWait(webDriver, 30); | ||
73 | + wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?")); | ||
74 | +// wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']"))); | ||
75 | +// WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input")); | ||
76 | +// if (cloudFlare!=null){ | ||
77 | +// cloudFlare.click(); | ||
78 | +// } | ||
79 | + } | ||
80 | + if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){ | ||
81 | + | ||
82 | + } | ||
69 | try { | 83 | try { |
70 | if (sleepTime > 0) { | 84 | if (sleepTime > 0) { |
71 | //休眠3秒就是为了动态的数据渲染完成后在进行获取 | 85 | //休眠3秒就是为了动态的数据渲染完成后在进行获取 |
@@ -75,6 +89,18 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -75,6 +89,18 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
75 | e.printStackTrace(); | 89 | e.printStackTrace(); |
76 | } | 90 | } |
77 | 91 | ||
92 | +// WebElement targetElement; | ||
93 | +// do { | ||
94 | +// try { | ||
95 | +// targetElement = webDriver.findElement(By.xpath("//h2[@class=\"h2\"]")); | ||
96 | +// log.info(String.valueOf(targetElement)); | ||
97 | +// log.info("等待验证中"); | ||
98 | +// Thread.sleep(sleepTime); // 等待一段时间后再检查 | ||
99 | +// } catch (NoSuchElementException e) { | ||
100 | +// targetElement = null; // 如果找不到特定元素,则退出循环 | ||
101 | +// } | ||
102 | +// } while (targetElement != null); | ||
103 | + | ||
78 | WebElement webElement = webDriver.findElement(By.xpath("/html")); | 104 | WebElement webElement = webDriver.findElement(By.xpath("/html")); |
79 | String content = webElement.getAttribute("outerHTML"); | 105 | String content = webElement.getAttribute("outerHTML"); |
80 | page.setDownloadSuccess(true); | 106 | page.setDownloadSuccess(true); |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
@@ -59,13 +59,14 @@ public class MyChromeDriver implements BrowserDriver{ | @@ -59,13 +59,14 @@ public class MyChromeDriver implements BrowserDriver{ | ||
59 | options.addArguments("blink-settings=imagesEnabled=false"); | 59 | options.addArguments("blink-settings=imagesEnabled=false"); |
60 | // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 | 60 | // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 |
61 | options.addArguments("--headless"); | 61 | options.addArguments("--headless"); |
62 | + //进入指定地址 | ||
63 | +// options.setExperimentalOption("debuggerAddress", "127.0.0.1:9222"); | ||
62 | //禁用 blink 特征 | 64 | //禁用 blink 特征 |
63 | options.addArguments("disable-blink-features=AutomationControlled"); | 65 | options.addArguments("disable-blink-features=AutomationControlled"); |
64 | options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | 66 | options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); |
65 | options.setExperimentalOption("useAutomationExtension", false); | 67 | options.setExperimentalOption("useAutomationExtension", false); |
66 | options.addArguments("--remote-allow-origins=*"); | 68 | options.addArguments("--remote-allow-origins=*"); |
67 | 69 | ||
68 | - | ||
69 | String os_name = System.getProperty("os.name"); | 70 | String os_name = System.getProperty("os.name"); |
70 | // 判断是否是windows系统 | 71 | // 判断是否是windows系统 |
71 | if (os_name.toLowerCase().startsWith("win")) { | 72 | if (os_name.toLowerCase().startsWith("win")) { |
src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
@@ -27,11 +27,11 @@ public class ArticlePipeline implements Pipeline { | @@ -27,11 +27,11 @@ public class ArticlePipeline implements Pipeline { | ||
27 | public void process(ResultItems resultItems, Task task) { | 27 | public void process(ResultItems resultItems, Task task) { |
28 | ArticleDO articleDO = resultItems.get("article"); | 28 | ArticleDO articleDO = resultItems.get("article"); |
29 | if (Objects.nonNull(articleDO)) { | 29 | if (Objects.nonNull(articleDO)) { |
30 | - List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode())); | ||
31 | - if (CollectionUtils.isNotEmpty(natureArticleDO)) { | ||
32 | - return; | ||
33 | - } | ||
34 | - articleService.save(articleDO); | 30 | + List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode())); |
31 | + if (CollectionUtils.isNotEmpty(natureArticleDO)) { | ||
32 | + return; | ||
33 | + } | ||
34 | + articleService.save(articleDO); | ||
35 | } | 35 | } |
36 | } | 36 | } |
37 | } | 37 | } |
src/main/resources/application-test.yml
@@ -59,7 +59,7 @@ spring: | @@ -59,7 +59,7 @@ spring: | ||
59 | testOnReturn: true | 59 | testOnReturn: true |
60 | password: 123456 | 60 | password: 123456 |
61 | time-between-eviction-runs-millis: 1000 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | 62 | + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | username: root | 63 | username: root |
64 | redis: | 64 | redis: |
65 | database: 0 | 65 | database: 0 |
target/classes/application-test.yml
@@ -59,7 +59,7 @@ spring: | @@ -59,7 +59,7 @@ spring: | ||
59 | testOnReturn: true | 59 | testOnReturn: true |
60 | password: 123456 | 60 | password: 123456 |
61 | time-between-eviction-runs-millis: 1000 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | 62 | + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | username: root | 63 | username: root |
64 | redis: | 64 | redis: |
65 | database: 0 | 65 | database: 0 |
target/classes/com/canrd/webmagic/DNS/DnsResolver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureCommunicatiosController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureComputationalController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureEnergyController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMaterialController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMethodsController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureNanotechnologyController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NaturePhysicsController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class renamed to target/classes/com/canrd/webmagic/processor/MatterPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureComputationalPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMethodsPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NaturePhysicsProcessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class
No preview for this file type