Commit 9f5b8d9b5b36694b8e7f1b1c1cba1d4a742e8449
1 parent
1680e181
提交
Showing
52 changed files
with
1561 additions
and
29 deletions
Too many changes to show.
To preserve performance only 52 of 55 files are displayed.
.idea/inspectionProfiles/Project_Default.xml
... | ... | @@ -2,5 +2,6 @@ |
2 | 2 | <profile version="1.0"> |
3 | 3 | <option name="myName" value="Project Default" /> |
4 | 4 | <inspection_tool class="NonSerializableWithSerialVersionUIDField" enabled="true" level="WARNING" enabled_by_default="true" /> |
5 | + <inspection_tool class="SpringJavaInjectionPointsAutowiringInspection" enabled="false" level="ERROR" enabled_by_default="false" /> | |
5 | 6 | </profile> |
6 | 7 | </component> |
7 | 8 | \ No newline at end of file |
... | ... |
pom.xml
... | ... | @@ -70,6 +70,12 @@ |
70 | 70 | <!-- <version>${browsermob.version}</version>--> |
71 | 71 | <!-- </dependency>--> |
72 | 72 | |
73 | +<!-- DNS--> | |
74 | + <dependency> | |
75 | + <groupId>dnsjava</groupId> | |
76 | + <artifactId>dnsjava</artifactId> | |
77 | + <version>2.1.8</version> | |
78 | + </dependency> | |
73 | 79 | |
74 | 80 | <!-- webmagic核心库 --> |
75 | 81 | <dependency> |
... | ... |
src/main/java/com/canrd/webmagic/DNS/DnsResolver.java
0 → 100644
1 | +package com.canrd.webmagic.DNS; | |
2 | + | |
3 | +import org.xbill.DNS.*; | |
4 | + | |
5 | +public class DnsResolver { // helper that resolves a host name to an IPv4 address string via dnsjava | |
6 | + public static String resolve(String domain) { // returns the first A record's address, or null when nothing was resolved | |
7 | + try { | |
8 | + Record[] records = new Lookup(domain, Type.A).run(); | |
9 | + if (records != null && records.length > 0) { | |
10 | + ARecord aRecord = (ARecord) records[0]; // only the first answer is used | |
11 | + return aRecord.getAddress().getHostAddress(); | |
12 | + } | |
13 | + } catch (TextParseException e) { // thrown when the domain string cannot be parsed as a DNS name | |
14 | + e.printStackTrace(); | |
15 | + } | |
16 | + return null; // lookup returned no records, or the name was invalid | |
17 | + } | |
18 | +} | |
0 | 19 | \ No newline at end of file |
... | ... |
src/main/java/com/canrd/webmagic/controller/MatterController.java
1 | 1 | package com.canrd.webmagic.controller; |
2 | 2 | |
3 | 3 | import com.canrd.webmagic.common.constant.ServerResult; |
4 | -import com.canrd.webmagic.processor.MatterPragePcoessor; | |
4 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | |
5 | 5 | import com.canrd.webmagic.processor.download.SeleniumDownloader; |
6 | 6 | import io.swagger.annotations.Api; |
7 | 7 | import io.swagger.annotations.ApiOperation; |
... | ... | @@ -18,7 +18,7 @@ import javax.annotation.Resource; |
18 | 18 | @Api("Matter") |
19 | 19 | public class MatterController { |
20 | 20 | @Resource |
21 | - private MatterPragePcoessor matterPragePcoessor; | |
21 | + private MatterPagePcoessor matterPragePcoessor; | |
22 | 22 | |
23 | 23 | @Resource |
24 | 24 | private SeleniumDownloader seleniumDownloader; |
... | ... | @@ -26,11 +26,11 @@ public class MatterController { |
26 | 26 | @GetMapping("/start") |
27 | 27 | @ApiOperation("start") |
28 | 28 | public ServerResult start() { |
29 | - Spider.create(new MatterPragePcoessor()) | |
29 | + Spider.create(matterPragePcoessor) | |
30 | 30 | // 添加这个Spider要爬取的网页地址 |
31 | - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | |
31 | + .addUrl("https://www.cell.com/matter/home") | |
32 | 32 | .setUUID(UuidUtil.getTimeBasedUuid().toString()) |
33 | - .setDownloader(seleniumDownloader.setSleepTime(30000)) | |
33 | + .setDownloader(seleniumDownloader) | |
34 | 34 | // 开启5个线程执行,并开始爬取 |
35 | 35 | .thread(5).run(); |
36 | 36 | return ServerResult.success(); |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureCommunicatiosController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureCommunicatiosPcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | +@RestController | |
18 | +@RequestMapping("/nature-communicatios/article") | |
19 | +@Api("Nature") | |
20 | +public class NatureCommunicatiosController { | |
21 | + @Resource | |
22 | + private NatureCommunicatiosPcoessor natureCommunicatiosPcoessor; | |
23 | + @Resource | |
24 | + private ArticlePipeline articlePipeline; | |
25 | + | |
26 | + @GetMapping("/start") | |
27 | + @ApiOperation("start") | |
28 | + public ServerResult start() { | |
29 | + Spider.create(natureCommunicatiosPcoessor) | |
30 | + .addUrl("https://www.nature.com/ncomms/articles?type=editorial") | |
31 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
32 | + .addPipeline(articlePipeline) | |
33 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | |
34 | + .thread(20).run(); | |
35 | + return ServerResult.success(); | |
36 | + } | |
37 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureComputationalController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureComputationalPcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/nature-computational/article") | |
20 | +@Api("Nature") | |
21 | +public class NatureComputationalController { | |
22 | + @Resource | |
23 | + private NatureComputationalPcoessor natureComputationalPcoessor; | |
24 | + @Resource | |
25 | + private ArticlePipeline articlePipeline; | |
26 | + | |
27 | + @GetMapping("/start") | |
28 | + @ApiOperation("start") | |
29 | + public ServerResult start() { | |
30 | + Spider.create(natureComputationalPcoessor) | |
31 | + .addUrl("https://www.nature.com/natcomputsci") | |
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
33 | + .addPipeline(articlePipeline) | |
34 | + .setScheduler(new RedisScheduler("127.0.0.1")) | |
35 | + .thread(20).run(); | |
36 | + return ServerResult.success(); | |
37 | + } | |
38 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureEnergyController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/nature-energy/article") | |
20 | +@Api("Nature") | |
21 | +public class NatureEnergyController { | |
22 | + @Resource | |
23 | + private NatureEnergyPagePcoessor natureEnergyPagePcoessor; | |
24 | + @Resource | |
25 | + private ArticlePipeline articlePipeline; | |
26 | + | |
27 | + @GetMapping("/start") | |
28 | + @ApiOperation("start") | |
29 | + public ServerResult start() { | |
30 | + Spider.create(natureEnergyPagePcoessor) | |
31 | + .addUrl("https://www.nature.com/nenergy/research-articles") | |
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
33 | + .addPipeline(articlePipeline) | |
34 | + .setScheduler(new RedisScheduler("127.0.0.1")) | |
35 | + .thread(20).run(); | |
36 | + return ServerResult.success(); | |
37 | + } | |
38 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureMaterialController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | |
6 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
7 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
8 | +import io.swagger.annotations.Api; | |
9 | +import io.swagger.annotations.ApiOperation; | |
10 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
11 | +import org.springframework.web.bind.annotation.GetMapping; | |
12 | +import org.springframework.web.bind.annotation.RequestMapping; | |
13 | +import org.springframework.web.bind.annotation.RestController; | |
14 | +import us.codecraft.webmagic.Spider; | |
15 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
16 | + | |
17 | +import javax.annotation.Resource; | |
18 | + | |
19 | +@RestController | |
20 | +@RequestMapping("/nature-material/article") | |
21 | +@Api("Nature") | |
22 | +public class NatureMaterialController { | |
23 | + @Resource | |
24 | + private NatureMaterialPagePcoessor natureMaterialPagePcoessor; | |
25 | + @Resource | |
26 | + private ArticlePipeline articlePipeline; | |
27 | + | |
28 | + @GetMapping("/start") | |
29 | + @ApiOperation("start") | |
30 | + public ServerResult start() { | |
31 | + Spider.create(natureMaterialPagePcoessor) | |
32 | + // Seed URLs this spider starts crawling from | |
33 | + .addUrl("https://www.nature.com/nmat/articles") | |
34 | + .addUrl("https://www.nature.com/search?q=battery&journal=nmat&order=relevance") | |
35 | + .addUrl("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance") | |
36 | + .addUrl("https://www.nature.com/search?q=battery") | |
37 | + .addUrl("https://www.nature.com/nature/research-articles") | |
38 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
39 | + .addPipeline(articlePipeline) | |
40 | + .setScheduler(new RedisScheduler("127.0.0.1")) | |
41 | + // Start crawling with 60 threads | |
42 | + .thread(60).run(); | |
43 | + return ServerResult.success(); | |
44 | + } | |
45 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureMethodsController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureMethodsPcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/nature-methods/article") | |
20 | +@Api("Nature") | |
21 | +public class NatureMethodsController { | |
22 | + @Resource | |
23 | + private NatureMethodsPcoessor natureMethodsPcoessor; | |
24 | + @Resource | |
25 | + private ArticlePipeline articlePipeline; | |
26 | + | |
27 | + @GetMapping("/start") | |
28 | + @ApiOperation("start") | |
29 | + public ServerResult start() { | |
30 | + Spider.create(natureMethodsPcoessor) | |
31 | + // Seed URL this spider starts crawling from | |
32 | + .addUrl("https://www.nature.com/nmeth/") | |
33 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
34 | + .addPipeline(articlePipeline) | |
35 | + .setScheduler(new RedisScheduler("127.0.0.1")) | |
36 | + // Start crawling with 20 threads | |
37 | + .thread(20).run(); | |
38 | + return ServerResult.success(); | |
39 | + } | |
40 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NatureNanotechnologyController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureMaterialPagePcoessor; | |
5 | +import com.canrd.webmagic.processor.NatureNanotechnologyProcessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | + | |
18 | +@RestController | |
19 | +@RequestMapping("/nature-nanotechnology/article") | |
20 | +@Api("Nature") | |
21 | +public class NatureNanotechnologyController { | |
22 | + @Resource | |
23 | + private NatureNanotechnologyProcessor natureNanotechnologyProcessor; | |
24 | + @Resource | |
25 | + private ArticlePipeline articlePipeline; | |
26 | + | |
27 | + @GetMapping("/start") | |
28 | + @ApiOperation("start") | |
29 | + public ServerResult start() { | |
30 | + Spider.create(natureNanotechnologyProcessor) | |
31 | + // Seed URL this spider starts crawling from | |
32 | + .addUrl("https://www.nature.com/nnano/") | |
33 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
34 | + .addPipeline(articlePipeline) | |
35 | + .setScheduler(new RedisScheduler("127.0.0.1")) | |
36 | + // Start crawling with 60 threads | |
37 | + .thread(60).run(); | |
38 | + return ServerResult.success(); | |
39 | + } | |
40 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/controller/NaturePhysicsController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.NatureEnergyPagePcoessor; | |
5 | +import com.canrd.webmagic.processor.NaturePhysicsProcessor; | |
6 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
7 | +import io.swagger.annotations.Api; | |
8 | +import io.swagger.annotations.ApiOperation; | |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
10 | +import org.springframework.web.bind.annotation.GetMapping; | |
11 | +import org.springframework.web.bind.annotation.RequestMapping; | |
12 | +import org.springframework.web.bind.annotation.RestController; | |
13 | +import us.codecraft.webmagic.Spider; | |
14 | +import us.codecraft.webmagic.scheduler.RedisScheduler; | |
15 | + | |
16 | +import javax.annotation.Resource; | |
17 | +@RestController | |
18 | +@RequestMapping("/nature-physics/article") | |
19 | +@Api("Nature") | |
20 | +public class NaturePhysicsController { | |
21 | + @Resource | |
22 | + private NaturePhysicsProcessor naturePhysicsProcessor; | |
23 | + @Resource | |
24 | + private ArticlePipeline articlePipeline; | |
25 | + | |
26 | + @GetMapping("/start") | |
27 | + @ApiOperation("start") | |
28 | + public ServerResult start() { | |
29 | + Spider.create(naturePhysicsProcessor) | |
30 | + .addUrl("https://www.nature.com/nphys/") | |
31 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
32 | + .addPipeline(articlePipeline) | |
33 | +// .setScheduler(new RedisScheduler("127.0.0.1")) | |
34 | + .thread(20).run(); | |
35 | + return ServerResult.success(); | |
36 | + } | |
37 | +} | |
0 | 38 | \ No newline at end of file |
... | ... |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... | ... | @@ -18,6 +18,13 @@ public enum ArticleTypeEnum { |
18 | 18 | SCIENCE("science", "science网址"), |
19 | 19 | SCIENCE_SPJ("science-spj", "science网址-spj"), |
20 | 20 | UNIVIE_PHYSNANO("univie-physnano", "univie网址-physnano"), |
21 | + NATURE_MATERIAL("nuture-material","nuture网站-material"), | |
22 | + NATURE_NANOTECHNOLOGY("nature-nanotechnology","nuture网站-nanotechnology"), | |
23 | + NATURE_PHYSICS("nature-physics","nuture网站-physics"), | |
24 | + NATURE_ENERGY("nature-energy","nuture网站-energy"), | |
25 | + NATURE_COMMUNICATIONS("nature-communications","nuture网站-communications"), | |
26 | + NATURE_COMPUTATIONAL_SCIENCE("nature-computational|science","nuture网站-computational|science"), | |
27 | + NATURE_METHODS("nature-methods","nuture网站-methods"), | |
21 | 28 | ; |
22 | 29 | private String type; |
23 | 30 | private String desc; |
... | ... |
src/main/java/com/canrd/webmagic/processor/AdvancedEnergyMaterialPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import us.codecraft.webmagic.Page; | |
4 | +import us.codecraft.webmagic.Site; | |
5 | +import us.codecraft.webmagic.processor.PageProcessor; | |
6 | + | |
7 | +public class AdvancedEnergyMaterialPcoessor implements PageProcessor { | |
8 | + @Override | |
9 | + public void process(Page page) { | |
10 | + | |
11 | + } | |
12 | + | |
13 | + @Override | |
14 | + public Site getSite() { | |
15 | + return PageProcessor.super.getSite(); | |
16 | + } | |
17 | + | |
18 | + | |
19 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 | 1 | package com.canrd.webmagic.processor; |
2 | 2 | |
3 | +import com.canrd.webmagic.DNS.DnsResolver; | |
3 | 4 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
5 | +import com.gargoylesoftware.htmlunit.AbstractPage; | |
4 | 6 | import us.codecraft.webmagic.Page; |
7 | +import us.codecraft.webmagic.Request; | |
5 | 8 | import us.codecraft.webmagic.Site; |
6 | 9 | import us.codecraft.webmagic.Spider; |
7 | 10 | import us.codecraft.webmagic.processor.PageProcessor; |
8 | 11 | import us.codecraft.webmagic.selector.Html; |
12 | +import us.codecraft.webmagic.selector.PlainText; | |
9 | 13 | |
10 | 14 | public class ChemicalPagePcoessor implements PageProcessor { |
11 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
15 | + | |
16 | + private Request request; | |
17 | + String domain = new PlainText(request.getUrl()).regex("//(.*?)/").get(); | |
18 | + | |
19 | + String ip = DnsResolver.resolve(domain); | |
20 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setDomain(domain).addCookie("ip", ip);; | |
12 | 21 | @Override |
13 | 22 | public void process(Page page) { |
14 | 23 | |
... | ... |
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java renamed to src/main/java/com/canrd/webmagic/processor/MatterPagePcoessor.java
1 | 1 | package com.canrd.webmagic.processor; |
2 | 2 | |
3 | +import com.canrd.webmagic.processor.config.Agent; | |
3 | 4 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
4 | 5 | import org.springframework.stereotype.Component; |
6 | +import lombok.extern.slf4j.Slf4j; | |
5 | 7 | import us.codecraft.webmagic.Page; |
6 | 8 | import us.codecraft.webmagic.Site; |
7 | 9 | import us.codecraft.webmagic.Spider; |
8 | 10 | import us.codecraft.webmagic.processor.PageProcessor; |
9 | 11 | import us.codecraft.webmagic.selector.Html; |
12 | + | |
10 | 13 | @Component |
11 | -public class MatterPragePcoessor implements PageProcessor { | |
12 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
14 | +@Slf4j | |
15 | +public class MatterPagePcoessor implements PageProcessor { | |
16 | + | |
17 | + private Site site = Site.me() | |
18 | + .setRetryTimes(3) | |
19 | + .setSleepTime(5) | |
20 | + .setUserAgent(Agent.getRandom()); | |
21 | + | |
13 | 22 | @Override |
14 | 23 | public void process(Page page) { |
24 | + //首页 | |
25 | + if (page.getUrl().get().equals("https://www.cell.com/matter/home")){ | |
26 | + | |
27 | + } | |
28 | + //搜索页 | |
29 | + else if (page.getUrl().get().contains("https://www.cell.com/action/doSearch?")) { | |
15 | 30 | |
31 | + } | |
32 | + //详情页 | |
33 | + else if (page.getUrl().get().contains("https://www.cell.com/matter/fulltext/")) { | |
34 | + doArticleContent(page); | |
35 | + } | |
16 | 36 | } |
17 | 37 | |
18 | 38 | @Override |
19 | 39 | public Site getSite() { |
20 | - return PageProcessor.super.getSite(); | |
40 | + return this.site; | |
21 | 41 | } |
22 | 42 | |
23 | - public void doArticleContent(Page page){ | |
43 | + public void doArticleContent(Page page) { | |
24 | 44 | Html html = page.getHtml(); |
45 | + log.info(String.valueOf(html)); | |
25 | 46 | String articleCode = page.getUrl().get(); |
26 | 47 | // html.xpath() |
27 | 48 | } |
28 | 49 | |
29 | 50 | public static void main(String[] args) { |
30 | 51 | // 创建一个Spider,并把我们的处理器放进去 |
31 | - Spider.create(new MatterPragePcoessor()) | |
52 | + Spider.create(new MatterPagePcoessor()) | |
32 | 53 | // 添加这个Spider要爬取的网页地址 |
33 | - .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | |
54 | + .addUrl("https://www.cell.com/matter/home") | |
34 | 55 | .addPipeline(new ArticlePipeline()) |
35 | 56 | // 开启5个线程执行,并开始爬取 |
36 | - .thread(5).run(); | |
57 | + .thread(1).run(); | |
37 | 58 | } |
38 | 59 | } |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import lombok.extern.slf4j.Slf4j; | |
12 | +import org.springframework.stereotype.Component; | |
13 | +import us.codecraft.webmagic.Page; | |
14 | +import us.codecraft.webmagic.Site; | |
15 | +import us.codecraft.webmagic.Spider; | |
16 | +import us.codecraft.webmagic.processor.PageProcessor; | |
17 | +import us.codecraft.webmagic.selector.Html; | |
18 | +import us.codecraft.webmagic.selector.Selectable; | |
19 | +import us.codecraft.webmagic.selector.XpathSelector; | |
20 | + | |
21 | +import java.text.ParseException; | |
22 | +import java.text.SimpleDateFormat; | |
23 | +import java.util.*; | |
24 | +import java.util.stream.Collectors; | |
25 | + | |
26 | +@Slf4j | |
27 | +@Component | |
28 | +public class NatureCommunicatiosPcoessor implements PageProcessor { | |
29 | + | |
30 | + @Override | |
31 | + public void process(Page page) { // routes a page by URL: seed index -> paginated listings -> article detail | |
32 | + if (page.getUrl().get().equals("https://www.nature.com/ncomms/articles?type=editorial")){ | |
33 | + getIndex(page); | |
34 | + } else if (page.getUrl().get().contains("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page=")) { | |
35 | + everyPage(page); | |
36 | + }else if (page.getUrl().get().contains("https://www.nature.com/articles/")){ // match every article page, not one hard-coded article id | |
37 | + doArticleContent(page); | |
38 | + } | |
39 | + } | |
40 | + | |
41 | + @Override | |
42 | + public Site getSite() { | |
43 | + return PageProcessor.super.getSite(); | |
44 | + } | |
45 | + | |
46 | + public void getIndex(Page page){ // reads the pagination label from the seed page and queues one request per listing page | |
47 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[5]/a/text()").get(); | |
48 | + log.info(maxIndex); | |
49 | + // The xpath yields null when the pagination node is absent; bail out instead of failing on trim() | |
50 | + if (maxIndex == null) { return; } | |
51 | + int number = Integer.parseInt(maxIndex.trim()); | |
52 | + for (int i = 1; i <= number; i++) { | |
53 | + log.info(String.valueOf(i)); | |
54 | + page.addTargetRequest("https://www.nature.com/ncomms/articles?searchType=journalSearch&sort=PubDate&type=editorial&page="+i); | |
55 | + } | |
56 | + } | |
57 | + | |
58 | + public void everyPage(Page page){ // queues every article link found on one listing page | |
59 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
60 | + for (int i = 0; i < all.size(); i++) { // zero-based: starting at 1 skipped the first link and overran the list | |
61 | + log.info(String.valueOf(i)); | |
62 | + page.addTargetRequest("https://www.nature.com"+all.get(i)); | |
63 | + } | |
64 | + } | |
65 | + | |
66 | + private void doArticleContent(Page page) { // extracts title, date, authors, affiliations, references and emails, then stores them as the "article" field | |
67 | + Html html = page.getHtml(); | |
68 | + String articleCode = page.getUrl().get(); | |
69 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
70 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
71 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
72 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
73 | + | |
74 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
75 | + if (StringUtils.isBlank(title)) { | |
76 | + title = headSelectable.xpath("//h1/text()").get(); | |
77 | + } | |
78 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
79 | + String publishTime; | |
80 | + Date publishTimeDateTime = null; | |
81 | + try { | |
82 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
83 | + } catch (Exception e) { | |
84 | + try { | |
85 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
86 | + } catch (Exception e1) { | |
87 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
88 | + } | |
89 | + } | |
90 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
91 | + | |
92 | + try { | |
93 | + publishTimeDateTime = Objects.isNull(publishTime) ? null : formatter.parse(publishTime); // parse(null) would throw an uncaught NPE | |
94 | + } catch (ParseException e) { | |
95 | + log.warn("Failed to parse publish time: {}", publishTime, e); | |
96 | + } | |
97 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
98 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
99 | + StringBuffer authorName = new StringBuffer(); | |
100 | + for (Selectable node : authorNodes) { | |
101 | + authorName.append(node.xpath("//a/text()")); | |
102 | + } | |
103 | + | |
104 | + JSONArray authorAddress = new JSONArray(); | |
105 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
106 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
107 | + for (Selectable selectable : authorAddressList) { | |
108 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
109 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
110 | + JSONObject object = new JSONObject(); | |
111 | + object.put("address", address); | |
112 | + object.put("authorNames", authorNames); | |
113 | + authorAddress.add(object); | |
114 | + } | |
115 | + } | |
116 | + | |
117 | + JSONArray references = new JSONArray(); | |
118 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
119 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
120 | + for (Selectable reference : referenceList) { | |
121 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
122 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
123 | + List<String> links = new ArrayList<>(); | |
124 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
125 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
126 | + } | |
127 | + JSONObject object = new JSONObject(); | |
128 | + object.put("referenceTitle", referenceTitle); | |
129 | + object.put("links", links); | |
130 | +// if (CollectionUtils.isNotEmpty(links)) { | |
131 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
132 | +// } | |
133 | + references.add(object); | |
134 | + } | |
135 | + } | |
136 | + | |
137 | + JSONArray authorEmail = new JSONArray(); | |
138 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
139 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
140 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
141 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
142 | + JSONObject jsonObject = new JSONObject(); | |
143 | + jsonObject.put("authorEmailName", authorEmailName); | |
144 | + jsonObject.put("email", email); | |
145 | + authorEmail.add(jsonObject); | |
146 | + } | |
147 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString()); | |
148 | + | |
149 | + page.putField("article", ArticleDO.builder() | |
150 | + .articleType(ArticleTypeEnum.NATURE_COMMUNICATIONS.getType()) | |
151 | + .articleCode(articleCode) | |
152 | + .authorName(authorName.toString()) | |
153 | + .title(title) | |
154 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
155 | + .emailInfo(authorEmail.toJSONString()) | |
156 | + .articleDesc(articleDesc) | |
157 | + .authorAddress(authorAddress.toJSONString()) | |
158 | + .referenceInfo(references.toJSONString()).build()); | |
159 | + } | |
160 | + | |
161 | + public static void main(String[] args) { // local smoke run: crawl with this processor from its own seed page | |
162 | + Spider.create(new NatureCommunicatiosPcoessor()) | |
163 | + .addUrl("https://www.nature.com/ncomms/articles?type=editorial") | |
164 | + .addPipeline(new ArticlePipeline()) | |
165 | + .thread(1).run(); | |
166 | + } | |
167 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureComputationalPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import lombok.extern.slf4j.Slf4j; | |
12 | +import org.springframework.stereotype.Component; | |
13 | +import us.codecraft.webmagic.Page; | |
14 | +import us.codecraft.webmagic.Site; | |
15 | +import us.codecraft.webmagic.Spider; | |
16 | +import us.codecraft.webmagic.processor.PageProcessor; | |
17 | +import us.codecraft.webmagic.selector.Html; | |
18 | +import us.codecraft.webmagic.selector.Selectable; | |
19 | +import us.codecraft.webmagic.selector.XpathSelector; | |
20 | + | |
21 | +import java.text.ParseException; | |
22 | +import java.text.SimpleDateFormat; | |
23 | +import java.util.*; | |
24 | +import java.util.stream.Collectors; | |
25 | + | |
26 | +@Slf4j | |
27 | +@Component | |
28 | +public class NatureComputationalPcoessor implements PageProcessor{ | |
29 | + @Override | |
30 | + public void process(Page page) { | |
31 | + String url = page.getUrl().get(); | |
32 | + if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){ | |
33 | + everyPage(page); | |
34 | + } else if (url.contains("https://www.nature.com/")){ | |
35 | + doArticleContent(page); | |
36 | + } | |
37 | + } | |
38 | + | |
39 | + @Override | |
40 | + public Site getSite() { | |
41 | + return PageProcessor.super.getSite(); | |
42 | + } | |
43 | + | |
44 | + | |
45 | + public void everyPage(Page page){ | |
46 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
47 | + for (int i = 0; i < all.size(); i++) { | |
48 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | |
49 | + } | |
50 | + } | |
51 | + | |
52 | + private void doArticleContent(Page page) { | |
53 | + Html html = page.getHtml(); | |
54 | + String articleCode = page.getUrl().get(); | |
55 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
56 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
57 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
58 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
59 | + | |
60 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
61 | + if (StringUtils.isBlank(title)) { | |
62 | + title = headSelectable.xpath("//h1/text()").get(); | |
63 | + } | |
64 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
65 | + String publishTime; | |
66 | + Date publishTimeDateTime = null; | |
67 | + try { | |
68 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
69 | + } catch (Exception e) { | |
70 | + try { | |
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
72 | + } catch (Exception e1) { | |
73 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
74 | + } | |
75 | + } | |
76 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
77 | + | |
78 | + try { | |
79 | + publishTimeDateTime = formatter.parse(publishTime); | |
80 | + } catch (ParseException e) { | |
81 | + e.printStackTrace(); | |
82 | + } | |
83 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
84 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
85 | + StringBuffer authorName = new StringBuffer(); | |
86 | + for (Selectable node : authorNodes) { | |
87 | + authorName.append(node.xpath("//a/text()")); | |
88 | + } | |
89 | + | |
90 | + JSONArray authorAddress = new JSONArray(); | |
91 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
92 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
93 | + for (Selectable selectable : authorAddressList) { | |
94 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
95 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
96 | + JSONObject object = new JSONObject(); | |
97 | + object.put("address", address); | |
98 | + object.put("authorNames", authorNames); | |
99 | + authorAddress.add(object); | |
100 | + } | |
101 | + } | |
102 | + | |
103 | + JSONArray references = new JSONArray(); | |
104 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
105 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
106 | + for (Selectable reference : referenceList) { | |
107 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
108 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
109 | + List<String> links = new ArrayList<>(); | |
110 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
111 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
112 | + } | |
113 | + JSONObject object = new JSONObject(); | |
114 | + object.put("referenceTitle", referenceTitle); | |
115 | + object.put("links", links); | |
116 | +// if (CollectionUtils.isNotEmpty(links)) { | |
117 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
118 | +// } | |
119 | + references.add(object); | |
120 | + } | |
121 | + } | |
122 | + | |
123 | + JSONArray authorEmail = new JSONArray(); | |
124 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
125 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
126 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
127 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
128 | + JSONObject jsonObject = new JSONObject(); | |
129 | + jsonObject.put("authorEmailName", authorEmailName); | |
130 | + jsonObject.put("email", email); | |
131 | + authorEmail.add(jsonObject); | |
132 | + } | |
133 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
134 | + | |
135 | + page.putField("article", ArticleDO.builder() | |
136 | + .articleType(ArticleTypeEnum.NATURE_COMPUTATIONAL_SCIENCE.getType()) | |
137 | + .articleCode(articleCode) | |
138 | + .authorName(authorName.toString()) | |
139 | + .title(title) | |
140 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
141 | + .emailInfo(authorEmail.toJSONString()) | |
142 | + .articleDesc(articleDesc) | |
143 | + .authorAddress(authorAddress.toJSONString()) | |
144 | + .referenceInfo(references.toJSONString()).build()); | |
145 | + } | |
146 | + | |
147 | + public static void main(String[] args) { | |
148 | + Spider.create(new MatterPagePcoessor()) | |
149 | + .addUrl("https://www.nature.com/nenergy/research-articles") | |
150 | + .addPipeline(new ArticlePipeline()) | |
151 | + .thread(1).run(); | |
152 | + } | |
153 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.MatterPagePcoessor; | |
11 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
12 | +import lombok.extern.slf4j.Slf4j; | |
13 | +import org.springframework.stereotype.Component; | |
14 | +import us.codecraft.webmagic.Page; | |
15 | +import us.codecraft.webmagic.Site; | |
16 | +import us.codecraft.webmagic.Spider; | |
17 | +import us.codecraft.webmagic.processor.PageProcessor; | |
18 | +import us.codecraft.webmagic.selector.Html; | |
19 | +import us.codecraft.webmagic.selector.Selectable; | |
20 | +import us.codecraft.webmagic.selector.XpathSelector; | |
21 | + | |
22 | +import java.text.ParseException; | |
23 | +import java.text.SimpleDateFormat; | |
24 | +import java.util.*; | |
25 | +import java.util.stream.Collectors; | |
26 | + | |
27 | +@Component | |
28 | +@Slf4j | |
29 | +public class NatureEnergyPagePcoessor implements PageProcessor { | |
30 | + @Override | |
31 | + public void process(Page page) { | |
32 | + String url = page.getUrl().get(); | |
33 | + if (url.equals("https://www.nature.com/nenergy/research-articles")){ | |
34 | + getIndex(page); | |
35 | + } else if (url.contains("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page=")){ | |
36 | + everyPage(page); | |
37 | + } else if (url.contains("https://www.nature.com/")){ | |
38 | + doArticleContent(page); | |
39 | + } | |
40 | + } | |
41 | + | |
42 | + @Override | |
43 | + public Site getSite() { | |
44 | + return PageProcessor.super.getSite(); | |
45 | + } | |
46 | + | |
47 | + public void getIndex(Page page){ | |
48 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | |
49 | + log.info(maxIndex); | |
50 | + String trim = maxIndex.trim(); | |
51 | + int number = Integer.parseInt(trim); | |
52 | + System.out.printf("", number); | |
53 | + for (int i = 0; i < number; i++) { | |
54 | + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | |
55 | + } | |
56 | + } | |
57 | + | |
58 | + public void everyPage(Page page){ | |
59 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
60 | + for (int i = 0; i < all.size(); i++) { | |
61 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | |
62 | + } | |
63 | + } | |
64 | + | |
65 | + private void doArticleContent(Page page) { | |
66 | + Html html = page.getHtml(); | |
67 | + String articleCode = page.getUrl().get(); | |
68 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
69 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
70 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
71 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
72 | + | |
73 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
74 | + if (StringUtils.isBlank(title)) { | |
75 | + title = headSelectable.xpath("//h1/text()").get(); | |
76 | + } | |
77 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
78 | + String publishTime; | |
79 | + Date publishTimeDateTime = null; | |
80 | + try { | |
81 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
82 | + } catch (Exception e) { | |
83 | + try { | |
84 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
85 | + } catch (Exception e1) { | |
86 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
87 | + } | |
88 | + } | |
89 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
90 | + | |
91 | + try { | |
92 | + publishTimeDateTime = formatter.parse(publishTime); | |
93 | + } catch (ParseException e) { | |
94 | + e.printStackTrace(); | |
95 | + } | |
96 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
97 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
98 | + StringBuffer authorName = new StringBuffer(); | |
99 | + for (Selectable node : authorNodes) { | |
100 | + authorName.append(node.xpath("//a/text()")); | |
101 | + } | |
102 | + | |
103 | + JSONArray authorAddress = new JSONArray(); | |
104 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
105 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
106 | + for (Selectable selectable : authorAddressList) { | |
107 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
108 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
109 | + JSONObject object = new JSONObject(); | |
110 | + object.put("address", address); | |
111 | + object.put("authorNames", authorNames); | |
112 | + authorAddress.add(object); | |
113 | + } | |
114 | + } | |
115 | + | |
116 | + JSONArray references = new JSONArray(); | |
117 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
118 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
119 | + for (Selectable reference : referenceList) { | |
120 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
121 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
122 | + List<String> links = new ArrayList<>(); | |
123 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
124 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
125 | + } | |
126 | + JSONObject object = new JSONObject(); | |
127 | + object.put("referenceTitle", referenceTitle); | |
128 | + object.put("links", links); | |
129 | +// if (CollectionUtils.isNotEmpty(links)) { | |
130 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
131 | +// } | |
132 | + references.add(object); | |
133 | + } | |
134 | + } | |
135 | + | |
136 | + JSONArray authorEmail = new JSONArray(); | |
137 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
138 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
139 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
140 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
141 | + JSONObject jsonObject = new JSONObject(); | |
142 | + jsonObject.put("authorEmailName", authorEmailName); | |
143 | + jsonObject.put("email", email); | |
144 | + authorEmail.add(jsonObject); | |
145 | + } | |
146 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
147 | + | |
148 | + page.putField("article", ArticleDO.builder() | |
149 | + .articleType(ArticleTypeEnum.NATURE_ENERGY.getType()) | |
150 | + .articleCode(articleCode) | |
151 | + .authorName(authorName.toString()) | |
152 | + .title(title) | |
153 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
154 | + .emailInfo(authorEmail.toJSONString()) | |
155 | + .articleDesc(articleDesc) | |
156 | + .authorAddress(authorAddress.toJSONString()) | |
157 | + .referenceInfo(references.toJSONString()).build()); | |
158 | + } | |
159 | + | |
160 | + public static void main(String[] args) { | |
161 | + Spider.create(new MatterPagePcoessor()) | |
162 | + .addUrl("https://www.nature.com/nenergy/research-articles") | |
163 | + .addPipeline(new ArticlePipeline()) | |
164 | + .thread(1).run(); | |
165 | + } | |
166 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import lombok.extern.slf4j.Slf4j; | |
12 | +import org.springframework.stereotype.Component; | |
13 | +import us.codecraft.webmagic.Page; | |
14 | +import us.codecraft.webmagic.Site; | |
15 | +import us.codecraft.webmagic.Spider; | |
16 | +import us.codecraft.webmagic.processor.PageProcessor; | |
17 | +import us.codecraft.webmagic.selector.Html; | |
18 | +import us.codecraft.webmagic.selector.Selectable; | |
19 | +import us.codecraft.webmagic.selector.XpathSelector; | |
20 | + | |
21 | +import java.text.ParseException; | |
22 | +import java.text.SimpleDateFormat; | |
23 | +import java.util.*; | |
24 | +import java.util.stream.Collectors; | |
25 | + | |
26 | +@Component | |
27 | +@Slf4j | |
28 | +public class NatureMaterialPagePcoessor implements PageProcessor { | |
29 | + @Override | |
30 | + public void process(Page page) { | |
31 | + String url = page.getUrl().get(); | |
32 | + if (url.equals("https://www.nature.com/nmat/articles")){ | |
33 | + getIndex(page); | |
34 | + } else if (url.contains("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=")){ | |
35 | + everyPage(page); | |
36 | + } else if (url.equals("https://www.nature.com/search?q=battery")) { | |
37 | + getIndex(page); | |
38 | + } else if (url.contains("https://www.nature.com/search?q=battery&page=")) { | |
39 | + everyPage(page); | |
40 | + } else if (url.contains("https://www.nature.com/articles")){ | |
41 | + doArticleContent(page); | |
42 | + } else if (url.equals("https://www.nature.com/nature/research-articles")) { | |
43 | + getIndex(page); | |
44 | + } else if (url.contains("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=")) { | |
45 | + everyPage(page); | |
46 | + } else if (url.equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")) { | |
47 | + getIndex(page); | |
48 | + }else if (url.equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")) { | |
49 | + getIndex(page); | |
50 | + }else if (url.contains("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page=")) { | |
51 | + everyPage(page); | |
52 | + }else if (url.contains("https://www.nature.com/search?q=batteries&journal=nmat&page=")) { | |
53 | + everyPage(page); | |
54 | + } | |
55 | + } | |
56 | + | |
57 | + @Override | |
58 | + public Site getSite() { | |
59 | + return PageProcessor.super.getSite().setRetryTimes(3).setSleepTime(100); | |
60 | + } | |
61 | + | |
62 | + public void getIndex(Page page){ | |
63 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | |
64 | + log.info(maxIndex); | |
65 | + String trim = maxIndex.trim(); | |
66 | + int number = Integer.parseInt(trim); | |
67 | + if (page.getUrl().get().equals("https://www.nature.com/nmat/articles")){ | |
68 | + for (int i = 1; i <= number; i++) { | |
69 | + page.addTargetRequest("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page="+i); | |
70 | + } | |
71 | + } | |
72 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery")){ | |
73 | + for (int i = 1; i <= number; i++) { | |
74 | + page.addTargetRequest("https://www.nature.com/search?q=battery&page="+i); | |
75 | + } | |
76 | + } | |
77 | + if (page.getUrl().get().equals("https://www.nature.com/nature/research-articles")){ | |
78 | + for (int i = 1; i <= number; i++) { | |
79 | + page.addTargetRequest("https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | |
80 | + } | |
81 | + } | |
82 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=battery&journal=nmat&order=relevance")){ | |
83 | + for (int i = 1; i <= number; i++) { | |
84 | + page.addTargetRequest("https://www.nature.com/search?q=battery&order=relevance&journal=nmat&page="+i); | |
85 | + } | |
86 | + } | |
87 | + if (page.getUrl().get().equals("https://www.nature.com/search?q=batteries&journal=nmat&order=relevance")){ | |
88 | + for (int i = 1; i <= number; i++) { | |
89 | + page.addTargetRequest("https://www.nature.com/search?q=batteries&journal=nmat&page="+i); | |
90 | + } | |
91 | + } | |
92 | + } | |
93 | + | |
94 | + public void everyPage(Page page){ | |
95 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
96 | + for (int i = 0; i < all.size(); i++) { | |
97 | +// log.info(all.get(i)); | |
98 | + page.addTargetRequest("https://www.nature.com"+all.get(i)); | |
99 | + } | |
100 | + } | |
101 | + | |
102 | + private void doArticleContent(Page page) { | |
103 | + if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) { | |
104 | + return; | |
105 | + } | |
106 | + //解析页面 | |
107 | + Html html = page.getHtml(); | |
108 | + String articleCode = page.getUrl().get(); | |
109 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
110 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
111 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
112 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
113 | + | |
114 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
115 | + if (StringUtils.isBlank(title)) { | |
116 | + title = headSelectable.xpath("//h1/text()").get(); | |
117 | + } | |
118 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
119 | + String publishTime; | |
120 | + Date publishTimeDateTime = null; | |
121 | + try { | |
122 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
123 | + } catch (Exception e) { | |
124 | + try { | |
125 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
126 | + } catch (Exception e1) { | |
127 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
128 | + } | |
129 | + } | |
130 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
131 | + | |
132 | + try { | |
133 | + publishTimeDateTime = formatter.parse(publishTime); | |
134 | + } catch (ParseException e) { | |
135 | + e.printStackTrace(); | |
136 | + } | |
137 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
138 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
139 | + StringBuffer authorName = new StringBuffer(); | |
140 | + for (Selectable node : authorNodes) { | |
141 | + authorName.append(node.xpath("//a/text()")); | |
142 | + } | |
143 | + | |
144 | + JSONArray authorAddress = new JSONArray(); | |
145 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
146 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
147 | + for (Selectable selectable : authorAddressList) { | |
148 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
149 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
150 | + JSONObject object = new JSONObject(); | |
151 | + object.put("address", address); | |
152 | + object.put("authorNames", authorNames); | |
153 | + authorAddress.add(object); | |
154 | + } | |
155 | + } | |
156 | + | |
157 | + JSONArray references = new JSONArray(); | |
158 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
159 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
160 | + for (Selectable reference : referenceList) { | |
161 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
162 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
163 | + List<String> links = new ArrayList<>(); | |
164 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
165 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
166 | + } | |
167 | + JSONObject object = new JSONObject(); | |
168 | + object.put("referenceTitle", referenceTitle); | |
169 | + object.put("links", links); | |
170 | +// if (CollectionUtils.isNotEmpty(links)) { | |
171 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
172 | +// } | |
173 | + references.add(object); | |
174 | + } | |
175 | + } | |
176 | + | |
177 | + JSONArray authorEmail = new JSONArray(); | |
178 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
179 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
180 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
181 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
182 | + JSONObject jsonObject = new JSONObject(); | |
183 | + jsonObject.put("authorEmailName", authorEmailName); | |
184 | + jsonObject.put("email", email); | |
185 | + authorEmail.add(jsonObject); | |
186 | + } | |
187 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
188 | + | |
189 | + page.putField("article", ArticleDO.builder() | |
190 | + .articleType(ArticleTypeEnum.NATURE_MATERIAL.getType()) | |
191 | + .articleCode(articleCode) | |
192 | + .authorName(authorName.toString()) | |
193 | + .title(title) | |
194 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
195 | + .emailInfo(authorEmail.toJSONString()) | |
196 | + .articleDesc(articleDesc) | |
197 | + .authorAddress(authorAddress.toJSONString()) | |
198 | + .referenceInfo(references.toJSONString()).build()); | |
199 | + } | |
200 | + | |
201 | + public static void main(String[] args) { | |
202 | + Spider.create(new MatterPagePcoessor()) | |
203 | + .addUrl("https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&page=1") | |
204 | + .addPipeline(new ArticlePipeline()) | |
205 | + .thread(1).run(); | |
206 | + } | |
207 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureMethodsPcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import lombok.extern.slf4j.Slf4j; | |
12 | +import org.springframework.stereotype.Component; | |
13 | +import us.codecraft.webmagic.Page; | |
14 | +import us.codecraft.webmagic.Site; | |
15 | +import us.codecraft.webmagic.Spider; | |
16 | +import us.codecraft.webmagic.processor.PageProcessor; | |
17 | +import us.codecraft.webmagic.selector.Html; | |
18 | +import us.codecraft.webmagic.selector.Selectable; | |
19 | +import us.codecraft.webmagic.selector.XpathSelector; | |
20 | + | |
21 | +import java.text.ParseException; | |
22 | +import java.text.SimpleDateFormat; | |
23 | +import java.util.*; | |
24 | +import java.util.stream.Collectors; | |
25 | + | |
26 | +@Slf4j | |
27 | +@Component | |
28 | +public class NatureMethodsPcoessor implements PageProcessor { | |
29 | + @Override | |
30 | + public void process(Page page) { | |
31 | + String url = page.getUrl().get(); | |
32 | + if (url.equals("https://www.nature.com/nmeth/")) { | |
33 | + everyPage(page); | |
34 | + }else if (url.contains("https://www.nature.com/")){ | |
35 | + doArticleContent(page); | |
36 | + } | |
37 | + } | |
38 | + | |
39 | + @Override | |
40 | + public Site getSite() { | |
41 | + return PageProcessor.super.getSite(); | |
42 | + } | |
43 | + public void everyPage(Page page){ | |
44 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
45 | + for (int i = 0; i < all.size(); i++) { | |
46 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | |
47 | + } | |
48 | + } | |
49 | + | |
50 | + private void doArticleContent(Page page) { | |
51 | + Html html = page.getHtml(); | |
52 | + String articleCode = page.getUrl().get(); | |
53 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
54 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
55 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
56 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
57 | + | |
58 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
59 | + if (StringUtils.isBlank(title)) { | |
60 | + title = headSelectable.xpath("//h1/text()").get(); | |
61 | + } | |
62 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
63 | + String publishTime; | |
64 | + Date publishTimeDateTime = null; | |
65 | + try { | |
66 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
67 | + } catch (Exception e) { | |
68 | + try { | |
69 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
70 | + } catch (Exception e1) { | |
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
72 | + } | |
73 | + } | |
74 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
75 | + | |
76 | + try { | |
77 | + publishTimeDateTime = formatter.parse(publishTime); | |
78 | + } catch (ParseException e) { | |
79 | + e.printStackTrace(); | |
80 | + } | |
81 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
82 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
83 | + StringBuffer authorName = new StringBuffer(); | |
84 | + for (Selectable node : authorNodes) { | |
85 | + authorName.append(node.xpath("//a/text()")); | |
86 | + } | |
87 | + | |
88 | + JSONArray authorAddress = new JSONArray(); | |
89 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
90 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
91 | + for (Selectable selectable : authorAddressList) { | |
92 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
93 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
94 | + JSONObject object = new JSONObject(); | |
95 | + object.put("address", address); | |
96 | + object.put("authorNames", authorNames); | |
97 | + authorAddress.add(object); | |
98 | + } | |
99 | + } | |
100 | + | |
101 | + JSONArray references = new JSONArray(); | |
102 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
103 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
104 | + for (Selectable reference : referenceList) { | |
105 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
106 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
107 | + List<String> links = new ArrayList<>(); | |
108 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
109 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
110 | + } | |
111 | + JSONObject object = new JSONObject(); | |
112 | + object.put("referenceTitle", referenceTitle); | |
113 | + object.put("links", links); | |
114 | +// if (CollectionUtils.isNotEmpty(links)) { | |
115 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
116 | +// } | |
117 | + references.add(object); | |
118 | + } | |
119 | + } | |
120 | + | |
121 | + JSONArray authorEmail = new JSONArray(); | |
122 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
123 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
124 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
125 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
126 | + JSONObject jsonObject = new JSONObject(); | |
127 | + jsonObject.put("authorEmailName", authorEmailName); | |
128 | + jsonObject.put("email", email); | |
129 | + authorEmail.add(jsonObject); | |
130 | + } | |
131 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
132 | + | |
133 | + page.putField("article", ArticleDO.builder() | |
134 | + .articleType(ArticleTypeEnum.NATURE_METHODS.getType()) | |
135 | + .articleCode(articleCode) | |
136 | + .authorName(authorName.toString()) | |
137 | + .title(title) | |
138 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
139 | + .emailInfo(authorEmail.toJSONString()) | |
140 | + .articleDesc(articleDesc) | |
141 | + .authorAddress(authorAddress.toJSONString()) | |
142 | + .referenceInfo(references.toJSONString()).build()); | |
143 | + } | |
144 | + | |
145 | + public static void main(String[] args) { | |
146 | + Spider.create(new MatterPagePcoessor()) | |
147 | + .addUrl("https://www.nature.com/nenergy/research-articles") | |
148 | + .addPipeline(new ArticlePipeline()) | |
149 | + .thread(1).run(); | |
150 | + } | |
151 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import lombok.extern.slf4j.Slf4j; | |
11 | +import org.springframework.stereotype.Component; | |
12 | +import us.codecraft.webmagic.Page; | |
13 | +import us.codecraft.webmagic.Site; | |
14 | +import us.codecraft.webmagic.processor.PageProcessor; | |
15 | +import us.codecraft.webmagic.selector.Html; | |
16 | +import us.codecraft.webmagic.selector.Selectable; | |
17 | +import us.codecraft.webmagic.selector.XpathSelector; | |
18 | + | |
19 | +import java.text.ParseException; | |
20 | +import java.text.SimpleDateFormat; | |
21 | +import java.util.*; | |
22 | +import java.util.stream.Collectors; | |
23 | + | |
24 | +@Component | |
25 | +@Slf4j | |
26 | +public class NatureNanotechnologyProcessor implements PageProcessor { | |
27 | + | |
28 | + //目前只有一页 | |
29 | + @Override | |
30 | + public void process(Page page) { | |
31 | + String url = page.getUrl().get(); | |
32 | + if (url.equals("https://www.nature.com/nnano/")){ | |
33 | + everyPage(page); | |
34 | + } else if (url.contains("https://www.nature.com/")){ | |
35 | + doArticleContent(page); | |
36 | + } | |
37 | + } | |
38 | + | |
39 | + @Override | |
40 | + public Site getSite() { | |
41 | + return PageProcessor.super.getSite(); | |
42 | + } | |
43 | + | |
44 | + public void getIndex(Page page){ | |
45 | + String maxIndex = page.getHtml().xpath("//div[@class='u-mb-48']/nav/ul/li[6]/a/text()").get(); | |
46 | + log.info(maxIndex); | |
47 | + String trim = maxIndex.trim(); | |
48 | + int number = Integer.parseInt(trim); | |
49 | + System.out.printf("", number); | |
50 | + for (int i = 0; i < number; i++) { | |
51 | + page.addTargetRequest("https://www.nature.com/nenergy/research-articles?searchType=journalSearch&sort=PubDate&page="+i); | |
52 | + } | |
53 | + } | |
54 | + | |
55 | + public void everyPage(Page page){ | |
56 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
57 | + for (int i = 0; i < all.size(); i++) { | |
58 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | |
59 | + } | |
60 | + } | |
61 | + | |
62 | + private void doArticleContent(Page page) { | |
63 | + Html html = page.getHtml(); | |
64 | + String articleCode = page.getUrl().get(); | |
65 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
66 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
67 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
68 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
69 | + | |
70 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
71 | + if (StringUtils.isBlank(title)) { | |
72 | + title = headSelectable.xpath("//h1/text()").get(); | |
73 | + } | |
74 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
75 | + String publishTime; | |
76 | + Date publishTimeDateTime = null; | |
77 | + try { | |
78 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
79 | + } catch (Exception e) { | |
80 | + try { | |
81 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
82 | + } catch (Exception e1) { | |
83 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
84 | + } | |
85 | + } | |
86 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
87 | + | |
88 | + try { | |
89 | + publishTimeDateTime = formatter.parse(publishTime); | |
90 | + } catch (ParseException e) { | |
91 | + e.printStackTrace(); | |
92 | + } | |
93 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
94 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
95 | + StringBuffer authorName = new StringBuffer(); | |
96 | + for (Selectable node : authorNodes) { | |
97 | + authorName.append(node.xpath("//a/text()")); | |
98 | + } | |
99 | + | |
100 | + JSONArray authorAddress = new JSONArray(); | |
101 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
102 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
103 | + for (Selectable selectable : authorAddressList) { | |
104 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
105 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
106 | + JSONObject object = new JSONObject(); | |
107 | + object.put("address", address); | |
108 | + object.put("authorNames", authorNames); | |
109 | + authorAddress.add(object); | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + JSONArray references = new JSONArray(); | |
114 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
115 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
116 | + for (Selectable reference : referenceList) { | |
117 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
118 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
119 | + List<String> links = new ArrayList<>(); | |
120 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
121 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
122 | + } | |
123 | + JSONObject object = new JSONObject(); | |
124 | + object.put("referenceTitle", referenceTitle); | |
125 | + object.put("links", links); | |
126 | +// if (CollectionUtils.isNotEmpty(links)) { | |
127 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
128 | +// } | |
129 | + references.add(object); | |
130 | + } | |
131 | + } | |
132 | + | |
133 | + JSONArray authorEmail = new JSONArray(); | |
134 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
135 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
136 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
137 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
138 | + JSONObject jsonObject = new JSONObject(); | |
139 | + jsonObject.put("authorEmailName", authorEmailName); | |
140 | + jsonObject.put("email", email); | |
141 | + authorEmail.add(jsonObject); | |
142 | + } | |
143 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
144 | + | |
145 | + page.putField("article", ArticleDO.builder() | |
146 | + .articleType(ArticleTypeEnum.NATURE_NANOTECHNOLOGY.getType()) | |
147 | + .articleCode(articleCode) | |
148 | + .authorName(authorName.toString()) | |
149 | + .title(title) | |
150 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
151 | + .emailInfo(authorEmail.toJSONString()) | |
152 | + .articleDesc(articleDesc) | |
153 | + .authorAddress(authorAddress.toJSONString()) | |
154 | + .referenceInfo(references.toJSONString()).build()); | |
155 | + } | |
156 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NaturePhysicsProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
7 | +import com.canrd.webmagic.common.utils.StringUtils; | |
8 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
9 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
10 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
11 | +import lombok.extern.slf4j.Slf4j; | |
12 | +import org.springframework.stereotype.Component; | |
13 | +import us.codecraft.webmagic.Page; | |
14 | +import us.codecraft.webmagic.Site; | |
15 | +import us.codecraft.webmagic.Spider; | |
16 | +import us.codecraft.webmagic.processor.PageProcessor; | |
17 | +import us.codecraft.webmagic.selector.Html; | |
18 | +import us.codecraft.webmagic.selector.Selectable; | |
19 | +import us.codecraft.webmagic.selector.XpathSelector; | |
20 | + | |
21 | +import java.text.ParseException; | |
22 | +import java.text.SimpleDateFormat; | |
23 | +import java.util.*; | |
24 | +import java.util.stream.Collectors; | |
25 | + | |
26 | +@Slf4j | |
27 | +@Component | |
28 | +public class NaturePhysicsProcessor implements PageProcessor{ | |
29 | + @Override | |
30 | + public void process(Page page) { | |
31 | + String url = page.getUrl().get(); | |
32 | + if (url.equals("https://www.nature.com/nphys/")) { | |
33 | + everyPage(page); | |
34 | + }else if (url.contains("https://www.nature.com/")){ | |
35 | + doArticleContent(page); | |
36 | + } | |
37 | + } | |
38 | + | |
39 | + @Override | |
40 | + public Site getSite() { | |
41 | + return PageProcessor.super.getSite(); | |
42 | + } | |
43 | + public void everyPage(Page page){ | |
44 | + List<String> all = page.getHtml().xpath("//div[@class='u-flex-direction-column']/h3/a//@href").all(); | |
45 | + for (int i = 0; i < all.size(); i++) { | |
46 | + page.addTargetRequest("https://www.nature.com/"+all.get(i)); | |
47 | + } | |
48 | + } | |
49 | + | |
50 | + private void doArticleContent(Page page) { | |
51 | + Html html = page.getHtml(); | |
52 | + String articleCode = page.getUrl().get(); | |
53 | + Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | |
54 | + List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | |
55 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | |
56 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | |
57 | + | |
58 | + String title = headSelectable.xpath("//div/h1/text()").get(); | |
59 | + if (StringUtils.isBlank(title)) { | |
60 | + title = headSelectable.xpath("//h1/text()").get(); | |
61 | + } | |
62 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | |
63 | + String publishTime; | |
64 | + Date publishTimeDateTime = null; | |
65 | + try { | |
66 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | |
67 | + } catch (Exception e) { | |
68 | + try { | |
69 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(1).xpath("//li/time/text()").get(); | |
70 | + } catch (Exception e1) { | |
71 | + publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | |
72 | + } | |
73 | + } | |
74 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
75 | + | |
76 | + try { | |
77 | + publishTimeDateTime = formatter.parse(publishTime); | |
78 | + } catch (ParseException e) { | |
79 | + e.printStackTrace(); | |
80 | + } | |
81 | + Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | |
82 | + List<Selectable> authorNodes = authorSelectable.nodes(); | |
83 | + StringBuffer authorName = new StringBuffer(); | |
84 | + for (Selectable node : authorNodes) { | |
85 | + authorName.append(node.xpath("//a/text()")); | |
86 | + } | |
87 | + | |
88 | + JSONArray authorAddress = new JSONArray(); | |
89 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | |
90 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | |
91 | + for (Selectable selectable : authorAddressList) { | |
92 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | |
93 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | |
94 | + JSONObject object = new JSONObject(); | |
95 | + object.put("address", address); | |
96 | + object.put("authorNames", authorNames); | |
97 | + authorAddress.add(object); | |
98 | + } | |
99 | + } | |
100 | + | |
101 | + JSONArray references = new JSONArray(); | |
102 | + List<Selectable> referenceList = referencesSelectable.nodes(); | |
103 | + if (CollectionUtils.isNotEmpty(referenceList)) { | |
104 | + for (Selectable reference : referenceList) { | |
105 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | |
106 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | |
107 | + List<String> links = new ArrayList<>(); | |
108 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | |
109 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | |
110 | + } | |
111 | + JSONObject object = new JSONObject(); | |
112 | + object.put("referenceTitle", referenceTitle); | |
113 | + object.put("links", links); | |
114 | +// if (CollectionUtils.isNotEmpty(links)) { | |
115 | +// page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | |
116 | +// } | |
117 | + references.add(object); | |
118 | + } | |
119 | + } | |
120 | + | |
121 | + JSONArray authorEmail = new JSONArray(); | |
122 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
123 | + String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | |
124 | + String email = Objects.isNull(split) ? "" : split[split.length - 1]; | |
125 | + String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); | |
126 | + JSONObject jsonObject = new JSONObject(); | |
127 | + jsonObject.put("authorEmailName", authorEmailName); | |
128 | + jsonObject.put("email", email); | |
129 | + authorEmail.add(jsonObject); | |
130 | + } | |
131 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | |
132 | + | |
133 | + page.putField("article", ArticleDO.builder() | |
134 | + .articleType(ArticleTypeEnum.NATURE_PHYSICS.getType()) | |
135 | + .articleCode(articleCode) | |
136 | + .authorName(authorName.toString()) | |
137 | + .title(title) | |
138 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
139 | + .emailInfo(authorEmail.toJSONString()) | |
140 | + .articleDesc(articleDesc) | |
141 | + .authorAddress(authorAddress.toJSONString()) | |
142 | + .referenceInfo(references.toJSONString()).build()); | |
143 | + } | |
144 | + | |
145 | + public static void main(String[] args) { | |
146 | + Spider.create(new MatterPagePcoessor()) | |
147 | + .addUrl("https://www.nature.com/nenergy/research-articles") | |
148 | + .addPipeline(new ArticlePipeline()) | |
149 | + .thread(1).run(); | |
150 | + } | |
151 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... | ... | @@ -3,10 +3,9 @@ package com.canrd.webmagic.processor.download; |
3 | 3 | import com.canrd.webmagic.config.SeleniumConfig; |
4 | 4 | import com.canrd.webmagic.processor.config.Agent; |
5 | 5 | import lombok.extern.slf4j.Slf4j; |
6 | -import org.openqa.selenium.By; | |
7 | -import org.openqa.selenium.Cookie; | |
8 | -import org.openqa.selenium.WebDriver; | |
9 | -import org.openqa.selenium.WebElement; | |
6 | +import org.openqa.selenium.*; | |
7 | +import org.openqa.selenium.support.ui.ExpectedConditions; | |
8 | +import org.openqa.selenium.support.ui.WebDriverWait; | |
10 | 9 | import org.springframework.stereotype.Component; |
11 | 10 | import us.codecraft.webmagic.Page; |
12 | 11 | import us.codecraft.webmagic.Request; |
... | ... | @@ -28,7 +27,7 @@ import java.util.Map; |
28 | 27 | @Slf4j |
29 | 28 | @Component |
30 | 29 | public class SeleniumDownloader extends AbstractDownloader { |
31 | - private int sleepTime = 3000; | |
30 | + private int sleepTime = 3000000; | |
32 | 31 | |
33 | 32 | @Resource |
34 | 33 | private SeleniumConfig config; |
... | ... | @@ -64,8 +63,23 @@ public class SeleniumDownloader extends AbstractDownloader { |
64 | 63 | } |
65 | 64 | |
66 | 65 | log.info("downloading page " + request.getUrl()); |
67 | - | |
68 | 66 | webDriver.get(request.getUrl()); |
67 | + if (request.getUrl().equals("https://www.cell.com/matter/home")) { | |
68 | + WebElement searchText = webDriver.findElement(By.id("searchText")); | |
69 | + searchText.sendKeys("Aluminum foil"); | |
70 | + WebElement element = webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button")); | |
71 | + element.submit(); | |
72 | + WebDriverWait wait = new WebDriverWait(webDriver, 30); | |
73 | + wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?")); | |
74 | +// wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']"))); | |
75 | +// WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input")); | |
76 | +// if (cloudFlare!=null){ | |
77 | +// cloudFlare.click(); | |
78 | +// } | |
79 | + } | |
80 | + if (request.getUrl().contains("https://www.cell.com/action/doSearch?")){ | |
81 | + | |
82 | + } | |
69 | 83 | try { |
70 | 84 | if (sleepTime > 0) { |
71 | 85 | //休眠3秒就是为了动态的数据渲染完成后在进行获取 |
... | ... | @@ -75,6 +89,18 @@ public class SeleniumDownloader extends AbstractDownloader { |
75 | 89 | e.printStackTrace(); |
76 | 90 | } |
77 | 91 | |
92 | +// WebElement targetElement; | |
93 | +// do { | |
94 | +// try { | |
95 | +// targetElement = webDriver.findElement(By.xpath("//h2[@class=\"h2\"]")); | |
96 | +// log.info(String.valueOf(targetElement)); | |
97 | +// log.info("等待验证中"); | |
98 | +// Thread.sleep(sleepTime); // 等待一段时间后再检查 | |
99 | +// } catch (NoSuchElementException e) { | |
100 | +// targetElement = null; // 如果找不到特定元素,则退出循环 | |
101 | +// } | |
102 | +// } while (targetElement != null); | |
103 | + | |
78 | 104 | WebElement webElement = webDriver.findElement(By.xpath("/html")); |
79 | 105 | String content = webElement.getAttribute("outerHTML"); |
80 | 106 | page.setDownloadSuccess(true); |
... | ... |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
... | ... | @@ -59,13 +59,14 @@ public class MyChromeDriver implements BrowserDriver{ |
59 | 59 | options.addArguments("blink-settings=imagesEnabled=false"); |
60 | 60 | // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 |
61 | 61 | options.addArguments("--headless"); |
62 | + //进入指定地址 | |
63 | +// options.setExperimentalOption("debuggerAddress", "127.0.0.1:9222"); | |
62 | 64 | //禁用 blink 特征 |
63 | 65 | options.addArguments("disable-blink-features=AutomationControlled"); |
64 | 66 | options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); |
65 | 67 | options.setExperimentalOption("useAutomationExtension", false); |
66 | 68 | options.addArguments("--remote-allow-origins=*"); |
67 | 69 | |
68 | - | |
69 | 70 | String os_name = System.getProperty("os.name"); |
70 | 71 | // 判断是否是windows系统 |
71 | 72 | if (os_name.toLowerCase().startsWith("win")) { |
... | ... |
src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
... | ... | @@ -27,11 +27,11 @@ public class ArticlePipeline implements Pipeline { |
27 | 27 | public void process(ResultItems resultItems, Task task) { |
28 | 28 | ArticleDO articleDO = resultItems.get("article"); |
29 | 29 | if (Objects.nonNull(articleDO)) { |
30 | - List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode())); | |
31 | - if (CollectionUtils.isNotEmpty(natureArticleDO)) { | |
32 | - return; | |
33 | - } | |
34 | - articleService.save(articleDO); | |
30 | + List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode())); | |
31 | + if (CollectionUtils.isNotEmpty(natureArticleDO)) { | |
32 | + return; | |
33 | + } | |
34 | + articleService.save(articleDO); | |
35 | 35 | } |
36 | 36 | } |
37 | 37 | } |
... | ... |
src/main/resources/application-test.yml
... | ... | @@ -59,7 +59,7 @@ spring: |
59 | 59 | testOnReturn: true |
60 | 60 | password: 123456 |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
62 | + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
63 | 63 | username: root |
64 | 64 | redis: |
65 | 65 | database: 0 |
... | ... |
target/classes/application-test.yml
... | ... | @@ -59,7 +59,7 @@ spring: |
59 | 59 | testOnReturn: true |
60 | 60 | password: 123456 |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
62 | + url: jdbc:mysql://localhost:3306/webpage?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
63 | 63 | username: root |
64 | 64 | redis: |
65 | 65 | database: 0 |
... | ... |
target/classes/com/canrd/webmagic/DNS/DnsResolver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureCommunicatiosController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureComputationalController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureEnergyController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMaterialController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureMethodsController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NatureNanotechnologyController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/controller/NaturePhysicsController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/domain/ArticleTypeEnum.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class renamed to target/classes/com/canrd/webmagic/processor/MatterPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureCommunicatiosPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureComputationalPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureEnergyPagePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMaterialPagePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureMethodsPcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureNanotechnologyProcessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NaturePhysicsProcessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/NatureSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class
No preview for this file type