Commit 1680e1818f9683472a638fab23d8514617cc7fd8
1 parent
b17c8d02
matter官网
Showing
15 changed files
with
102 additions
and
55 deletions
Too many changes to show.
To preserve performance only 15 of 16 files are displayed.
src/main/java/com/canrd/webmagic/controller/MatterController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.processor.MatterPragePcoessor; | ||
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
6 | +import io.swagger.annotations.Api; | ||
7 | +import io.swagger.annotations.ApiOperation; | ||
8 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
9 | +import org.springframework.web.bind.annotation.GetMapping; | ||
10 | +import org.springframework.web.bind.annotation.RequestMapping; | ||
11 | +import org.springframework.web.bind.annotation.RestController; | ||
12 | +import us.codecraft.webmagic.Spider; | ||
13 | + | ||
14 | +import javax.annotation.Resource; | ||
15 | + | ||
16 | +@RestController | ||
17 | +@RequestMapping("/matter/article") | ||
18 | +@Api("Matter") | ||
19 | +public class MatterController { | ||
20 | + @Resource | ||
21 | + private MatterPragePcoessor matterPragePcoessor; | ||
22 | + | ||
23 | + @Resource | ||
24 | + private SeleniumDownloader seleniumDownloader; | ||
25 | + | ||
26 | + @GetMapping("/start") | ||
27 | + @ApiOperation("start") | ||
28 | + public ServerResult start() { | ||
29 | + Spider.create(new MatterPragePcoessor()) | ||
30 | + // Add the page URL this Spider should crawl. NOTE(review): the Spider above is created with a new MatterPragePcoessor, not the injected matterPragePcoessor field - confirm that is intended | ||
31 | + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | ||
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
33 | + .setDownloader(seleniumDownloader.setSleepTime(30000)) | ||
34 | + // Run with 5 threads and start crawling. NOTE(review): run() appears to execute on the calling thread, so this request may not return until the crawl ends - confirm | ||
35 | + .thread(5).run(); | ||
36 | + return ServerResult.success(); | ||
37 | + } | ||
38 | +} |
src/main/java/com/canrd/webmagic/driver/ChromeBuildDriver.java deleted
100644 → 0
1 | -package com.canrd.webmagic.driver; | ||
2 | - | ||
3 | -import org.openqa.selenium.chrome.ChromeOptions; | ||
4 | - | ||
5 | -import java.util.Arrays; | ||
6 | - | ||
7 | -public class ChromeBuildDriver { | ||
8 | - public ChromeOptions build(String DriverPath){ | ||
9 | - ChromeOptions options = new ChromeOptions(); | ||
10 | - // Add a proxy; the proxy used here is only an example and must be replaced with the real proxy server address and port | ||
11 | -// options.addArguments("--proxy-server=http://proxy-server:port"); | ||
12 | - | ||
13 | - // Disable JavaScript; this sometimes helps get past Cloudflare's checks | ||
14 | - options.addArguments("--disable-javascript"); | ||
15 | - | ||
16 | - // Disable browser extensions; if Cloudflare is known to rely on a specific extension, it can be disabled | ||
17 | - options.addArguments("--disable-extensions"); | ||
18 | - | ||
19 | - // Disable the local cache so that every visit is fetched from the server | ||
20 | - options.addArguments("--disable-application-cache"); | ||
21 | - | ||
22 | - // Disable policy handling (literal translation of the original note; the flag set below is --disable-infobars) | ||
23 | - options.addArguments("--disable-infobars"); | ||
24 | - // Works around the "DevToolsActivePort file doesn't exist" error | ||
25 | - options.addArguments("--no-sandbox"); | ||
26 | - // Set the browser resolution | ||
27 | - options.addArguments("window-size=1920x3000"); | ||
28 | - // Google's documentation mentions that this flag is needed to work around a bug | ||
29 | - options.addArguments("--disable-gpu"); | ||
30 | - // Incognito (private browsing) mode | ||
31 | - options.addArguments("--incognito"); | ||
32 | - // Run maximized (full-screen window); without this, locating elements raises errors | ||
33 | - options.addArguments("--start-maximized"); | ||
34 | - // Suppress the "browser is being controlled by automated software" notice | ||
35 | - options.addArguments("--disable-infobars"); | ||
36 | - // Hide scrollbars, to cope with some special pages | ||
37 | - options.addArguments("--hide-scrollbars"); | ||
38 | - // Do not load images, to improve speed | ||
39 | - options.addArguments("blink-settings=imagesEnabled=false"); | ||
40 | - // Run the browser without a visible UI. On Linux, startup fails without this flag if the system has no display support | ||
41 | - options.addArguments("--headless"); | ||
42 | - // Disable the blink feature | ||
43 | - options.addArguments("disable-blink-features=AutomationControlled"); | ||
44 | - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | ||
45 | - options.setExperimentalOption("useAutomationExtension", false); | ||
46 | - options.addArguments("--remote-allow-origins=*"); | ||
47 | - options.setBinary(DriverPath); | ||
48 | - return options; | ||
49 | - } | ||
50 | -} |
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 | package com.canrd.webmagic.processor; | 1 | package com.canrd.webmagic.processor; |
2 | 2 | ||
3 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
3 | import us.codecraft.webmagic.Page; | 4 | import us.codecraft.webmagic.Page; |
5 | +import us.codecraft.webmagic.Site; | ||
6 | +import us.codecraft.webmagic.Spider; | ||
4 | import us.codecraft.webmagic.processor.PageProcessor; | 7 | import us.codecraft.webmagic.processor.PageProcessor; |
8 | +import us.codecraft.webmagic.selector.Html; | ||
5 | 9 | ||
6 | public class ChemicalPagePcoessor implements PageProcessor { | 10 | public class ChemicalPagePcoessor implements PageProcessor { |
7 | - | 11 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); |
8 | @Override | 12 | @Override |
9 | public void process(Page page) { | 13 | public void process(Page page) { |
10 | 14 | ||
15 | + }; | ||
16 | + | ||
17 | + public void doArticleContent(Page page){ | ||
18 | + Html html = page.getHtml(); | ||
19 | + | ||
20 | + } | ||
21 | + | ||
22 | + public static void main(String[] args) { | ||
23 | + // Create a Spider and hand it our processor | ||
24 | + Spider.create(new ChemicalPagePcoessor()) | ||
25 | + // Add the page URL this Spider should crawl | ||
26 | + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") | ||
27 | + .addPipeline(new ArticlePipeline()) | ||
28 | + // Run with 5 threads and start crawling | ||
29 | + .thread(5).run(); | ||
11 | } | 30 | } |
12 | } | 31 | } |
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
4 | +import org.springframework.stereotype.Component; | ||
5 | +import us.codecraft.webmagic.Page; | ||
6 | +import us.codecraft.webmagic.Site; | ||
7 | +import us.codecraft.webmagic.Spider; | ||
8 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
9 | +import us.codecraft.webmagic.selector.Html; | ||
10 | +@Component | ||
11 | +public class MatterPragePcoessor implements PageProcessor { | ||
12 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | ||
13 | + @Override | ||
14 | + public void process(Page page) { | ||
15 | + | ||
16 | + } | ||
17 | + | ||
18 | + @Override | ||
19 | + public Site getSite() { | ||
20 | + return PageProcessor.super.getSite(); | ||
21 | + } | ||
22 | + | ||
23 | + public void doArticleContent(Page page){ | ||
24 | + Html html = page.getHtml(); | ||
25 | + String articleCode = page.getUrl().get(); | ||
26 | +// html.xpath() | ||
27 | + } | ||
28 | + | ||
29 | + public static void main(String[] args) { | ||
30 | + // Create a Spider and hand it our processor | ||
31 | + Spider.create(new MatterPragePcoessor()) | ||
32 | + // Add the page URL this Spider should crawl | ||
33 | + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | ||
34 | + .addPipeline(new ArticlePipeline()) | ||
35 | + // Run with 5 threads and start crawling | ||
36 | + .thread(5).run(); | ||
37 | + } | ||
38 | +} |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
@@ -16,7 +16,9 @@ import java.util.Arrays; | @@ -16,7 +16,9 @@ import java.util.Arrays; | ||
16 | */ | 16 | */ |
17 | @Component | 17 | @Component |
18 | public class MyChromeDriver implements BrowserDriver{ | 18 | public class MyChromeDriver implements BrowserDriver{ |
19 | - private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; | 19 | +// private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; |
20 | + | ||
21 | + private static final String WIN_DRIVER_PATH = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"; | ||
20 | private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | 22 | private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; |
21 | private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; | 23 | private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; |
22 | private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; | 24 | private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; |
target/classes/application-local.yml
@@ -57,13 +57,13 @@ spring: | @@ -57,13 +57,13 @@ spring: | ||
57 | testWhileIdle: true | 57 | testWhileIdle: true |
58 | testOnBorrow: true | 58 | testOnBorrow: true |
59 | testOnReturn: true | 59 | testOnReturn: true |
60 | - password: canrd@2024 | 60 | + password: 123456 |
61 | time-between-eviction-runs-millis: 1000 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | 62 | + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | username: root | 63 | username: root |
64 | redis: | 64 | redis: |
65 | database: 0 | 65 | database: 0 |
66 | - host: 39.108.227.113 | 66 | + host: localhost |
67 | lettuce: | 67 | lettuce: |
68 | pool: | 68 | pool: |
69 | max-active: 2000 | 69 | max-active: 2000 |
target/classes/com/canrd/webmagic/config/SeleniumConfig.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/BrowserDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyEdgeDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyFirefoxDriver.class
0 → 100644
No preview for this file type