Commit 1680e1818f9683472a638fab23d8514617cc7fd8
1 parent
b17c8d02
matter官网
Showing
15 changed files
with
102 additions
and
55 deletions
Too many changes to show.
To preserve performance only 15 of 16 files are displayed.
src/main/java/com/canrd/webmagic/controller/MatterController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.processor.MatterPragePcoessor; | |
5 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
6 | +import io.swagger.annotations.Api; | |
7 | +import io.swagger.annotations.ApiOperation; | |
8 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
9 | +import org.springframework.web.bind.annotation.GetMapping; | |
10 | +import org.springframework.web.bind.annotation.RequestMapping; | |
11 | +import org.springframework.web.bind.annotation.RestController; | |
12 | +import us.codecraft.webmagic.Spider; | |
13 | + | |
14 | +import javax.annotation.Resource; | |
15 | + | |
16 | +@RestController | |
17 | +@RequestMapping("/matter/article") | |
18 | +@Api("Matter") | |
19 | +public class MatterController { | |
20 | + @Resource | |
21 | + private MatterPragePcoessor matterPragePcoessor; | |
22 | + | |
23 | + @Resource | |
24 | + private SeleniumDownloader seleniumDownloader; | |
25 | + | |
26 | + @GetMapping("/start") | |
27 | + @ApiOperation("start") | |
28 | + public ServerResult start() { | |
29 | + Spider.create(matterPragePcoessor) | |
30 | + // Seed URL: the Matter article page this Spider crawls (uses the injected processor bean above) | |
31 | + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | |
32 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
33 | + .setDownloader(seleniumDownloader.setSleepTime(30000)) | |
34 | + // Crawl with 5 threads; run() is invoked on the request thread, before the response is built | |
35 | + .thread(5).run(); | |
36 | + return ServerResult.success(); | |
37 | + } | |
38 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/driver/ChromeBuildDriver.java deleted
100644 → 0
1 | -package com.canrd.webmagic.driver; | |
2 | - | |
3 | -import org.openqa.selenium.chrome.ChromeOptions; | |
4 | - | |
5 | -import java.util.Arrays; | |
6 | - | |
7 | -public class ChromeBuildDriver { | |
8 | - public ChromeOptions build(String DriverPath){ | |
9 | - ChromeOptions options = new ChromeOptions(); | |
10 | - // Add a proxy; the proxy shown here is an example and must be replaced with the real proxy server address and port | |
11 | -// options.addArguments("--proxy-server=http://proxy-server:port"); | |
12 | - | |
13 | - // Disable JavaScript; this sometimes helps bypass Cloudflare's checks | |
14 | - options.addArguments("--disable-javascript"); | |
15 | - | |
16 | - // Disable browser extensions; if Cloudflare is known to rely on a specific extension, it can be disabled | |
17 | - options.addArguments("--disable-extensions"); | |
18 | - | |
19 | - // Disable the local cache so every visit fetches from the server | |
20 | - options.addArguments("--disable-application-cache"); | |
21 | - | |
22 | - // Disable infobars | |
23 | - options.addArguments("--disable-infobars"); | |
24 | - // Works around the "DevToolsActivePort file doesn't exist" error | |
25 | - options.addArguments("--no-sandbox"); | |
26 | - // Set the browser window resolution | |
27 | - options.addArguments("window-size=1920x3000"); | |
28 | - // Google's documentation says this flag is needed to work around a bug | |
29 | - options.addArguments("--disable-gpu"); | |
30 | - // Incognito (private browsing) mode | |
31 | - options.addArguments("--incognito"); | |
32 | - // Start maximized (full-screen window); without it, locating elements raises errors | |
33 | - options.addArguments("--start-maximized"); | |
34 | - // Hide the "browser is being controlled by automated software" notice | |
35 | - options.addArguments("--disable-infobars"); | |
36 | - // Hide scrollbars, to cope with some special pages | |
37 | - options.addArguments("--hide-scrollbars"); | |
38 | - // Do not load images, for speed | |
39 | - options.addArguments("blink-settings=imagesEnabled=false"); | |
40 | - // Run without a visible UI. On Linux, startup fails without this flag if the system has no display support | |
41 | - options.addArguments("--headless"); | |
42 | - // Disable the Blink feature | |
43 | - options.addArguments("disable-blink-features=AutomationControlled"); | |
44 | - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | |
45 | - options.setExperimentalOption("useAutomationExtension", false); | |
46 | - options.addArguments("--remote-allow-origins=*"); | |
47 | - options.setBinary(DriverPath); | |
48 | - return options; | |
49 | - } | |
50 | -} |
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 | 1 | package com.canrd.webmagic.processor; |
2 | 2 | |
3 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
3 | 4 | import us.codecraft.webmagic.Page; |
5 | +import us.codecraft.webmagic.Site; | |
6 | +import us.codecraft.webmagic.Spider; | |
4 | 7 | import us.codecraft.webmagic.processor.PageProcessor; |
8 | +import us.codecraft.webmagic.selector.Html; | |
5 | 9 | |
6 | 10 | public class ChemicalPagePcoessor implements PageProcessor { |
7 | - | |
11 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
8 | 12 | @Override |
9 | 13 | public void process(Page page) { |
10 | 14 | |
15 | + }; | |
16 | + | |
17 | + public void doArticleContent(Page page){ | |
18 | + Html html = page.getHtml(); | |
19 | + | |
20 | + } | |
21 | + | |
22 | + public static void main(String[] args) { | |
23 | + // Create a Spider and hand it our processor | |
24 | + Spider.create(new ChemicalPagePcoessor()) | |
25 | + // Add the URL this Spider should crawl | |
26 | + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") | |
27 | + .addPipeline(new ArticlePipeline()) | |
28 | + // Run with 5 threads and start crawling | |
29 | + .thread(5).run(); | |
11 | 30 | } |
12 | 31 | } |
... | ... |
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
4 | +import org.springframework.stereotype.Component; | |
5 | +import us.codecraft.webmagic.Page; | |
6 | +import us.codecraft.webmagic.Site; | |
7 | +import us.codecraft.webmagic.Spider; | |
8 | +import us.codecraft.webmagic.processor.PageProcessor; | |
9 | +import us.codecraft.webmagic.selector.Html; | |
10 | +@Component | |
11 | +public class MatterPragePcoessor implements PageProcessor { | |
12 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
13 | + @Override | |
14 | + public void process(Page page) { | |
15 | + // Intentionally empty for now: no extraction is implemented yet | |
16 | + } | |
17 | + // Return the Site configured above so its retry/sleep settings take effect instead of the interface default | |
18 | + @Override | |
19 | + public Site getSite() { | |
20 | + return site; | |
21 | + } | |
22 | + | |
23 | + public void doArticleContent(Page page){ | |
24 | + Html html = page.getHtml(); | |
25 | + String articleCode = page.getUrl().get(); | |
26 | +// html.xpath() | |
27 | + } | |
28 | + | |
29 | + public static void main(String[] args) { | |
30 | + // Create a Spider and hand it our processor | |
31 | + Spider.create(new MatterPragePcoessor()) | |
32 | + // Add the URL this Spider should crawl | |
33 | + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20") | |
34 | + .addPipeline(new ArticlePipeline()) | |
35 | + // Run with 5 threads and start crawling | |
36 | + .thread(5).run(); | |
37 | + } | |
38 | +} | |
... | ... |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
... | ... | @@ -16,7 +16,9 @@ import java.util.Arrays; |
16 | 16 | */ |
17 | 17 | @Component |
18 | 18 | public class MyChromeDriver implements BrowserDriver{ |
19 | - private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; | |
19 | +// private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; | |
20 | + | |
21 | + private static final String WIN_DRIVER_PATH = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"; | |
20 | 22 | private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; |
21 | 23 | private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; |
22 | 24 | private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; |
... | ... |
target/classes/application-local.yml
... | ... | @@ -57,13 +57,13 @@ spring: |
57 | 57 | testWhileIdle: true |
58 | 58 | testOnBorrow: true |
59 | 59 | testOnReturn: true |
60 | - password: canrd@2024 | |
60 | + password: 123456 | |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
62 | + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
63 | 63 | username: root |
64 | 64 | redis: |
65 | 65 | database: 0 |
66 | - host: 39.108.227.113 | |
66 | + host: localhost | |
67 | 67 | lettuce: |
68 | 68 | pool: |
69 | 69 | max-active: 2000 |
... | ... |
target/classes/com/canrd/webmagic/config/SeleniumConfig.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/BrowserDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyEdgeDriver.class
0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyFirefoxDriver.class
0 → 100644
No preview for this file type