Commit 236651f679bba19ae4fe5e9ebdac5961389d89f3
1 parent
f7957dbb
新增了火狐和edge驱动
Showing
8 changed files
with
240 additions
and
59 deletions
pom.xml
@@ -48,6 +48,7 @@ | @@ -48,6 +48,7 @@ | ||
48 | <easyexcel.version>2.2.3</easyexcel.version> | 48 | <easyexcel.version>2.2.3</easyexcel.version> |
49 | <webmagic.version>0.10.0</webmagic.version> | 49 | <webmagic.version>0.10.0</webmagic.version> |
50 | <selenium.version>3.4.0</selenium.version> | 50 | <selenium.version>3.4.0</selenium.version> |
51 | +<!-- <browsermob.version>2.1.5</browsermob.version>--> | ||
51 | </properties> | 52 | </properties> |
52 | 53 | ||
53 | <dependencies> | 54 | <dependencies> |
@@ -63,6 +64,12 @@ | @@ -63,6 +64,12 @@ | ||
63 | </exclusion> | 64 | </exclusion> |
64 | </exclusions> | 65 | </exclusions> |
65 | </dependency> | 66 | </dependency> |
67 | +<!-- <dependency>--> | ||
68 | +<!-- <groupId>net.lightbody.bmp</groupId>--> | ||
69 | +<!-- <artifactId>browsermob-core</artifactId>--> | ||
70 | +<!-- <version>${browsermob.version}</version>--> | ||
71 | +<!-- </dependency>--> | ||
72 | + | ||
66 | 73 | ||
67 | <!-- webmagic核心库 --> | 74 | <!-- webmagic核心库 --> |
68 | <dependency> | 75 | <dependency> |
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 | package com.canrd.webmagic.config; | 1 | package com.canrd.webmagic.config; |
2 | 2 | ||
3 | +import com.canrd.webmagic.processor.driver.BrowserDriver; | ||
4 | +import com.canrd.webmagic.processor.driver.MyChromeDriver; | ||
5 | +import com.canrd.webmagic.processor.driver.MyFirefoxDriver; | ||
6 | +import lombok.Data; | ||
3 | import org.openqa.selenium.WebDriver; | 7 | import org.openqa.selenium.WebDriver; |
4 | -import org.openqa.selenium.chrome.ChromeDriver; | ||
5 | -import org.openqa.selenium.chrome.ChromeOptions; | ||
6 | -import org.springframework.context.annotation.Bean; | 8 | +import org.springframework.beans.factory.annotation.Autowired; |
9 | +import org.springframework.context.ApplicationContext; | ||
10 | +import org.springframework.context.ApplicationContextAware; | ||
7 | import org.springframework.context.annotation.Configuration; | 11 | import org.springframework.context.annotation.Configuration; |
8 | 12 | ||
9 | -import java.util.Arrays; | 13 | +import java.util.*; |
14 | + | ||
10 | 15 | ||
11 | /** | 16 | /** |
12 | * @author: xms | 17 | * @author: xms |
@@ -15,62 +20,32 @@ import java.util.Arrays; | @@ -15,62 +20,32 @@ import java.util.Arrays; | ||
15 | * @version: 1.0 | 20 | * @version: 1.0 |
16 | */ | 21 | */ |
17 | @Configuration | 22 | @Configuration |
18 | -public class SeleniumConfig { | 23 | +@Data |
24 | +public class SeleniumConfig implements ApplicationContextAware { | ||
19 | 25 | ||
20 | -// @Bean | ||
21 | - public WebDriver webDriver() throws InterruptedException { | ||
22 | - // 初始化ChromeOptions | ||
23 | - ChromeOptions options = new ChromeOptions(); | ||
24 | - // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口 | ||
25 | -// options.addArguments("--proxy-server=http://proxy-server:port"); | 26 | + private static ApplicationContext context; |
26 | 27 | ||
27 | - // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查 | ||
28 | - options.addArguments("--disable-javascript"); | 28 | + @Override |
29 | + public void setApplicationContext(ApplicationContext applicationContext) { | ||
30 | + context = applicationContext; | ||
31 | + } | ||
29 | 32 | ||
30 | - // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它 | ||
31 | - options.addArguments("--disable-extensions"); | 33 | + public static final String FIREFOX_KEY = "myFirefoxDriver"; |
34 | + public static final String CHROME_KEY = "myChromeDriver"; | ||
35 | + public static final String EDGE_KEY = "myEdgeDriver"; | ||
32 | 36 | ||
33 | - // 禁用本地缓存,确保每次访问都从服务器获取 | ||
34 | - options.addArguments("--disable-application-cache"); | 37 | + public static final List<String> DRIVER_KEYS = new ArrayList<>(Arrays.asList( |
38 | + CHROME_KEY, | ||
39 | + FIREFOX_KEY, | ||
40 | + EDGE_KEY | ||
41 | + )); | ||
35 | 42 | ||
36 | - // 禁止策略化 | ||
37 | - options.addArguments("--disable-infobars"); | ||
38 | - // 解决DevToolsActivePort文件不存在的报错 | ||
39 | - options.addArguments("--no-sandbox"); | ||
40 | - // 指定浏览器分辨 | ||
41 | - options.addArguments("window-size=1920x3000"); | ||
42 | - // 谷歌文档提到需要加上这个属性来规避bug | ||
43 | - options.addArguments("--disable-gpu"); | ||
44 | - // 隐身模式(无痕模式) | ||
45 | - options.addArguments("--incognito"); | ||
46 | - // 最大化运行(全屏窗口),不设置,取元素会报错 | ||
47 | - options.addArguments("--start-maximized"); | ||
48 | - // 禁用浏览器正在被自动化程序控制的提示 | ||
49 | - options.addArguments("--disable-infobars"); | ||
50 | - // 隐藏滚动条, 应对一些特殊页面 | ||
51 | - options.addArguments("--hide-scrollbars"); | ||
52 | - // 不加载图片, 提升速度 | ||
53 | - options.addArguments("blink-settings=imagesEnabled=false"); | ||
54 | - // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 | ||
55 | - options.addArguments("--headless"); | ||
56 | - //禁用 blink 特征 | ||
57 | - options.addArguments("disable-blink-features=AutomationControlled"); | ||
58 | - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | ||
59 | - options.setExperimentalOption("useAutomationExtension", false); | ||
60 | - options.addArguments("--remote-allow-origins=*"); | ||
61 | 43 | ||
62 | - String os_name = System.getProperty("os.name"); | ||
63 | - // 判断是否是windows系统 | ||
64 | - if (os_name.toLowerCase().startsWith("win")) { | ||
65 | - // windows | ||
66 | - options.setBinary("D:\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell-win64\\chrome-headless-shell.exe"); | ||
67 | - System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); | ||
68 | - } else { | ||
69 | - // linux | ||
70 | - options.setBinary("/home/canrd/webmagic/chrome/chrome-linux64/chrome"); | ||
71 | - System.setProperty("webdriver.chrome.driver", "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"); | 44 | + public WebDriver getWebDriver(String key) { |
45 | + Object bean = context.getBean(key); | ||
46 | + if (bean instanceof BrowserDriver) { | ||
47 | + return ((BrowserDriver) bean).getDriver(); | ||
72 | } | 48 | } |
73 | - | ||
74 | - return new ChromeDriver(options); | 49 | + return null; |
75 | } | 50 | } |
76 | } | 51 | } |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
49 | Page page = Page.fail(); | 49 | Page page = Page.fail(); |
50 | WebDriver webDriver = null; | 50 | WebDriver webDriver = null; |
51 | try { | 51 | try { |
52 | - webDriver = config.webDriver(); | 52 | + //0默认谷歌浏览器 |
53 | + webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0)); | ||
53 | WebDriver.Options manage = webDriver.manage(); | 54 | WebDriver.Options manage = webDriver.manage(); |
54 | Site site = task.getSite(); | 55 | Site site = task.getSite(); |
55 | site.setUserAgent(Agent.getRandom()); | 56 | site.setUserAgent(Agent.getRandom()); |
@@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
88 | } finally { | 89 | } finally { |
89 | if (webDriver != null) { | 90 | if (webDriver != null) { |
90 | webDriver.close(); | 91 | webDriver.close(); |
91 | - webDriver.quit(); | 92 | +// webDriver.quit(); |
92 | webDriver = null; | 93 | webDriver = null; |
93 | } | 94 | } |
94 | } | 95 | } |
src/main/java/com/canrd/webmagic/processor/driver/BrowserDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | ||
2 | + | ||
3 | +import org.openqa.selenium.WebDriver; | ||
4 | + | ||
5 | +/** | |
6 | + * @author zhongnanhuang | |
7 | + * @version 1.0 | |
8 | + * @project webmagic-canrd-service | |
9 | + * @description Browser driver abstraction: {@link #getDriver()} supplies the Selenium WebDriver for one browser type. | |
10 | + * @date 2024/5/23 10:46:16 | |
11 | + */ | |
12 | +public interface BrowserDriver { | |
13 | + | |
14 | + WebDriver getDriver(); | |
15 | + | |
16 | +} |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | ||
2 | + | ||
3 | +import org.openqa.selenium.WebDriver; | ||
4 | +import org.openqa.selenium.chrome.ChromeOptions; | ||
5 | +import org.springframework.stereotype.Component; | ||
6 | + | ||
7 | +import java.util.Arrays; | ||
8 | + | ||
9 | +/** | ||
10 | + * @author zhongnanhuang | ||
11 | + * @version 1.0 | ||
12 | + * @project webmagic-canrd-service | ||
13 | + * @description 谷歌浏览器驱动 | ||
14 | + * @date 2024/5/23 10:49:30 | ||
15 | + * 驱动下载地址:https://storage.googleapis.com/chrome-for-testing-public | ||
16 | + */ | ||
17 | +@Component | ||
18 | +public class MyChromeDriver implements BrowserDriver{ | ||
19 | + private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; | ||
20 | + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | ||
21 | + private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; | ||
22 | + private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; | ||
23 | + | ||
24 | + @Override | ||
25 | + public WebDriver getDriver() { | ||
26 | + // 初始化ChromeOptions | ||
27 | + ChromeOptions options = new ChromeOptions(); | ||
28 | + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口 | ||
29 | +// options.addArguments("--proxy-server=http://proxy-server:port"); | ||
30 | + | ||
31 | + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查 | ||
32 | + options.addArguments("--disable-javascript"); | ||
33 | + | ||
34 | + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它 | ||
35 | + options.addArguments("--disable-extensions"); | ||
36 | + | ||
37 | + // 禁用本地缓存,确保每次访问都从服务器获取 | ||
38 | + options.addArguments("--disable-application-cache"); | ||
39 | + | ||
40 | + // 禁止策略化 | ||
41 | + options.addArguments("--disable-infobars"); | ||
42 | + // 解决DevToolsActivePort文件不存在的报错 | ||
43 | + options.addArguments("--no-sandbox"); | ||
44 | + // 指定浏览器分辨 | ||
45 | + options.addArguments("window-size=1920x3000"); | ||
46 | + // 谷歌文档提到需要加上这个属性来规避bug | ||
47 | + options.addArguments("--disable-gpu"); | ||
48 | + // 隐身模式(无痕模式) | ||
49 | + options.addArguments("--incognito"); | ||
50 | + // 最大化运行(全屏窗口),不设置,取元素会报错 | ||
51 | + options.addArguments("--start-maximized"); | ||
52 | + // 禁用浏览器正在被自动化程序控制的提示 | ||
53 | + options.addArguments("--disable-infobars"); | ||
54 | + // 隐藏滚动条, 应对一些特殊页面 | ||
55 | + options.addArguments("--hide-scrollbars"); | ||
56 | + // 不加载图片, 提升速度 | ||
57 | + options.addArguments("blink-settings=imagesEnabled=false"); | ||
58 | + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 | ||
59 | + options.addArguments("--headless"); | ||
60 | + //禁用 blink 特征 | ||
61 | + options.addArguments("disable-blink-features=AutomationControlled"); | ||
62 | + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | ||
63 | + options.setExperimentalOption("useAutomationExtension", false); | ||
64 | + options.addArguments("--remote-allow-origins=*"); | ||
65 | + | ||
66 | + | ||
67 | + String os_name = System.getProperty("os.name"); | ||
68 | + // 判断是否是windows系统 | ||
69 | + if (os_name.toLowerCase().startsWith("win")) { | ||
70 | + // windows | ||
71 | + options.setBinary(WIN_BINARY_PATH); | ||
72 | + System.setProperty("webdriver.chrome.driver", WIN_DRIVER_PATH); | ||
73 | + } else { | ||
74 | + // linux | ||
75 | + options.setBinary(LINUX_BINARY_PATH); | ||
76 | + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH); | ||
77 | + } | ||
78 | + | ||
79 | + return new org.openqa.selenium.chrome.ChromeDriver(options); | ||
80 | + } | ||
81 | +} |
src/main/java/com/canrd/webmagic/processor/driver/MyEdgeDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | ||
2 | + | ||
3 | +import org.openqa.selenium.WebDriver; | ||
4 | +import org.openqa.selenium.chrome.ChromeDriver; | ||
5 | +import org.openqa.selenium.chrome.ChromeOptions; | ||
6 | +import org.openqa.selenium.edge.EdgeDriver; | ||
7 | +import org.openqa.selenium.edge.EdgeOptions; | ||
8 | +import org.openqa.selenium.firefox.FirefoxOptions; | ||
9 | +import org.openqa.selenium.firefox.FirefoxProfile; | ||
10 | +import org.springframework.stereotype.Component; | ||
11 | + | ||
12 | +import java.time.Duration; | ||
13 | + | ||
14 | +/** | ||
15 | + * @author zhongnanhuang | ||
16 | + * @version 1.0 | ||
17 | + * @project webmagic-canrd-service | ||
18 | + * @description 火狐浏览器驱动 | ||
19 | + * @date 2024/5/23 10:45:50 | ||
20 | + * 驱动下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads | ||
21 | + */ | ||
22 | +@Component | ||
23 | +public class MyEdgeDriver implements BrowserDriver{ | ||
24 | + private static final String WIN_DRIVER_PATH = "D:\\driver\\edge\\msedgedriver.exe"; | ||
25 | + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | ||
26 | + | ||
27 | + @Override | ||
28 | + public WebDriver getDriver() { | ||
29 | + // 设置 Chrome 驱动程序路径 | ||
30 | + System.setProperty("webdriver.edge.driver", WIN_DRIVER_PATH); // 替换为实际的驱动程序路径 | ||
31 | + | ||
32 | + // 初始化 ChromeOptions 对象 | ||
33 | + EdgeOptions options = new EdgeOptions(); | ||
34 | +// options.setPageLoadTimeout(Duration.ofSeconds(30)); // 设置页面加载超时时间为30秒 | ||
35 | +// options.addArguments("--blink-settings=imagesEnabled=false"); // 禁用图片加载 | ||
36 | +// options.addArguments("--disable-notifications"); // 禁用浏览器通知 | ||
37 | +// options.addArguments("--disable-infobars"); // 禁用信息栏 | ||
38 | +// options.addArguments("--disable-extensions"); // 禁用扩展 | ||
39 | +// options.addArguments("--disable-dev-shm-usage"); // 禁用/dev/shm使用 | ||
40 | +// options.setHeadless(true); | ||
41 | + | ||
42 | + | ||
43 | + // 设置其他选项,例如添加扩展程序等 | ||
44 | + // 初始化 WebDriver | ||
45 | + EdgeDriver driver = new EdgeDriver(options); | ||
46 | + | ||
47 | + return driver; | ||
48 | + } | ||
49 | +} |
src/main/java/com/canrd/webmagic/processor/driver/MyFirefoxDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | ||
2 | + | ||
3 | +import org.openqa.selenium.WebDriver; | ||
4 | +import org.openqa.selenium.firefox.FirefoxOptions; | ||
5 | +import org.openqa.selenium.firefox.FirefoxProfile; | ||
6 | +import org.springframework.stereotype.Component; | ||
7 | + | ||
8 | +/** | ||
9 | + * @author zhongnanhuang | ||
10 | + * @version 1.0 | ||
11 | + * @project webmagic-canrd-service | ||
12 | + * @description 火狐浏览器驱动 | ||
13 | + * @date 2024/5/23 10:45:50 | ||
14 | + * 驱动下载地址:https://objects.githubusercontent.com/github-production-release-asset-2e65be | ||
15 | + */ | ||
16 | +@Component | ||
17 | +public class MyFirefoxDriver implements BrowserDriver{ | ||
18 | + private static final String WIN_DRIVER_PATH = "D:\\driver\\firefox\\geckodriver.exe"; | ||
19 | + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | ||
20 | + | ||
21 | + @Override | ||
22 | + public WebDriver getDriver() { | ||
23 | + // 创建 Firefox 配置文件 | ||
24 | + FirefoxProfile profile = new FirefoxProfile(); | ||
25 | + profile.setPreference("permissions.default.image", 2); // 禁用图片加载 | ||
26 | + profile.setPreference("permissions.default.stylesheet", 2); // 禁用 CSS | ||
27 | + profile.setPreference("media.peerconnection.enabled", false); // 禁用 WebRTC | ||
28 | + profile.setPreference("dom.webnotifications.enabled", false); // 禁用通知 | ||
29 | +// profile.setPreference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"); // 设置自定义 User-Agent | ||
30 | + | ||
31 | + // 创建 Firefox 选项 | ||
32 | + FirefoxOptions options = new FirefoxOptions(); | ||
33 | + options.setProfile(profile); | ||
34 | + | ||
35 | + | ||
36 | + | ||
37 | + String os_name = System.getProperty("os.name"); | ||
38 | + // 判断是否是windows系统 | ||
39 | + if (os_name.toLowerCase().startsWith("win")) { | ||
40 | + // windows | ||
41 | + // 指定 GeckoDriver 的路径 | ||
42 | + System.setProperty("webdriver.gecko.driver", WIN_DRIVER_PATH); // 替换为实际的 geckodriver 路径 | ||
43 | + } else { | ||
44 | + // linux | ||
45 | + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH); | ||
46 | + } | ||
47 | + | ||
48 | + // 初始化 WebDriver | ||
49 | + WebDriver driver = new org.openqa.selenium.firefox.FirefoxDriver(options); | ||
50 | + return driver; | ||
51 | + } | ||
52 | +} |
src/main/resources/application-local.yml
@@ -57,13 +57,13 @@ spring: | @@ -57,13 +57,13 @@ spring: | ||
57 | testWhileIdle: true | 57 | testWhileIdle: true |
58 | testOnBorrow: true | 58 | testOnBorrow: true |
59 | testOnReturn: true | 59 | testOnReturn: true |
60 | - password: canrd@2024 | 60 | + password: 123456 |
61 | time-between-eviction-runs-millis: 1000 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | 62 | + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true |
63 | username: root | 63 | username: root |
64 | redis: | 64 | redis: |
65 | database: 0 | 65 | database: 0 |
66 | - host: 39.108.227.113 | 66 | + host: localhost |
67 | lettuce: | 67 | lettuce: |
68 | pool: | 68 | pool: |
69 | max-active: 2000 | 69 | max-active: 2000 |