Commit b17c8d02d4038eb64c998b624715cc596c69a83a
Merge branch 'master' of http://39.108.227.113:8001/xiemaosheng2/webmagic-canrd-service
# Conflicts: # pom.xml # src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
Showing
8 changed files
with
241 additions
and
52 deletions
pom.xml
... | ... | @@ -48,9 +48,11 @@ |
48 | 48 | <easyexcel.version>2.2.3</easyexcel.version> |
49 | 49 | <webmagic.version>0.10.0</webmagic.version> |
50 | 50 | <selenium.version>3.4.0</selenium.version> |
51 | +<!-- <browsermob.version>2.1.5</browsermob.version>--> | |
51 | 52 | </properties> |
52 | 53 | |
53 | 54 | <dependencies> |
55 | + | |
54 | 56 | <dependency> |
55 | 57 | <groupId>org.springframework.boot</groupId> |
56 | 58 | <artifactId>spring-boot-starter-web</artifactId> |
... | ... | @@ -62,6 +64,13 @@ |
62 | 64 | </exclusion> |
63 | 65 | </exclusions> |
64 | 66 | </dependency> |
67 | +<!-- <dependency>--> | |
68 | +<!-- <groupId>net.lightbody.bmp</groupId>--> | |
69 | +<!-- <artifactId>browsermob-core</artifactId>--> | |
70 | +<!-- <version>${browsermob.version}</version>--> | |
71 | +<!-- </dependency>--> | |
72 | + | |
73 | + | |
65 | 74 | <!-- webmagic核心库 --> |
66 | 75 | <dependency> |
67 | 76 | <groupId>us.codecraft</groupId> | ... | ... |
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 | 1 | package com.canrd.webmagic.config; |
2 | 2 | |
3 | -import com.canrd.webmagic.driver.ChromeBuildDriver; | |
4 | -import com.google.common.collect.ImmutableList; | |
5 | -import com.google.gson.JsonObject; | |
6 | -import com.sun.java.swing.plaf.windows.resources.windows; | |
3 | +import com.canrd.webmagic.processor.driver.BrowserDriver; | |
4 | +import com.canrd.webmagic.processor.driver.MyChromeDriver; | |
5 | +import com.canrd.webmagic.processor.driver.MyFirefoxDriver; | |
6 | +import lombok.Data; | |
7 | 7 | import org.openqa.selenium.WebDriver; |
8 | -import org.openqa.selenium.chrome.ChromeDriver; | |
9 | -import org.openqa.selenium.chrome.ChromeOptions; | |
10 | -import org.openqa.selenium.edge.EdgeDriver; | |
11 | -import org.openqa.selenium.edge.EdgeDriverService; | |
12 | -import org.openqa.selenium.edge.EdgeOptions; | |
13 | -import org.openqa.selenium.remote.DesiredCapabilities; | |
14 | -import org.springframework.context.annotation.Bean; | |
8 | +import org.springframework.beans.factory.annotation.Autowired; | |
9 | +import org.springframework.context.ApplicationContext; | |
10 | +import org.springframework.context.ApplicationContextAware; | |
15 | 11 | import org.springframework.context.annotation.Configuration; |
16 | 12 | |
17 | -import java.io.File; | |
18 | -import java.io.IOException; | |
19 | -import java.util.Arrays; | |
13 | +import java.util.*; | |
14 | + | |
20 | 15 | |
21 | 16 | /** |
22 | 17 | * @author: xms |
... | ... | @@ -25,45 +20,31 @@ import java.util.Arrays; |
25 | 20 | * @version: 1.0 |
26 | 21 | */ |
27 | 22 | @Configuration |
28 | -public class SeleniumConfig { | |
29 | - private static String currentDriver = ""; | |
30 | - | |
31 | - // @Bean | |
32 | - public WebDriver webDriver() throws InterruptedException, IOException { | |
23 | +@Data | |
24 | +public class SeleniumConfig implements ApplicationContextAware { | |
33 | 25 | |
34 | - // 初始化ChromeOptions | |
35 | - ChromeOptions chromeOptions = new ChromeBuildDriver().build("C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"); | |
26 | + private static ApplicationContext context; | |
36 | 27 | |
37 | - EdgeOptions edgeOptions = new EdgeOptions(); | |
38 | - | |
39 | - //配置Edge | |
40 | - File edgeFile = new File("C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe"); | |
28 | + @Override | |
29 | + public void setApplicationContext(ApplicationContext applicationContext) { | |
30 | + context = applicationContext; | |
31 | + } | |
41 | 32 | |
42 | - edgeOptions.setPageLoadStrategy("none"); | |
43 | -// JsonObject jsonObject = edgeOptions.toJson(); | |
33 | + public static final String FIREFOX_KEY = "myFirefoxDriver"; | |
34 | + public static final String CHROME_KEY = "myChromeDriver"; | |
35 | + public static final String EDGE_KEY = "myEdgeDriver"; | |
44 | 36 | |
37 | + public static final List<String> DRIVER_KEYS = new ArrayList<>(Arrays.asList( | |
38 | + CHROME_KEY, | |
39 | + FIREFOX_KEY, | |
40 | + EDGE_KEY | |
41 | + )); | |
45 | 42 | |
46 | - String os_name = System.getProperty("os.name"); | |
47 | - // 判断是否是windows系统 | |
48 | - if (os_name.toLowerCase().startsWith("win")) { | |
49 | 43 | |
50 | -// edgeOptions.setBinary("C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe"); | |
51 | - currentDriver = "edge"; | |
52 | - //windows | |
53 | - if (currentDriver.equals("") || currentDriver.equals("edge")) { | |
54 | - System.out.printf("chrome启动"); | |
55 | - System.setProperty("webdriver.chrome.driver", "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"); | |
56 | - currentDriver = "chrome"; | |
57 | - return new ChromeDriver(chromeOptions); | |
58 | - } else if (currentDriver.equals("chrome")) { | |
59 | - System.out.printf("edge启动"); | |
60 | - System.setProperty("webdriver.edge.driver", "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe"); | |
61 | - currentDriver = "edge"; | |
62 | - return new EdgeDriver(edgeOptions); | |
63 | - } | |
64 | - } else { | |
65 | - // linux | |
66 | - System.setProperty("webdriver.chrome.driver", "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"); | |
44 | + public WebDriver getWebDriver(String key) { // Looks up the Spring bean named `key` and, if it is a BrowserDriver, returns the WebDriver it supplies. | |
45 | + Object bean = context.getBean(key); // `context` is the static ApplicationContext stored by setApplicationContext | |
46 | + if (bean instanceof BrowserDriver) { | |
47 | + return ((BrowserDriver) bean).getDriver(); | |
67 | 48 | } |
68 | 49 | return null; // the bean is not a BrowserDriver, so the caller receives null |
69 | 50 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... | ... | @@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader { |
49 | 49 | Page page = Page.fail(); |
50 | 50 | WebDriver webDriver = null; |
51 | 51 | try { |
52 | - webDriver = config.webDriver(); | |
52 | + //0默认谷歌浏览器 | |
53 | + webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0)); | |
53 | 54 | WebDriver.Options manage = webDriver.manage(); |
54 | 55 | Site site = task.getSite(); |
55 | 56 | site.setUserAgent(Agent.getRandom()); |
... | ... | @@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader { |
88 | 89 | } finally { |
89 | 90 | if (webDriver != null) { |
90 | 91 | webDriver.close(); |
91 | - webDriver.quit(); | |
92 | +// webDriver.quit(); | |
92 | 93 | webDriver = null; |
93 | 94 | } |
94 | 95 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/driver/BrowserDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | |
2 | + | |
3 | +import org.openqa.selenium.WebDriver; | |
4 | + | |
5 | +/** | |
6 | + * @author zhongnanhuang | |
7 | + * @version 1.0 | |
8 | + * @project webmagic-canrd-service | |
9 | + * @description 驱动接口 | |
10 | + * @date 2024/5/23 10:46:16 | |
11 | + */ | |
12 | +public interface BrowserDriver { // Common contract for the per-browser driver beans; SeleniumConfig.getWebDriver checks for this type before asking for a driver. | |
13 | + | |
14 | + WebDriver getDriver(); // Supplies the WebDriver for this browser; the implementations added in this change start a new browser session on every call. | |
15 | + | |
16 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | |
2 | + | |
3 | +import org.openqa.selenium.WebDriver; | |
4 | +import org.openqa.selenium.chrome.ChromeOptions; | |
5 | +import org.springframework.stereotype.Component; | |
6 | + | |
7 | +import java.util.Arrays; | |
8 | + | |
9 | +/** | |
10 | + * @author zhongnanhuang | |
11 | + * @version 1.0 | |
12 | + * @project webmagic-canrd-service | |
13 | + * @description 谷歌浏览器驱动 | |
14 | + * @date 2024/5/23 10:49:30 | |
15 | + * 驱动下载地址:https://storage.googleapis.com/chrome-for-testing-public | |
16 | + */ | |
17 | +@Component | |
18 | +public class MyChromeDriver implements BrowserDriver { | |
19 | +    // Hard-coded locations of chromedriver and of the Chrome binary for each supported OS. | |
20 | +    private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; | |
21 | +    private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | |
22 | +    private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; | |
23 | +    private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; | |
24 | + | |
25 | +    /** | |
26 | +     * Starts a new headless Chrome session. | |
27 | +     * | |
28 | +     * <p>Each call builds a fresh {@link ChromeOptions}, selects the browser binary and the | |
29 | +     * chromedriver executable for the current OS, publishes the driver path through the | |
30 | +     * {@code webdriver.chrome.driver} system property and launches a new ChromeDriver. | |
31 | +     * | |
32 | +     * @return a newly started Chrome {@link WebDriver} | |
33 | +     */ | |
34 | +    @Override | |
35 | +    public WebDriver getDriver() { | |
36 | +        ChromeOptions options = new ChromeOptions(); | |
37 | +        // To go through a proxy, add an argument such as --proxy-server=http://proxy-server:port | |
38 | + | |
39 | +        // Intended to disable JavaScript, which sometimes helps to get past Cloudflare checks. | |
40 | +        // NOTE(review): confirm that the Chrome build in use still honours this switch. | |
41 | +        options.addArguments("--disable-javascript"); | |
42 | +        // Disable browser extensions. | |
43 | +        options.addArguments("--disable-extensions"); | |
44 | +        // Disable the application cache so every visit is fetched from the server. | |
45 | +        options.addArguments("--disable-application-cache"); | |
46 | +        // Suppress the "browser is controlled by automated software" infobar (previously added twice). | |
47 | +        options.addArguments("--disable-infobars"); | |
48 | +        // Avoids the "DevToolsActivePort file doesn't exist" startup error. | |
49 | +        options.addArguments("--no-sandbox"); | |
50 | +        // Browser window size, in Chrome's documented WIDTH,HEIGHT syntax (was "1920x3000"). | |
51 | +        options.addArguments("--window-size=1920,3000"); | |
52 | +        // Recommended by Google's documentation to work around a bug. | |
53 | +        options.addArguments("--disable-gpu"); | |
54 | +        // Incognito mode. | |
55 | +        options.addArguments("--incognito"); | |
56 | +        // Start maximized; according to the original author, element lookups fail without it. | |
57 | +        options.addArguments("--start-maximized"); | |
58 | +        // Hide scrollbars, for some special pages. | |
59 | +        options.addArguments("--hide-scrollbars"); | |
60 | +        // Do not load images, to speed things up. | |
61 | +        options.addArguments("blink-settings=imagesEnabled=false"); | |
62 | +        // Run without a visible window; start-up fails on Linux hosts that have no display otherwise. | |
63 | +        options.addArguments("--headless"); | |
64 | +        // Turn off the Blink AutomationControlled feature and the automation switch/extension. | |
65 | +        options.addArguments("disable-blink-features=AutomationControlled"); | |
66 | +        options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | |
67 | +        options.setExperimentalOption("useAutomationExtension", false); | |
68 | +        options.addArguments("--remote-allow-origins=*"); | |
69 | + | |
70 | +        // Pick the OS-specific binary and driver; anything that is not Windows is treated as Linux. | |
71 | +        String osName = System.getProperty("os.name"); | |
72 | +        boolean windows = osName.toLowerCase(java.util.Locale.ROOT).startsWith("win"); | |
73 | +        options.setBinary(windows ? WIN_BINARY_PATH : LINUX_BINARY_PATH); | |
74 | +        System.setProperty("webdriver.chrome.driver", windows ? WIN_DRIVER_PATH : LINUX_DRIVER_PATH); | |
75 | + | |
76 | +        return new org.openqa.selenium.chrome.ChromeDriver(options); | |
77 | +    } | |
78 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/driver/MyEdgeDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | |
2 | + | |
3 | +import org.openqa.selenium.WebDriver; | |
4 | +import org.openqa.selenium.chrome.ChromeDriver; | |
5 | +import org.openqa.selenium.chrome.ChromeOptions; | |
6 | +import org.openqa.selenium.edge.EdgeDriver; | |
7 | +import org.openqa.selenium.edge.EdgeOptions; | |
8 | +import org.openqa.selenium.firefox.FirefoxOptions; | |
9 | +import org.openqa.selenium.firefox.FirefoxProfile; | |
10 | +import org.springframework.stereotype.Component; | |
11 | + | |
12 | +import java.time.Duration; | |
13 | + | |
14 | +/** | |
15 | + * @author zhongnanhuang | |
16 | + * @version 1.0 | |
17 | + * @project webmagic-canrd-service | |
18 | + * @description 火狐浏览器驱动 | |
19 | + * @date 2024/5/23 10:45:50 | |
20 | + * 驱动下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads | |
21 | + */ | |
22 | +@Component | |
23 | +public class MyEdgeDriver implements BrowserDriver{ | |
24 | + private static final String WIN_DRIVER_PATH = "D:\\driver\\edge\\msedgedriver.exe"; | |
25 | + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | |
26 | + | |
27 | + @Override | |
28 | + public WebDriver getDriver() { | |
29 | + // 设置 Chrome 驱动程序路径 | |
30 | + System.setProperty("webdriver.edge.driver", WIN_DRIVER_PATH); // 替换为实际的驱动程序路径 | |
31 | + | |
32 | + // 初始化 ChromeOptions 对象 | |
33 | + EdgeOptions options = new EdgeOptions(); | |
34 | +// options.setPageLoadTimeout(Duration.ofSeconds(30)); // 设置页面加载超时时间为30秒 | |
35 | +// options.addArguments("--blink-settings=imagesEnabled=false"); // 禁用图片加载 | |
36 | +// options.addArguments("--disable-notifications"); // 禁用浏览器通知 | |
37 | +// options.addArguments("--disable-infobars"); // 禁用信息栏 | |
38 | +// options.addArguments("--disable-extensions"); // 禁用扩展 | |
39 | +// options.addArguments("--disable-dev-shm-usage"); // 禁用/dev/shm使用 | |
40 | +// options.setHeadless(true); | |
41 | + | |
42 | + | |
43 | + // 设置其他选项,例如添加扩展程序等 | |
44 | + // 初始化 WebDriver | |
45 | + EdgeDriver driver = new EdgeDriver(options); | |
46 | + | |
47 | + return driver; | |
48 | + } | |
49 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/driver/MyFirefoxDriver.java
0 → 100644
1 | +package com.canrd.webmagic.processor.driver; | |
2 | + | |
3 | +import org.openqa.selenium.WebDriver; | |
4 | +import org.openqa.selenium.firefox.FirefoxOptions; | |
5 | +import org.openqa.selenium.firefox.FirefoxProfile; | |
6 | +import org.springframework.stereotype.Component; | |
7 | + | |
8 | +/** | |
9 | + * @author zhongnanhuang | |
10 | + * @version 1.0 | |
11 | + * @project webmagic-canrd-service | |
12 | + * @description 火狐浏览器驱动 | |
13 | + * @date 2024/5/23 10:45:50 | |
14 | + * 驱动下载地址:https://objects.githubusercontent.com/github-production-release-asset-2e65be | |
15 | + */ | |
16 | +@Component | |
17 | +public class MyFirefoxDriver implements BrowserDriver{ | |
18 | + private static final String WIN_DRIVER_PATH = "D:\\driver\\firefox\\geckodriver.exe"; | |
19 | + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; | |
20 | + | |
21 | + @Override | |
22 | + public WebDriver getDriver() { | |
23 | + // 创建 Firefox 配置文件 | |
24 | + FirefoxProfile profile = new FirefoxProfile(); | |
25 | + profile.setPreference("permissions.default.image", 2); // 禁用图片加载 | |
26 | + profile.setPreference("permissions.default.stylesheet", 2); // 禁用 CSS | |
27 | + profile.setPreference("media.peerconnection.enabled", false); // 禁用 WebRTC | |
28 | + profile.setPreference("dom.webnotifications.enabled", false); // 禁用通知 | |
29 | +// profile.setPreference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"); // 设置自定义 User-Agent | |
30 | + | |
31 | + // 创建 Firefox 选项 | |
32 | + FirefoxOptions options = new FirefoxOptions(); | |
33 | + options.setProfile(profile); | |
34 | + | |
35 | + | |
36 | + | |
37 | + String os_name = System.getProperty("os.name"); | |
38 | + // 判断是否是windows系统 | |
39 | + if (os_name.toLowerCase().startsWith("win")) { | |
40 | + // windows | |
41 | + // 指定 GeckoDriver 的路径 | |
42 | + System.setProperty("webdriver.gecko.driver", WIN_DRIVER_PATH); // 替换为实际的 geckodriver 路径 | |
43 | + } else { | |
44 | + // linux | |
45 | + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH); | |
46 | + } | |
47 | + | |
48 | + // 初始化 WebDriver | |
49 | + WebDriver driver = new org.openqa.selenium.firefox.FirefoxDriver(options); | |
50 | + return driver; | |
51 | + } | |
52 | +} | ... | ... |
src/main/resources/application-local.yml
... | ... | @@ -57,13 +57,13 @@ spring: |
57 | 57 | testWhileIdle: true |
58 | 58 | testOnBorrow: true |
59 | 59 | testOnReturn: true |
60 | - password: canrd@2024 | |
60 | + password: 123456 | |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
62 | + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
63 | 63 | username: root |
64 | 64 | redis: |
65 | 65 | database: 0 |
66 | - host: 39.108.227.113 | |
66 | + host: localhost | |
67 | 67 | lettuce: |
68 | 68 | pool: |
69 | 69 | max-active: 2000 | ... | ... |