Commit 236651f679bba19ae4fe5e9ebdac5961389d89f3

Authored by zhongnanhuang
1 parent f7957dbb

新增了火狐和edge驱动

@@ -48,6 +48,7 @@ @@ -48,6 +48,7 @@
48 <easyexcel.version>2.2.3</easyexcel.version> 48 <easyexcel.version>2.2.3</easyexcel.version>
49 <webmagic.version>0.10.0</webmagic.version> 49 <webmagic.version>0.10.0</webmagic.version>
50 <selenium.version>3.4.0</selenium.version> 50 <selenium.version>3.4.0</selenium.version>
  51 +<!-- <browsermob.version>2.1.5</browsermob.version>-->
51 </properties> 52 </properties>
52 53
53 <dependencies> 54 <dependencies>
@@ -63,6 +64,12 @@ @@ -63,6 +64,12 @@
63 </exclusion> 64 </exclusion>
64 </exclusions> 65 </exclusions>
65 </dependency> 66 </dependency>
  67 +<!-- <dependency>-->
  68 +<!-- <groupId>net.lightbody.bmp</groupId>-->
  69 +<!-- <artifactId>browsermob-core</artifactId>-->
  70 +<!-- <version>${browsermob.version}</version>-->
  71 +<!-- </dependency>-->
  72 +
66 73
67 <!-- webmagic核心库 --> 74 <!-- webmagic核心库 -->
68 <dependency> 75 <dependency>
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 package com.canrd.webmagic.config; 1 package com.canrd.webmagic.config;
2 2
  3 +import com.canrd.webmagic.processor.driver.BrowserDriver;
  4 +import com.canrd.webmagic.processor.driver.MyChromeDriver;
  5 +import com.canrd.webmagic.processor.driver.MyFirefoxDriver;
  6 +import lombok.Data;
3 import org.openqa.selenium.WebDriver; 7 import org.openqa.selenium.WebDriver;
4 -import org.openqa.selenium.chrome.ChromeDriver;  
5 -import org.openqa.selenium.chrome.ChromeOptions;  
6 -import org.springframework.context.annotation.Bean; 8 +import org.springframework.beans.factory.annotation.Autowired;
  9 +import org.springframework.context.ApplicationContext;
  10 +import org.springframework.context.ApplicationContextAware;
7 import org.springframework.context.annotation.Configuration; 11 import org.springframework.context.annotation.Configuration;
8 12
9 -import java.util.Arrays; 13 +import java.util.*;
  14 +
10 15
11 /** 16 /**
12 * @author: xms 17 * @author: xms
@@ -15,62 +20,32 @@ import java.util.Arrays; @@ -15,62 +20,32 @@ import java.util.Arrays;
15 * @version: 1.0 20 * @version: 1.0
16 */ 21 */
17 @Configuration 22 @Configuration
18 -public class SeleniumConfig { 23 +@Data
  24 +public class SeleniumConfig implements ApplicationContextAware {
19 25
20 -// @Bean  
21 - public WebDriver webDriver() throws InterruptedException {  
22 - // 初始化ChromeOptions  
23 - ChromeOptions options = new ChromeOptions();  
24 - // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口  
25 -// options.addArguments("--proxy-server=http://proxy-server:port"); 26 + private static ApplicationContext context;
26 27
27 - // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查  
28 - options.addArguments("--disable-javascript"); 28 + @Override
  29 + public void setApplicationContext(ApplicationContext applicationContext) {
  30 + context = applicationContext;
  31 + }
29 32
30 - // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它  
31 - options.addArguments("--disable-extensions"); 33 + public static final String FIREFOX_KEY = "myFirefoxDriver";
  34 + public static final String CHROME_KEY = "myChromeDriver";
  35 + public static final String EDGE_KEY = "myEdgeDriver";
32 36
33 - // 禁用本地缓存,确保每次访问都从服务器获取  
34 - options.addArguments("--disable-application-cache"); 37 + public static final List<String> DRIVER_KEYS = new ArrayList<>(Arrays.asList(
  38 + CHROME_KEY,
  39 + FIREFOX_KEY,
  40 + EDGE_KEY
  41 + ));
35 42
36 - // 禁止策略化  
37 - options.addArguments("--disable-infobars");  
38 - // 解决DevToolsActivePort文件不存在的报错  
39 - options.addArguments("--no-sandbox");  
40 - // 指定浏览器分辨  
41 - options.addArguments("window-size=1920x3000");  
42 - // 谷歌文档提到需要加上这个属性来规避bug  
43 - options.addArguments("--disable-gpu");  
44 - // 隐身模式(无痕模式)  
45 - options.addArguments("--incognito");  
46 - // 最大化运行(全屏窗口),不设置,取元素会报错  
47 - options.addArguments("--start-maximized");  
48 - // 禁用浏览器正在被自动化程序控制的提示  
49 - options.addArguments("--disable-infobars");  
50 - // 隐藏滚动条, 应对一些特殊页面  
51 - options.addArguments("--hide-scrollbars");  
52 - // 不加载图片, 提升速度  
53 - options.addArguments("blink-settings=imagesEnabled=false");  
54 - // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败  
55 - options.addArguments("--headless");  
56 - //禁用 blink 特征  
57 - options.addArguments("disable-blink-features=AutomationControlled");  
58 - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));  
59 - options.setExperimentalOption("useAutomationExtension", false);  
60 - options.addArguments("--remote-allow-origins=*");  
61 43
62 - String os_name = System.getProperty("os.name");  
63 - // 判断是否是windows系统  
64 - if (os_name.toLowerCase().startsWith("win")) {  
65 - // windows  
66 - options.setBinary("D:\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell-win64\\chrome-headless-shell.exe");  
67 - System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");  
68 - } else {  
69 - // linux  
70 - options.setBinary("/home/canrd/webmagic/chrome/chrome-linux64/chrome");  
71 - System.setProperty("webdriver.chrome.driver", "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"); 44 + public WebDriver getWebDriver(String key) {
  45 + Object bean = context.getBean(key);
  46 + if (bean instanceof BrowserDriver) {
  47 + return ((BrowserDriver) bean).getDriver();
72 } 48 }
73 -  
74 - return new ChromeDriver(options); 49 + return null;
75 } 50 }
76 } 51 }
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader {
49 Page page = Page.fail(); 49 Page page = Page.fail();
50 WebDriver webDriver = null; 50 WebDriver webDriver = null;
51 try { 51 try {
52 - webDriver = config.webDriver(); 52 + //0默认谷歌浏览器
  53 + webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
53 WebDriver.Options manage = webDriver.manage(); 54 WebDriver.Options manage = webDriver.manage();
54 Site site = task.getSite(); 55 Site site = task.getSite();
55 site.setUserAgent(Agent.getRandom()); 56 site.setUserAgent(Agent.getRandom());
@@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader {
88 } finally { 89 } finally {
89 if (webDriver != null) { 90 if (webDriver != null) {
90 webDriver.close(); 91 webDriver.close();
91 - webDriver.quit(); 92 +// webDriver.quit();
92 webDriver = null; 93 webDriver = null;
93 } 94 }
94 } 95 }
src/main/java/com/canrd/webmagic/processor/driver/BrowserDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +
  5 +/**
  6 + * @author zhongnanhuang
  7 + * @version 1.0
  8 + * @project webmagic-canrd-service
  9 + * @description 驱动接口
  10 + * @date 2024/5/23 10:46:16
  11 + */
  12 +public interface BrowserDriver {
  13 +
  14 + WebDriver getDriver();
  15 +
  16 +}
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeOptions;
  5 +import org.springframework.stereotype.Component;
  6 +
  7 +import java.util.Arrays;
  8 +
  9 +/**
  10 + * @author zhongnanhuang
  11 + * @version 1.0
  12 + * @project webmagic-canrd-service
  13 + * @description 谷歌浏览器驱动
  14 + * @date 2024/5/23 10:49:30
  15 + * 驱动下载地址:https://storage.googleapis.com/chrome-for-testing-public
  16 + */
  17 +@Component
  18 +public class MyChromeDriver implements BrowserDriver{
  19 + private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  20 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  21 + private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe";
  22 + private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome";
  23 +
  24 + @Override
  25 + public WebDriver getDriver() {
  26 + // 初始化ChromeOptions
  27 + ChromeOptions options = new ChromeOptions();
  28 + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
  29 +// options.addArguments("--proxy-server=http://proxy-server:port");
  30 +
  31 + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
  32 + options.addArguments("--disable-javascript");
  33 +
  34 + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
  35 + options.addArguments("--disable-extensions");
  36 +
  37 + // 禁用本地缓存,确保每次访问都从服务器获取
  38 + options.addArguments("--disable-application-cache");
  39 +
  40 + // 禁止策略化
  41 + options.addArguments("--disable-infobars");
  42 + // 解决DevToolsActivePort文件不存在的报错
  43 + options.addArguments("--no-sandbox");
  44 + // 指定浏览器分辨
  45 + options.addArguments("window-size=1920x3000");
  46 + // 谷歌文档提到需要加上这个属性来规避bug
  47 + options.addArguments("--disable-gpu");
  48 + // 隐身模式(无痕模式)
  49 + options.addArguments("--incognito");
  50 + // 最大化运行(全屏窗口),不设置,取元素会报错
  51 + options.addArguments("--start-maximized");
  52 + // 禁用浏览器正在被自动化程序控制的提示
  53 + options.addArguments("--disable-infobars");
  54 + // 隐藏滚动条, 应对一些特殊页面
  55 + options.addArguments("--hide-scrollbars");
  56 + // 不加载图片, 提升速度
  57 + options.addArguments("blink-settings=imagesEnabled=false");
  58 + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
  59 + options.addArguments("--headless");
  60 + //禁用 blink 特征
  61 + options.addArguments("disable-blink-features=AutomationControlled");
  62 + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
  63 + options.setExperimentalOption("useAutomationExtension", false);
  64 + options.addArguments("--remote-allow-origins=*");
  65 +
  66 +
  67 + String os_name = System.getProperty("os.name");
  68 + // 判断是否是windows系统
  69 + if (os_name.toLowerCase().startsWith("win")) {
  70 + // windows
  71 + options.setBinary(WIN_BINARY_PATH);
  72 + System.setProperty("webdriver.chrome.driver", WIN_DRIVER_PATH);
  73 + } else {
  74 + // linux
  75 + options.setBinary(LINUX_BINARY_PATH);
  76 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  77 + }
  78 +
  79 + return new org.openqa.selenium.chrome.ChromeDriver(options);
  80 + }
  81 +}
src/main/java/com/canrd/webmagic/processor/driver/MyEdgeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeDriver;
  5 +import org.openqa.selenium.chrome.ChromeOptions;
  6 +import org.openqa.selenium.edge.EdgeDriver;
  7 +import org.openqa.selenium.edge.EdgeOptions;
  8 +import org.openqa.selenium.firefox.FirefoxOptions;
  9 +import org.openqa.selenium.firefox.FirefoxProfile;
  10 +import org.springframework.stereotype.Component;
  11 +
  12 +import java.time.Duration;
  13 +
  14 +/**
  15 + * @author zhongnanhuang
  16 + * @version 1.0
  17 + * @project webmagic-canrd-service
  18 + * @description Edge浏览器驱动
  19 + * @date 2024/5/23 10:45:50
  20 + * 驱动下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads
  21 + */
  22 +@Component
  23 +public class MyEdgeDriver implements BrowserDriver{
  24 + private static final String WIN_DRIVER_PATH = "D:\\driver\\edge\\msedgedriver.exe";
  25 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  26 +
  27 + @Override
  28 + public WebDriver getDriver() {
  29 + // 设置 Edge 驱动程序路径
  30 + System.setProperty("webdriver.edge.driver", WIN_DRIVER_PATH); // 替换为实际的驱动程序路径
  31 +
  32 + // 初始化 EdgeOptions 对象
  33 + EdgeOptions options = new EdgeOptions();
  34 +// options.setPageLoadTimeout(Duration.ofSeconds(30)); // 设置页面加载超时时间为30秒
  35 +// options.addArguments("--blink-settings=imagesEnabled=false"); // 禁用图片加载
  36 +// options.addArguments("--disable-notifications"); // 禁用浏览器通知
  37 +// options.addArguments("--disable-infobars"); // 禁用信息栏
  38 +// options.addArguments("--disable-extensions"); // 禁用扩展
  39 +// options.addArguments("--disable-dev-shm-usage"); // 禁用/dev/shm使用
  40 +// options.setHeadless(true);
  41 +
  42 +
  43 + // 设置其他选项,例如添加扩展程序等
  44 + // 初始化 WebDriver
  45 + EdgeDriver driver = new EdgeDriver(options);
  46 +
  47 + return driver;
  48 + }
  49 +}
src/main/java/com/canrd/webmagic/processor/driver/MyFirefoxDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.firefox.FirefoxOptions;
  5 +import org.openqa.selenium.firefox.FirefoxProfile;
  6 +import org.springframework.stereotype.Component;
  7 +
  8 +/**
  9 + * @author zhongnanhuang
  10 + * @version 1.0
  11 + * @project webmagic-canrd-service
  12 + * @description 火狐浏览器驱动
  13 + * @date 2024/5/23 10:45:50
  14 + * 驱动下载地址:https://objects.githubusercontent.com/github-production-release-asset-2e65be
  15 + */
  16 +@Component
  17 +public class MyFirefoxDriver implements BrowserDriver{
  18 + private static final String WIN_DRIVER_PATH = "D:\\driver\\firefox\\geckodriver.exe";
  19 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  20 +
  21 + @Override
  22 + public WebDriver getDriver() {
  23 + // 创建 Firefox 配置文件
  24 + FirefoxProfile profile = new FirefoxProfile();
  25 + profile.setPreference("permissions.default.image", 2); // 禁用图片加载
  26 + profile.setPreference("permissions.default.stylesheet", 2); // 禁用 CSS
  27 + profile.setPreference("media.peerconnection.enabled", false); // 禁用 WebRTC
  28 + profile.setPreference("dom.webnotifications.enabled", false); // 禁用通知
  29 +// profile.setPreference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"); // 设置自定义 User-Agent
  30 +
  31 + // 创建 Firefox 选项
  32 + FirefoxOptions options = new FirefoxOptions();
  33 + options.setProfile(profile);
  34 +
  35 +
  36 +
  37 + String os_name = System.getProperty("os.name");
  38 + // 判断是否是windows系统
  39 + if (os_name.toLowerCase().startsWith("win")) {
  40 + // windows
  41 + // 指定 GeckoDriver 的路径
  42 + System.setProperty("webdriver.gecko.driver", WIN_DRIVER_PATH); // 替换为实际的 geckodriver 路径
  43 + } else {
  44 + // linux
  45 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  46 + }
  47 +
  48 + // 初始化 WebDriver
  49 + WebDriver driver = new org.openqa.selenium.firefox.FirefoxDriver(options);
  50 + return driver;
  51 + }
  52 +}
src/main/resources/application-local.yml
@@ -57,13 +57,13 @@ spring: @@ -57,13 +57,13 @@ spring:
57 testWhileIdle: true 57 testWhileIdle: true
58 testOnBorrow: true 58 testOnBorrow: true
59 testOnReturn: true 59 testOnReturn: true
60 - password: canrd@2024 60 + password: 123456
61 time-between-eviction-runs-millis: 1000 61 time-between-eviction-runs-millis: 1000
62 - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true 62 + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 username: root 63 username: root
64 redis: 64 redis:
65 database: 0 65 database: 0
66 - host: 39.108.227.113 66 + host: localhost
67 lettuce: 67 lettuce:
68 pool: 68 pool:
69 max-active: 2000 69 max-active: 2000