Commit 236651f679bba19ae4fe5e9ebdac5961389d89f3

Authored by zhongnanhuang
1 parent f7957dbb

新增了火狐和edge驱动

... ... @@ -48,6 +48,7 @@
48 48 <easyexcel.version>2.2.3</easyexcel.version>
49 49 <webmagic.version>0.10.0</webmagic.version>
50 50 <selenium.version>3.4.0</selenium.version>
  51 +<!-- <browsermob.version>2.1.5</browsermob.version>-->
51 52 </properties>
52 53  
53 54 <dependencies>
... ... @@ -63,6 +64,12 @@
63 64 </exclusion>
64 65 </exclusions>
65 66 </dependency>
  67 +<!-- <dependency>-->
  68 +<!-- <groupId>net.lightbody.bmp</groupId>-->
  69 +<!-- <artifactId>browsermob-core</artifactId>-->
  70 +<!-- <version>${browsermob.version}</version>-->
  71 +<!-- </dependency>-->
  72 +
66 73  
67 74 <!-- webmagic核心库 -->
68 75 <dependency>
... ...
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 1 package com.canrd.webmagic.config;
2 2  
  3 +import com.canrd.webmagic.processor.driver.BrowserDriver;
  4 +import com.canrd.webmagic.processor.driver.MyChromeDriver;
  5 +import com.canrd.webmagic.processor.driver.MyFirefoxDriver;
  6 +import lombok.Data;
3 7 import org.openqa.selenium.WebDriver;
4   -import org.openqa.selenium.chrome.ChromeDriver;
5   -import org.openqa.selenium.chrome.ChromeOptions;
6   -import org.springframework.context.annotation.Bean;
  8 +import org.springframework.beans.factory.annotation.Autowired;
  9 +import org.springframework.context.ApplicationContext;
  10 +import org.springframework.context.ApplicationContextAware;
7 11 import org.springframework.context.annotation.Configuration;
8 12  
9   -import java.util.Arrays;
  13 +import java.util.*;
  14 +
10 15  
11 16 /**
12 17 * @author: xms
... ... @@ -15,62 +20,32 @@ import java.util.Arrays;
15 20 * @version: 1.0
16 21 */
17 22 @Configuration
18   -public class SeleniumConfig {
  23 +@Data
  24 +public class SeleniumConfig implements ApplicationContextAware {
19 25  
20   -// @Bean
21   - public WebDriver webDriver() throws InterruptedException {
22   - // 初始化ChromeOptions
23   - ChromeOptions options = new ChromeOptions();
24   - // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
25   -// options.addArguments("--proxy-server=http://proxy-server:port");
  26 + private static ApplicationContext context;
26 27  
27   - // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
28   - options.addArguments("--disable-javascript");
  28 + @Override
  29 + public void setApplicationContext(ApplicationContext applicationContext) {
  30 + context = applicationContext;
  31 + }
29 32  
30   - // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
31   - options.addArguments("--disable-extensions");
  33 + public static final String FIREFOX_KEY = "myFirefoxDriver";
  34 + public static final String CHROME_KEY = "myChromeDriver";
  35 + public static final String EDGE_KEY = "myEdgeDriver";
32 36  
33   - // 禁用本地缓存,确保每次访问都从服务器获取
34   - options.addArguments("--disable-application-cache");
  37 + public static final List<String> DRIVER_KEYS = new ArrayList<>(Arrays.asList(
  38 + CHROME_KEY,
  39 + FIREFOX_KEY,
  40 + EDGE_KEY
  41 + ));
35 42  
36   - // 禁止策略化
37   - options.addArguments("--disable-infobars");
38   - // 解决DevToolsActivePort文件不存在的报错
39   - options.addArguments("--no-sandbox");
40   - // 指定浏览器分辨
41   - options.addArguments("window-size=1920x3000");
42   - // 谷歌文档提到需要加上这个属性来规避bug
43   - options.addArguments("--disable-gpu");
44   - // 隐身模式(无痕模式)
45   - options.addArguments("--incognito");
46   - // 最大化运行(全屏窗口),不设置,取元素会报错
47   - options.addArguments("--start-maximized");
48   - // 禁用浏览器正在被自动化程序控制的提示
49   - options.addArguments("--disable-infobars");
50   - // 隐藏滚动条, 应对一些特殊页面
51   - options.addArguments("--hide-scrollbars");
52   - // 不加载图片, 提升速度
53   - options.addArguments("blink-settings=imagesEnabled=false");
54   - // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
55   - options.addArguments("--headless");
56   - //禁用 blink 特征
57   - options.addArguments("disable-blink-features=AutomationControlled");
58   - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
59   - options.setExperimentalOption("useAutomationExtension", false);
60   - options.addArguments("--remote-allow-origins=*");
61 43  
62   - String os_name = System.getProperty("os.name");
63   - // 判断是否是windows系统
64   - if (os_name.toLowerCase().startsWith("win")) {
65   - // windows
66   - options.setBinary("D:\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell-win64\\chrome-headless-shell.exe");
67   - System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
68   - } else {
69   - // linux
70   - options.setBinary("/home/canrd/webmagic/chrome/chrome-linux64/chrome");
71   - System.setProperty("webdriver.chrome.driver", "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver");
  44 + public WebDriver getWebDriver(String key) {
  45 + Object bean = context.getBean(key);
  46 + if (bean instanceof BrowserDriver) {
  47 + return ((BrowserDriver) bean).getDriver();
72 48 }
73   -
74   - return new ChromeDriver(options);
  49 + return null;
75 50 }
76 51 }
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... ... @@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader {
49 49 Page page = Page.fail();
50 50 WebDriver webDriver = null;
51 51 try {
52   - webDriver = config.webDriver();
  52 + //0默认谷歌浏览器
  53 + webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
53 54 WebDriver.Options manage = webDriver.manage();
54 55 Site site = task.getSite();
55 56 site.setUserAgent(Agent.getRandom());
... ... @@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader {
88 89 } finally {
89 90 if (webDriver != null) {
90 91 webDriver.close();
91   - webDriver.quit();
  92 +// webDriver.quit();
92 93 webDriver = null;
93 94 }
94 95 }
... ...
src/main/java/com/canrd/webmagic/processor/driver/BrowserDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +
  5 +/**
  6 + * @author zhongnanhuang
  7 + * @version 1.0
  8 + * @project webmagic-canrd-service
  9 + * @description 驱动接口
  10 + * @date 2024/5/23 10:46:16
  11 + */
  12 +public interface BrowserDriver {
  13 +
  14 + WebDriver getDriver();
  15 +
  16 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeOptions;
  5 +import org.springframework.stereotype.Component;
  6 +
  7 +import java.util.Arrays;
  8 +
  9 +/**
  10 + * @author zhongnanhuang
  11 + * @version 1.0
  12 + * @project webmagic-canrd-service
  13 + * @description 谷歌浏览器驱动
  14 + * @date 2024/5/23 10:49:30
  15 + * 驱动下载地址:https://storage.googleapis.com/chrome-for-testing-public
  16 + */
  17 +@Component
  18 +public class MyChromeDriver implements BrowserDriver{
  19 + private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  20 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  21 + private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe";
  22 + private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome";
  23 +
  24 + @Override
  25 + public WebDriver getDriver() {
  26 + // 初始化ChromeOptions
  27 + ChromeOptions options = new ChromeOptions();
  28 + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
  29 +// options.addArguments("--proxy-server=http://proxy-server:port");
  30 +
  31 + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
  32 + options.addArguments("--disable-javascript");
  33 +
  34 + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
  35 + options.addArguments("--disable-extensions");
  36 +
  37 + // 禁用本地缓存,确保每次访问都从服务器获取
  38 + options.addArguments("--disable-application-cache");
  39 +
  40 + // 禁止策略化
  41 + options.addArguments("--disable-infobars");
  42 + // 解决DevToolsActivePort文件不存在的报错
  43 + options.addArguments("--no-sandbox");
  44 + // 指定浏览器分辨
  45 + options.addArguments("window-size=1920x3000");
  46 + // 谷歌文档提到需要加上这个属性来规避bug
  47 + options.addArguments("--disable-gpu");
  48 + // 隐身模式(无痕模式)
  49 + options.addArguments("--incognito");
  50 + // 最大化运行(全屏窗口),不设置,取元素会报错
  51 + options.addArguments("--start-maximized");
  52 + // 禁用浏览器正在被自动化程序控制的提示
  53 + options.addArguments("--disable-infobars");
  54 + // 隐藏滚动条, 应对一些特殊页面
  55 + options.addArguments("--hide-scrollbars");
  56 + // 不加载图片, 提升速度
  57 + options.addArguments("blink-settings=imagesEnabled=false");
  58 + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
  59 + options.addArguments("--headless");
  60 + //禁用 blink 特征
  61 + options.addArguments("disable-blink-features=AutomationControlled");
  62 + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
  63 + options.setExperimentalOption("useAutomationExtension", false);
  64 + options.addArguments("--remote-allow-origins=*");
  65 +
  66 +
  67 + String os_name = System.getProperty("os.name");
  68 + // 判断是否是windows系统
  69 + if (os_name.toLowerCase().startsWith("win")) {
  70 + // windows
  71 + options.setBinary(WIN_BINARY_PATH);
  72 + System.setProperty("webdriver.chrome.driver", WIN_DRIVER_PATH);
  73 + } else {
  74 + // linux
  75 + options.setBinary(LINUX_BINARY_PATH);
  76 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  77 + }
  78 +
  79 + return new org.openqa.selenium.chrome.ChromeDriver(options);
  80 + }
  81 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyEdgeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeDriver;
  5 +import org.openqa.selenium.chrome.ChromeOptions;
  6 +import org.openqa.selenium.edge.EdgeDriver;
  7 +import org.openqa.selenium.edge.EdgeOptions;
  8 +import org.openqa.selenium.firefox.FirefoxOptions;
  9 +import org.openqa.selenium.firefox.FirefoxProfile;
  10 +import org.springframework.stereotype.Component;
  11 +
  12 +import java.time.Duration;
  13 +
  14 +/**
  15 + * @author zhongnanhuang
  16 + * @version 1.0
  17 + * @project webmagic-canrd-service
  18 + * @description 火狐浏览器驱动
  19 + * @date 2024/5/23 10:45:50
  20 + * 驱动下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads
  21 + */
  22 +@Component
  23 +public class MyEdgeDriver implements BrowserDriver{
  24 + private static final String WIN_DRIVER_PATH = "D:\\driver\\edge\\msedgedriver.exe";
  25 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  26 +
  27 + @Override
  28 + public WebDriver getDriver() {
  29 + // 设置 Chrome 驱动程序路径
  30 + System.setProperty("webdriver.edge.driver", WIN_DRIVER_PATH); // 替换为实际的驱动程序路径
  31 +
  32 + // 初始化 ChromeOptions 对象
  33 + EdgeOptions options = new EdgeOptions();
  34 +// options.setPageLoadTimeout(Duration.ofSeconds(30)); // 设置页面加载超时时间为30秒
  35 +// options.addArguments("--blink-settings=imagesEnabled=false"); // 禁用图片加载
  36 +// options.addArguments("--disable-notifications"); // 禁用浏览器通知
  37 +// options.addArguments("--disable-infobars"); // 禁用信息栏
  38 +// options.addArguments("--disable-extensions"); // 禁用扩展
  39 +// options.addArguments("--disable-dev-shm-usage"); // 禁用/dev/shm使用
  40 +// options.setHeadless(true);
  41 +
  42 +
  43 + // 设置其他选项,例如添加扩展程序等
  44 + // 初始化 WebDriver
  45 + EdgeDriver driver = new EdgeDriver(options);
  46 +
  47 + return driver;
  48 + }
  49 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyFirefoxDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.firefox.FirefoxOptions;
  5 +import org.openqa.selenium.firefox.FirefoxProfile;
  6 +import org.springframework.stereotype.Component;
  7 +
  8 +/**
  9 + * @author zhongnanhuang
  10 + * @version 1.0
  11 + * @project webmagic-canrd-service
  12 + * @description 火狐浏览器驱动
  13 + * @date 2024/5/23 10:45:50
  14 + * 驱动下载地址:https://objects.githubusercontent.com/github-production-release-asset-2e65be
  15 + */
  16 +@Component
  17 +public class MyFirefoxDriver implements BrowserDriver{
  18 + private static final String WIN_DRIVER_PATH = "D:\\driver\\firefox\\geckodriver.exe";
  19 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  20 +
  21 + @Override
  22 + public WebDriver getDriver() {
  23 + // 创建 Firefox 配置文件
  24 + FirefoxProfile profile = new FirefoxProfile();
  25 + profile.setPreference("permissions.default.image", 2); // 禁用图片加载
  26 + profile.setPreference("permissions.default.stylesheet", 2); // 禁用 CSS
  27 + profile.setPreference("media.peerconnection.enabled", false); // 禁用 WebRTC
  28 + profile.setPreference("dom.webnotifications.enabled", false); // 禁用通知
  29 +// profile.setPreference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"); // 设置自定义 User-Agent
  30 +
  31 + // 创建 Firefox 选项
  32 + FirefoxOptions options = new FirefoxOptions();
  33 + options.setProfile(profile);
  34 +
  35 +
  36 +
  37 + String os_name = System.getProperty("os.name");
  38 + // 判断是否是windows系统
  39 + if (os_name.toLowerCase().startsWith("win")) {
  40 + // windows
  41 + // 指定 GeckoDriver 的路径
  42 + System.setProperty("webdriver.gecko.driver", WIN_DRIVER_PATH); // 替换为实际的 geckodriver 路径
  43 + } else {
  44 + // linux
  45 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  46 + }
  47 +
  48 + // 初始化 WebDriver
  49 + WebDriver driver = new org.openqa.selenium.firefox.FirefoxDriver(options);
  50 + return driver;
  51 + }
  52 +}
... ...
src/main/resources/application-local.yml
... ... @@ -57,13 +57,13 @@ spring:
57 57 testWhileIdle: true
58 58 testOnBorrow: true
59 59 testOnReturn: true
60   - password: canrd@2024
  60 + password: 123456
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
66   - host: 39.108.227.113
  66 + host: localhost
67 67 lettuce:
68 68 pool:
69 69 max-active: 2000
... ...