Commit b17c8d02d4038eb64c998b624715cc596c69a83a

Authored by PurelzMgnead
2 parents b9ded713 236651f6

Merge branch 'master' of http://39.108.227.113:8001/xiemaosheng2/webmagic-canrd-service

# Conflicts:
#	pom.xml
#	src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
... ... @@ -48,9 +48,11 @@
48 48 <easyexcel.version>2.2.3</easyexcel.version>
49 49 <webmagic.version>0.10.0</webmagic.version>
50 50 <selenium.version>3.4.0</selenium.version>
  51 +<!-- <browsermob.version>2.1.5</browsermob.version>-->
51 52 </properties>
52 53  
53 54 <dependencies>
  55 +
54 56 <dependency>
55 57 <groupId>org.springframework.boot</groupId>
56 58 <artifactId>spring-boot-starter-web</artifactId>
... ... @@ -62,6 +64,13 @@
62 64 </exclusion>
63 65 </exclusions>
64 66 </dependency>
  67 +<!-- <dependency>-->
  68 +<!-- <groupId>net.lightbody.bmp</groupId>-->
  69 +<!-- <artifactId>browsermob-core</artifactId>-->
  70 +<!-- <version>${browsermob.version}</version>-->
  71 +<!-- </dependency>-->
  72 +
  73 +
65 74 <!-- webmagic核心库 -->
66 75 <dependency>
67 76 <groupId>us.codecraft</groupId>
... ...
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 1 package com.canrd.webmagic.config;
2 2  
3   -import com.canrd.webmagic.driver.ChromeBuildDriver;
4   -import com.google.common.collect.ImmutableList;
5   -import com.google.gson.JsonObject;
6   -import com.sun.java.swing.plaf.windows.resources.windows;
  3 +import com.canrd.webmagic.processor.driver.BrowserDriver;
  4 +import com.canrd.webmagic.processor.driver.MyChromeDriver;
  5 +import com.canrd.webmagic.processor.driver.MyFirefoxDriver;
  6 +import lombok.Data;
7 7 import org.openqa.selenium.WebDriver;
8   -import org.openqa.selenium.chrome.ChromeDriver;
9   -import org.openqa.selenium.chrome.ChromeOptions;
10   -import org.openqa.selenium.edge.EdgeDriver;
11   -import org.openqa.selenium.edge.EdgeDriverService;
12   -import org.openqa.selenium.edge.EdgeOptions;
13   -import org.openqa.selenium.remote.DesiredCapabilities;
14   -import org.springframework.context.annotation.Bean;
  8 +import org.springframework.beans.factory.annotation.Autowired;
  9 +import org.springframework.context.ApplicationContext;
  10 +import org.springframework.context.ApplicationContextAware;
15 11 import org.springframework.context.annotation.Configuration;
16 12  
17   -import java.io.File;
18   -import java.io.IOException;
19   -import java.util.Arrays;
  13 +import java.util.*;
  14 +
20 15  
21 16 /**
22 17 * @author: xms
... ... @@ -25,45 +20,31 @@ import java.util.Arrays;
25 20 * @version: 1.0
26 21 */
27 22 @Configuration
28   -public class SeleniumConfig {
29   - private static String currentDriver = "";
30   -
31   - // @Bean
32   - public WebDriver webDriver() throws InterruptedException, IOException {
  23 +@Data
  24 +public class SeleniumConfig implements ApplicationContextAware {
33 25  
34   - // 初始化ChromeOptions
35   - ChromeOptions chromeOptions = new ChromeBuildDriver().build("C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe");
  26 + private static ApplicationContext context;
36 27  
37   - EdgeOptions edgeOptions = new EdgeOptions();
38   -
39   - //配置Edge
40   - File edgeFile = new File("C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe");
  28 + @Override
  29 + public void setApplicationContext(ApplicationContext applicationContext) {
  30 + context = applicationContext;
  31 + }
41 32  
42   - edgeOptions.setPageLoadStrategy("none");
43   -// JsonObject jsonObject = edgeOptions.toJson();
  33 + public static final String FIREFOX_KEY = "myFirefoxDriver";
  34 + public static final String CHROME_KEY = "myChromeDriver";
  35 + public static final String EDGE_KEY = "myEdgeDriver";
44 36  
  37 + public static final List<String> DRIVER_KEYS = new ArrayList<>(Arrays.asList(
  38 + CHROME_KEY,
  39 + FIREFOX_KEY,
  40 + EDGE_KEY
  41 + ));
45 42  
46   - String os_name = System.getProperty("os.name");
47   - // 判断是否是windows系统
48   - if (os_name.toLowerCase().startsWith("win")) {
49 43  
50   -// edgeOptions.setBinary("C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe");
51   - currentDriver = "edge";
52   - //windows
53   - if (currentDriver.equals("") || currentDriver.equals("edge")) {
54   - System.out.printf("chrome启动");
55   - System.setProperty("webdriver.chrome.driver", "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe");
56   - currentDriver = "chrome";
57   - return new ChromeDriver(chromeOptions);
58   - } else if (currentDriver.equals("chrome")) {
59   - System.out.printf("edge启动");
60   - System.setProperty("webdriver.edge.driver", "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe");
61   - currentDriver = "edge";
62   - return new EdgeDriver(edgeOptions);
63   - }
64   - } else {
65   - // linux
66   - System.setProperty("webdriver.chrome.driver", "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe");
  44 + public WebDriver getWebDriver(String key) {
  45 + Object bean = context.getBean(key);
  46 + if (bean instanceof BrowserDriver) {
  47 + return ((BrowserDriver) bean).getDriver();
67 48 }
68 49 return null;
69 50 }
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... ... @@ -49,7 +49,8 @@ public class SeleniumDownloader extends AbstractDownloader {
49 49 Page page = Page.fail();
50 50 WebDriver webDriver = null;
51 51 try {
52   - webDriver = config.webDriver();
  52 + //0默认谷歌浏览器
  53 + webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
53 54 WebDriver.Options manage = webDriver.manage();
54 55 Site site = task.getSite();
55 56 site.setUserAgent(Agent.getRandom());
... ... @@ -88,7 +89,7 @@ public class SeleniumDownloader extends AbstractDownloader {
88 89 } finally {
89 90 if (webDriver != null) {
90 91 webDriver.close();
91   - webDriver.quit();
  92 +// webDriver.quit();
92 93 webDriver = null;
93 94 }
94 95 }
... ...
src/main/java/com/canrd/webmagic/processor/driver/BrowserDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +
  5 +/**
  6 + * @author zhongnanhuang
  7 + * @version 1.0
  8 + * @project webmagic-canrd-service
  9 + * @description 驱动接口
  10 + * @date 2024/5/23 10:46:16
  11 + */
  12 +public interface BrowserDriver {
  13 +
  14 + WebDriver getDriver();
  15 +
  16 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeOptions;
  5 +import org.springframework.stereotype.Component;
  6 +
  7 +import java.util.Arrays;
  8 +
  9 +/**
  10 + * @author zhongnanhuang
  11 + * @version 1.0
  12 + * @project webmagic-canrd-service
  13 + * @description 谷歌浏览器驱动
  14 + * @date 2024/5/23 10:49:30
  15 + * 驱动下载地址:https://storage.googleapis.com/chrome-for-testing-public
  16 + */
  17 +@Component
  18 +public class MyChromeDriver implements BrowserDriver{
  19 + private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  20 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  21 + private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe";
  22 + private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome";
  23 +
  24 + @Override
  25 + public WebDriver getDriver() {
  26 + // 初始化ChromeOptions
  27 + ChromeOptions options = new ChromeOptions();
  28 + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
  29 +// options.addArguments("--proxy-server=http://proxy-server:port");
  30 +
  31 + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
  32 + options.addArguments("--disable-javascript");
  33 +
  34 + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
  35 + options.addArguments("--disable-extensions");
  36 +
  37 + // 禁用本地缓存,确保每次访问都从服务器获取
  38 + options.addArguments("--disable-application-cache");
  39 +
  40 + // 禁止策略化
  41 + options.addArguments("--disable-infobars");
  42 + // 解决DevToolsActivePort文件不存在的报错
  43 + options.addArguments("--no-sandbox");
  44 + // 指定浏览器分辨
  45 + options.addArguments("window-size=1920x3000");
  46 + // 谷歌文档提到需要加上这个属性来规避bug
  47 + options.addArguments("--disable-gpu");
  48 + // 隐身模式(无痕模式)
  49 + options.addArguments("--incognito");
  50 + // 最大化运行(全屏窗口),不设置,取元素会报错
  51 + options.addArguments("--start-maximized");
  52 + // 禁用浏览器正在被自动化程序控制的提示
  53 + options.addArguments("--disable-infobars");
  54 + // 隐藏滚动条, 应对一些特殊页面
  55 + options.addArguments("--hide-scrollbars");
  56 + // 不加载图片, 提升速度
  57 + options.addArguments("blink-settings=imagesEnabled=false");
  58 + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
  59 + options.addArguments("--headless");
  60 + //禁用 blink 特征
  61 + options.addArguments("disable-blink-features=AutomationControlled");
  62 + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
  63 + options.setExperimentalOption("useAutomationExtension", false);
  64 + options.addArguments("--remote-allow-origins=*");
  65 +
  66 +
  67 + String os_name = System.getProperty("os.name");
  68 + // 判断是否是windows系统
  69 + if (os_name.toLowerCase().startsWith("win")) {
  70 + // windows
  71 + options.setBinary(WIN_BINARY_PATH);
  72 + System.setProperty("webdriver.chrome.driver", WIN_DRIVER_PATH);
  73 + } else {
  74 + // linux
  75 + options.setBinary(LINUX_BINARY_PATH);
  76 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  77 + }
  78 +
  79 + return new org.openqa.selenium.chrome.ChromeDriver(options);
  80 + }
  81 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyEdgeDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.chrome.ChromeDriver;
  5 +import org.openqa.selenium.chrome.ChromeOptions;
  6 +import org.openqa.selenium.edge.EdgeDriver;
  7 +import org.openqa.selenium.edge.EdgeOptions;
  8 +import org.openqa.selenium.firefox.FirefoxOptions;
  9 +import org.openqa.selenium.firefox.FirefoxProfile;
  10 +import org.springframework.stereotype.Component;
  11 +
  12 +import java.time.Duration;
  13 +
  14 +/**
  15 + * @author zhongnanhuang
  16 + * @version 1.0
  17 + * @project webmagic-canrd-service
  18 + * @description 火狐浏览器驱动
  19 + * @date 2024/5/23 10:45:50
  20 + * 驱动下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads
  21 + */
  22 +@Component
  23 +public class MyEdgeDriver implements BrowserDriver{
  24 + private static final String WIN_DRIVER_PATH = "D:\\driver\\edge\\msedgedriver.exe";
  25 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  26 +
  27 + @Override
  28 + public WebDriver getDriver() {
  29 + // 设置 Chrome 驱动程序路径
  30 + System.setProperty("webdriver.edge.driver", WIN_DRIVER_PATH); // 替换为实际的驱动程序路径
  31 +
  32 + // 初始化 ChromeOptions 对象
  33 + EdgeOptions options = new EdgeOptions();
  34 +// options.setPageLoadTimeout(Duration.ofSeconds(30)); // 设置页面加载超时时间为30秒
  35 +// options.addArguments("--blink-settings=imagesEnabled=false"); // 禁用图片加载
  36 +// options.addArguments("--disable-notifications"); // 禁用浏览器通知
  37 +// options.addArguments("--disable-infobars"); // 禁用信息栏
  38 +// options.addArguments("--disable-extensions"); // 禁用扩展
  39 +// options.addArguments("--disable-dev-shm-usage"); // 禁用/dev/shm使用
  40 +// options.setHeadless(true);
  41 +
  42 +
  43 + // 设置其他选项,例如添加扩展程序等
  44 + // 初始化 WebDriver
  45 + EdgeDriver driver = new EdgeDriver(options);
  46 +
  47 + return driver;
  48 + }
  49 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyFirefoxDriver.java 0 → 100644
  1 +package com.canrd.webmagic.processor.driver;
  2 +
  3 +import org.openqa.selenium.WebDriver;
  4 +import org.openqa.selenium.firefox.FirefoxOptions;
  5 +import org.openqa.selenium.firefox.FirefoxProfile;
  6 +import org.springframework.stereotype.Component;
  7 +
  8 +/**
  9 + * @author zhongnanhuang
  10 + * @version 1.0
  11 + * @project webmagic-canrd-service
  12 + * @description 火狐浏览器驱动
  13 + * @date 2024/5/23 10:45:50
  14 + * 驱动下载地址:https://objects.githubusercontent.com/github-production-release-asset-2e65be
  15 + */
  16 +@Component
  17 +public class MyFirefoxDriver implements BrowserDriver{
  18 + private static final String WIN_DRIVER_PATH = "D:\\driver\\firefox\\geckodriver.exe";
  19 + private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
  20 +
  21 + @Override
  22 + public WebDriver getDriver() {
  23 + // 创建 Firefox 配置文件
  24 + FirefoxProfile profile = new FirefoxProfile();
  25 + profile.setPreference("permissions.default.image", 2); // 禁用图片加载
  26 + profile.setPreference("permissions.default.stylesheet", 2); // 禁用 CSS
  27 + profile.setPreference("media.peerconnection.enabled", false); // 禁用 WebRTC
  28 + profile.setPreference("dom.webnotifications.enabled", false); // 禁用通知
  29 +// profile.setPreference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"); // 设置自定义 User-Agent
  30 +
  31 + // 创建 Firefox 选项
  32 + FirefoxOptions options = new FirefoxOptions();
  33 + options.setProfile(profile);
  34 +
  35 +
  36 +
  37 + String os_name = System.getProperty("os.name");
  38 + // 判断是否是windows系统
  39 + if (os_name.toLowerCase().startsWith("win")) {
  40 + // windows
  41 + // 指定 GeckoDriver 的路径
  42 + System.setProperty("webdriver.gecko.driver", WIN_DRIVER_PATH); // 替换为实际的 geckodriver 路径
  43 + } else {
  44 + // linux
  45 + System.setProperty("webdriver.chrome.driver", LINUX_DRIVER_PATH);
  46 + }
  47 +
  48 + // 初始化 WebDriver
  49 + WebDriver driver = new org.openqa.selenium.firefox.FirefoxDriver(options);
  50 + return driver;
  51 + }
  52 +}
... ...
src/main/resources/application-local.yml
... ... @@ -57,13 +57,13 @@ spring:
57 57 testWhileIdle: true
58 58 testOnBorrow: true
59 59 testOnReturn: true
60   - password: canrd@2024
  60 + password: 123456
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
66   - host: 39.108.227.113
  66 + host: localhost
67 67 lettuce:
68 68 pool:
69 69 max-active: 2000
... ...