Commit ab0c014a729e8e154cda59c3d2c456768beaf55f

Authored by 谢茂盛
1 parent cfb42811

feat:

1、science 爬取
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
@@ -6,6 +6,8 @@ import org.openqa.selenium.chrome.ChromeOptions; @@ -6,6 +6,8 @@ import org.openqa.selenium.chrome.ChromeOptions;
6 import org.springframework.context.annotation.Bean; 6 import org.springframework.context.annotation.Bean;
7 import org.springframework.context.annotation.Configuration; 7 import org.springframework.context.annotation.Configuration;
8 8
  9 +import java.util.Arrays;
  10 +
9 /** 11 /**
10 * @author: xms 12 * @author: xms
11 * @description: TODO 13 * @description: TODO
@@ -31,6 +33,32 @@ public class SeleniumConfig { @@ -31,6 +33,32 @@ public class SeleniumConfig {
31 // 禁用本地缓存,确保每次访问都从服务器获取 33 // 禁用本地缓存,确保每次访问都从服务器获取
32 options.addArguments("--disable-application-cache"); 34 options.addArguments("--disable-application-cache");
33 35
  36 + // 禁用信息栏(infobar)提示
  37 + options.addArguments("--disable-infobars");
  38 + // 解决DevToolsActivePort文件不存在的报错
  39 + options.addArguments("--no-sandbox");
  40 + // 指定浏览器分辨率
  41 + options.addArguments("window-size=1920x3000");
  42 + // 谷歌文档提到需要加上这个属性来规避bug
  43 + options.addArguments("--disable-gpu");
  44 + // 隐身模式(无痕模式)
  45 + options.addArguments("--incognito");
  46 + // 最大化运行(全屏窗口),不设置,取元素会报错
  47 + options.addArguments("--start-maximized");
  48 + // 禁用浏览器正在被自动化程序控制的提示
  49 + options.addArguments("--disable-infobars");
  50 + // 隐藏滚动条, 应对一些特殊页面
  51 + options.addArguments("--hide-scrollbars");
  52 + // 不加载图片, 提升速度
  53 + options.addArguments("blink-settings=imagesEnabled=false");
  54 + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
  55 + options.addArguments("--headless");
  56 + //禁用 blink 特征
  57 + options.addArguments("disable-blink-features=AutomationControlled");
  58 + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
  59 + options.setExperimentalOption("useAutomationExtension", false);
  60 + options.addArguments("--remote-allow-origins=*");
  61 +
34 String os_name = System.getProperty("os.name"); 62 String os_name = System.getProperty("os.name");
35 // 判断是否是windows系统 63 // 判断是否是windows系统
36 if (os_name.toLowerCase().startsWith("win")) { 64 if (os_name.toLowerCase().startsWith("win")) {
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -28,7 +28,7 @@ import java.util.Map; @@ -28,7 +28,7 @@ import java.util.Map;
28 @Slf4j 28 @Slf4j
29 @Component 29 @Component
30 public class SeleniumDownloader extends AbstractDownloader { 30 public class SeleniumDownloader extends AbstractDownloader {
31 - private int sleepTime = 30; 31 + private int sleepTime = 3000;
32 32
33 @Resource 33 @Resource
34 private SeleniumConfig config; 34 private SeleniumConfig config;
@@ -67,24 +67,13 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -67,24 +67,13 @@ public class SeleniumDownloader extends AbstractDownloader {
67 webDriver.get(request.getUrl()); 67 webDriver.get(request.getUrl());
68 try { 68 try {
69 if (sleepTime > 0) { 69 if (sleepTime > 0) {
  70 + //休眠3秒就是为了动态的数据渲染完成后再进行获取
70 Thread.sleep(sleepTime); 71 Thread.sleep(sleepTime);
71 } 72 }
72 } catch (InterruptedException e) { 73 } catch (InterruptedException e) {
73 e.printStackTrace(); 74 e.printStackTrace();
74 } 75 }
75 76
76 -  
77 - /*  
78 - * TODO You can add mouse event or other processes  
79 - *  
80 - * @author: bob.li.0718@gmail.com  
81 - */  
82 - try {  
83 - //休眠3秒就是为了动态的数据渲染完成后在进行获取  
84 - Thread.sleep(3000);  
85 - } catch (InterruptedException e) {  
86 - throw new RuntimeException(e);  
87 - }  
88 WebElement webElement = webDriver.findElement(By.xpath("/html")); 77 WebElement webElement = webDriver.findElement(By.xpath("/html"));
89 String content = webElement.getAttribute("outerHTML"); 78 String content = webElement.getAttribute("outerHTML");
90 page.setDownloadSuccess(true); 79 page.setDownloadSuccess(true);