Commit ab0c014a729e8e154cda59c3d2c456768beaf55f
1 parent
cfb42811
feat:
1、science 爬取
Showing
2 changed files
with
30 additions
and
13 deletions
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
@@ -6,6 +6,8 @@ import org.openqa.selenium.chrome.ChromeOptions; | @@ -6,6 +6,8 @@ import org.openqa.selenium.chrome.ChromeOptions; | ||
6 | import org.springframework.context.annotation.Bean; | 6 | import org.springframework.context.annotation.Bean; |
7 | import org.springframework.context.annotation.Configuration; | 7 | import org.springframework.context.annotation.Configuration; |
8 | 8 | ||
9 | +import java.util.Arrays; | ||
10 | + | ||
9 | /** | 11 | /** |
10 | * @author: xms | 12 | * @author: xms |
11 | * @description: TODO | 13 | * @description: TODO |
@@ -31,6 +33,32 @@ public class SeleniumConfig { | @@ -31,6 +33,32 @@ public class SeleniumConfig { | ||
31 | // 禁用本地缓存,确保每次访问都从服务器获取 | 33 | // 禁用本地缓存,确保每次访问都从服务器获取 |
32 | options.addArguments("--disable-application-cache"); | 34 | options.addArguments("--disable-application-cache"); |
33 | 35 | ||
36 | + // 禁止策略化 | ||
37 | + options.addArguments("--disable-infobars"); | ||
38 | + // 解决DevToolsActivePort文件不存在的报错 | ||
39 | + options.addArguments("--no-sandbox"); | ||
40 | + // 指定浏览器分辨率 | ||
41 | + options.addArguments("window-size=1920x3000"); | ||
42 | + // 谷歌文档提到需要加上这个属性来规避bug | ||
43 | + options.addArguments("--disable-gpu"); | ||
44 | + // 隐身模式(无痕模式) | ||
45 | + options.addArguments("--incognito"); | ||
46 | + // 最大化运行(全屏窗口),不设置,取元素会报错 | ||
47 | + options.addArguments("--start-maximized"); | ||
48 | + // 禁用浏览器正在被自动化程序控制的提示 | ||
49 | + options.addArguments("--disable-infobars"); | ||
50 | + // 隐藏滚动条, 应对一些特殊页面 | ||
51 | + options.addArguments("--hide-scrollbars"); | ||
52 | + // 不加载图片, 提升速度 | ||
53 | + options.addArguments("blink-settings=imagesEnabled=false"); | ||
54 | + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 | ||
55 | + options.addArguments("--headless"); | ||
56 | + //禁用 blink 特征 | ||
57 | + options.addArguments("disable-blink-features=AutomationControlled"); | ||
58 | + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | ||
59 | + options.setExperimentalOption("useAutomationExtension", false); | ||
60 | + options.addArguments("--remote-allow-origins=*"); | ||
61 | + | ||
34 | String os_name = System.getProperty("os.name"); | 62 | String os_name = System.getProperty("os.name"); |
35 | // 判断是否是windows系统 | 63 | // 判断是否是windows系统 |
36 | if (os_name.toLowerCase().startsWith("win")) { | 64 | if (os_name.toLowerCase().startsWith("win")) { |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
@@ -28,7 +28,7 @@ import java.util.Map; | @@ -28,7 +28,7 @@ import java.util.Map; | ||
28 | @Slf4j | 28 | @Slf4j |
29 | @Component | 29 | @Component |
30 | public class SeleniumDownloader extends AbstractDownloader { | 30 | public class SeleniumDownloader extends AbstractDownloader { |
31 | - private int sleepTime = 30; | 31 | + private int sleepTime = 3000; |
32 | 32 | ||
33 | @Resource | 33 | @Resource |
34 | private SeleniumConfig config; | 34 | private SeleniumConfig config; |
@@ -67,24 +67,13 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -67,24 +67,13 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
67 | webDriver.get(request.getUrl()); | 67 | webDriver.get(request.getUrl()); |
68 | try { | 68 | try { |
69 | if (sleepTime > 0) { | 69 | if (sleepTime > 0) { |
70 | + //休眠3秒就是为了动态的数据渲染完成后再进行获取 | ||
70 | Thread.sleep(sleepTime); | 71 | Thread.sleep(sleepTime); |
71 | } | 72 | } |
72 | } catch (InterruptedException e) { | 73 | } catch (InterruptedException e) { |
73 | e.printStackTrace(); | 74 | e.printStackTrace(); |
74 | } | 75 | } |
75 | 76 | ||
76 | - | ||
77 | - /* | ||
78 | - * TODO You can add mouse event or other processes | ||
79 | - * | ||
80 | - * @author: bob.li.0718@gmail.com | ||
81 | - */ | ||
82 | - try { | ||
83 | - //休眠3秒就是为了动态的数据渲染完成后在进行获取 | ||
84 | - Thread.sleep(3000); | ||
85 | - } catch (InterruptedException e) { | ||
86 | - throw new RuntimeException(e); | ||
87 | - } | ||
88 | WebElement webElement = webDriver.findElement(By.xpath("/html")); | 77 | WebElement webElement = webDriver.findElement(By.xpath("/html")); |
89 | String content = webElement.getAttribute("outerHTML"); | 78 | String content = webElement.getAttribute("outerHTML"); |
90 | page.setDownloadSuccess(true); | 79 | page.setDownloadSuccess(true); |