Commit ab0c014a729e8e154cda59c3d2c456768beaf55f
1 parent
cfb42811
feat:
1、science 爬取
Showing
2 changed files
with
30 additions
and
13 deletions
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
... | ... | @@ -6,6 +6,8 @@ import org.openqa.selenium.chrome.ChromeOptions; |
6 | 6 | import org.springframework.context.annotation.Bean; |
7 | 7 | import org.springframework.context.annotation.Configuration; |
8 | 8 | |
9 | +import java.util.Arrays; | |
10 | + | |
9 | 11 | /** |
10 | 12 | * @author: xms |
11 | 13 | * @description: TODO |
... | ... | @@ -31,6 +33,32 @@ public class SeleniumConfig { |
31 | 33 | // 禁用本地缓存,确保每次访问都从服务器获取 |
32 | 34 | options.addArguments("--disable-application-cache"); |
33 | 35 | |
36 | + // 禁用信息栏(infobar)提示 | |
37 | + options.addArguments("--disable-infobars"); | |
38 | + // 解决DevToolsActivePort文件不存在的报错 | |
39 | + options.addArguments("--no-sandbox"); | |
40 | + // 指定浏览器分辨率 | |
41 | + options.addArguments("window-size=1920x3000"); | |
42 | + // 谷歌文档提到需要加上这个属性来规避bug | |
43 | + options.addArguments("--disable-gpu"); | |
44 | + // 隐身模式(无痕模式) | |
45 | + options.addArguments("--incognito"); | |
46 | + // 最大化运行(全屏窗口),不设置,取元素会报错 | |
47 | + options.addArguments("--start-maximized"); | |
48 | + // 禁用浏览器正在被自动化程序控制的提示 | |
49 | + options.addArguments("--disable-infobars"); | |
50 | + // 隐藏滚动条, 应对一些特殊页面 | |
51 | + options.addArguments("--hide-scrollbars"); | |
52 | + // 不加载图片, 提升速度 | |
53 | + options.addArguments("blink-settings=imagesEnabled=false"); | |
54 | + // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 | |
55 | + options.addArguments("--headless"); | |
56 | + //禁用 blink 特征 | |
57 | + options.addArguments("disable-blink-features=AutomationControlled"); | |
58 | + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); | |
59 | + options.setExperimentalOption("useAutomationExtension", false); | |
60 | + options.addArguments("--remote-allow-origins=*"); | |
61 | + | |
34 | 62 | String os_name = System.getProperty("os.name"); |
35 | 63 | // 判断是否是windows系统 |
36 | 64 | if (os_name.toLowerCase().startsWith("win")) { | ... | ... |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
... | ... | @@ -28,7 +28,7 @@ import java.util.Map; |
28 | 28 | @Slf4j |
29 | 29 | @Component |
30 | 30 | public class SeleniumDownloader extends AbstractDownloader { |
31 | - private int sleepTime = 30; | |
31 | + private int sleepTime = 3000; | |
32 | 32 | |
33 | 33 | @Resource |
34 | 34 | private SeleniumConfig config; |
... | ... | @@ -67,24 +67,13 @@ public class SeleniumDownloader extends AbstractDownloader { |
67 | 67 | webDriver.get(request.getUrl()); |
68 | 68 | try { |
69 | 69 | if (sleepTime > 0) { |
70 | + //休眠3秒就是为了动态的数据渲染完成后再进行获取 | |
70 | 71 | Thread.sleep(sleepTime); |
71 | 72 | } |
72 | 73 | } catch (InterruptedException e) { |
73 | 74 | e.printStackTrace(); |
74 | 75 | } |
75 | 76 | |
76 | - | |
77 | - /* | |
78 | - * TODO You can add mouse event or other processes | |
79 | - * | |
80 | - * @author: bob.li.0718@gmail.com | |
81 | - */ | |
82 | - try { | |
83 | - //休眠3秒就是为了动态的数据渲染完成后在进行获取 | |
84 | - Thread.sleep(3000); | |
85 | - } catch (InterruptedException e) { | |
86 | - throw new RuntimeException(e); | |
87 | - } | |
88 | 77 | WebElement webElement = webDriver.findElement(By.xpath("/html")); |
89 | 78 | String content = webElement.getAttribute("outerHTML"); |
90 | 79 | page.setDownloadSuccess(true); | ... | ... |