SeleniumDownloader.java
4.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
package com.canrd.webmagic.processor.download;
import com.canrd.webmagic.config.SeleniumConfig;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import javax.annotation.Resource;
import java.util.Map;
/**
 * WebMagic {@link AbstractDownloader} that renders pages in a real browser through Selenium.
 *
 * <p>For each request a {@link WebDriver} is obtained from {@link SeleniumConfig}, the site's
 * cookies are added to it, the URL is loaded, the thread sleeps for {@code sleepTime}
 * milliseconds, and the {@code outerHTML} of the {@code /html} element is stored on the
 * returned {@link Page}.
 *
 * <p>NOTE(review): the class is a Spring singleton with a mutable {@code sleepTime} field and
 * mutates the {@link Site} it receives from the task; confirm whether it is used from several
 * crawler threads at once.
 *
 * @author: xms
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {

    /** Landing page that triggers the scripted quick-search flow. */
    private static final String MATTER_HOME_URL = "https://www.cell.com/matter/home";

    /** URL fragment the browser is expected to reach once the quick search has been submitted. */
    private static final String SEARCH_RESULT_URL_PREFIX = "https://www.cell.com/action/doSearch?";

    /** Keywords typed into the quick-search box on the landing page. */
    private static final String SEARCH_KEYWORDS = "Aluminum foil";

    /** Maximum number of seconds to wait for the redirect to the search result page. */
    private static final int SEARCH_REDIRECT_TIMEOUT_SECONDS = 30;

    /**
     * Milliseconds to sleep after the page has been loaded, before its HTML is captured.
     *
     * <p>NOTE(review): the default is 3,000,000 ms (50 minutes), although the original inline
     * comment spoke of a 3 second pause. The value is kept as-is; confirm whether 3000 was
     * intended.
     */
    private int sleepTime = 3000000;

    @Resource
    private SeleniumConfig config;

    /**
     * Sets the time to sleep after loading a page so that dynamic content can finish rendering.
     *
     * @param sleepTime sleep time in milliseconds; values {@code <= 0} disable the pause
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Loads the request URL in a browser and returns the rendered document.
     *
     * <p>Any exception raised while driving the browser is logged, reported through
     * {@code onError}, and results in the failed page created by {@link Page#fail()} being
     * returned. The driver's current window is always closed before returning.
     *
     * @param request request whose URL is loaded
     * @param task    task supplying the {@link Site} (cookies are read from it, and its user
     *                agent is overwritten with a random one)
     * @return the populated page on success, otherwise the failed page
     */
    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        WebDriver webDriver = null;
        try {
            // Index 0 of DRIVER_KEYS is the default driver (Chrome, per the original comment).
            webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
            WebDriver.Options manage = webDriver.manage();

            Site site = task.getSite();
            site.setUserAgent(Agent.getRandom());
            addSiteCookies(manage, site);

            String url = request.getUrl();
            log.info("downloading page {}", url);
            webDriver.get(url);

            if (url.equals(MATTER_HOME_URL)) {
                submitQuickSearch(webDriver);
            }

            pauseForRendering();

            // NOTE(review): earlier revisions carried disabled code that clicked a Cloudflare
            // checkbox and polled for an <h2 class="h2"> verification banner; no such handling
            // is active here.
            populatePage(page, request, webDriver);
            onSuccess(request, task);
        } catch (Exception e) {
            log.warn("download page {} error", request.getUrl(), e);
            onError(request, task, e);
        } finally {
            if (webDriver != null) {
                // NOTE(review): close() only closes the current window; quit() would end the
                // whole session. quit() was left disabled in the original, presumably because
                // SeleniumConfig manages the driver lifecycle; confirm.
                webDriver.close();
            }
        }
        return page;
    }

    /**
     * Copies every cookie configured on the site into the browser session.
     *
     * <p>NOTE(review): this runs before the first navigation, as in the original; some drivers
     * reject cookies for a domain the browser is not currently on. Verify against the driver
     * in use.
     */
    private void addSiteCookies(WebDriver.Options manage, Site site) {
        Map<String, String> cookies = site.getCookies();
        if (cookies == null) {
            return;
        }
        for (Map.Entry<String, String> cookieEntry : cookies.entrySet()) {
            manage.addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue()));
        }
    }

    /** Types the fixed keywords into the quick-search box, submits, and waits for the redirect. */
    private void submitQuickSearch(WebDriver webDriver) {
        WebElement searchText = webDriver.findElement(By.id("searchText"));
        searchText.sendKeys(SEARCH_KEYWORDS);
        WebElement element = webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button"));
        element.submit();
        WebDriverWait wait = new WebDriverWait(webDriver, SEARCH_REDIRECT_TIMEOUT_SECONDS);
        wait.until(ExpectedConditions.urlContains(SEARCH_RESULT_URL_PREFIX));
    }

    /**
     * Sleeps for {@code sleepTime} milliseconds so dynamically rendered data can finish loading.
     *
     * <p>If the thread is interrupted the wait ends early, the interrupt flag is restored so
     * that callers can observe it, and the download continues with whatever has been rendered.
     */
    private void pauseForRendering() {
        if (sleepTime <= 0) {
            return;
        }
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            // Do not swallow the interrupt: restore the flag instead of printing a stack trace.
            Thread.currentThread().interrupt();
            log.warn("interrupted while waiting for page rendering", e);
        }
    }

    /** Captures the document's outer HTML and fills the page with it and the request data. */
    private void populatePage(Page page, Request request, WebDriver webDriver) {
        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        page.setDownloadSuccess(true);
        page.setRawText(content);
        page.setHtml(new Html(content, request.getUrl()));
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
    }

    /**
     * Intentionally a no-op: the thread count passed in is ignored by this downloader.
     *
     * @param i requested thread count (unused)
     */
    @Override
    public void setThread(int i) {
    }
}