SeleniumDownloader.java
6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
package com.canrd.webmagic.processor.download;
import com.canrd.webmagic.config.SeleniumConfig;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import javax.annotation.Resource;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * WebMagic downloader that renders pages in a real browser through Selenium WebDriver.
 *
 * <p>Each call to {@link #download(Request, Task)} obtains a driver from {@link SeleniumConfig},
 * loads the requested URL, runs extra browser interactions for two specific hard-coded URLs,
 * pauses for {@code sleepTime} milliseconds, and then returns the {@code outerHTML} of the
 * root element as the page content. The driver is always quit before the method returns.
 *
 * @author: xms
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {

    /** Compiled once; matches the Elsevier identity "resume" redirect URL. */
    private static final Pattern ELSEVIER_RESUME_URL =
            Pattern.compile("https://id.elsevier.com/as/[a-zA-Z0-9]+/resume/as/");

    /** Request URL that triggers the Cell quick-search interaction. */
    private static final String CELL_HOME_URL = "https://www.cell.com/matter/home";

    /** URL fragment the browser is expected to reach after the Cell search is submitted. */
    private static final String CELL_SEARCH_URL_PREFIX = "https://www.cell.com/action/doSearch?";

    /** Request URL that triggers the Elsevier login interaction. */
    private static final String SCIENCE_DIRECT_SEARCH_URL =
            "https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494";

    /** URL fragment the browser is expected to be redirected to before the login form is filled. */
    private static final String ELSEVIER_ID_URL_PREFIX = "https://id.elsevier.com/";

    /** Account e-mail typed into the Elsevier login form. TODO: move to external configuration. */
    private static final String ELSEVIER_LOGIN_EMAIL = "1187551704@qq.com";

    /** Milliseconds to pause after page interactions before reading the DOM; {@code <= 0} disables the pause. */
    private int sleepTime = 10000;

    @Resource
    private SeleniumConfig config;

    /**
     * Sets how long to wait, in milliseconds, before the rendered DOM is read.
     *
     * @param sleepTime pause in milliseconds; a value {@code <= 0} disables the pause
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Tells whether the given URL contains an Elsevier identity "resume" redirect path.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return {@code true} if the pattern is found anywhere in {@code url}, {@code false}
     *         otherwise (including when {@code url} is {@code null})
     */
    public static boolean checkUrl(String url) {
        if (url == null) {
            return false;
        }
        Matcher matcher = ELSEVIER_RESUME_URL.matcher(url);
        return matcher.find();
    }

    /**
     * Loads the request URL in a browser and returns the rendered DOM as a {@link Page}.
     *
     * @param request the request whose URL is loaded
     * @param task    the task supplying the {@link Site} (its cookies are copied into the browser)
     * @return a page marked successful and carrying the root element's {@code outerHTML}, or the
     *         initial {@link Page#fail()} instance if any step threw an exception
     */
    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        WebDriver webDriver = null;
        boolean interrupted = false;
        try {
            // Index 0 is the default driver (Google Chrome).
            webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
            Site site = task.getSite();
            site.setUserAgent(Agent.getRandom());
            applyCookies(webDriver, site);

            String url = request.getUrl();
            log.info("downloading page {}", url);
            webDriver.get(url);

            if (CELL_HOME_URL.equals(url)) {
                runCellSearch(webDriver);
            }
            if (SCIENCE_DIRECT_SEARCH_URL.equals(url)) {
                submitElsevierLogin(webDriver);
            }

            // Give dynamically rendered content time to finish before reading the DOM.
            interrupted = pauseForRendering();

            String content = webDriver.findElement(By.xpath("/html")).getAttribute("outerHTML");
            page.setDownloadSuccess(true);
            page.setRawText(content);
            page.setHtml(new Html(content, url));
            page.setUrl(new PlainText(url));
            page.setRequest(request);
            log.info("onSuccess");
            onSuccess(request, task);
        } catch (Exception e) {
            log.warn("download page {} error", request.getUrl(), e);
            onError(request, task, e);
        } finally {
            quitQuietly(webDriver);
            if (interrupted) {
                // Restore the interrupt status only after the browser has been shut down,
                // so the cleanup call itself is not affected by the pending interrupt.
                Thread.currentThread().interrupt();
            }
        }
        return page;
    }

    /** Thread count is ignored by this downloader. */
    @Override
    public void setThread(int i) {
    }

    /**
     * Copies the site's cookies into the browser session.
     *
     * <p>NOTE(review): Selenium documents that a cookie can only be added for the domain of the
     * page currently loaded, yet this runs before navigation - confirm this works with the
     * configured driver when the site actually defines cookies.
     */
    private void applyCookies(WebDriver webDriver, Site site) {
        Map<String, String> cookies = site.getCookies();
        if (cookies == null || cookies.isEmpty()) {
            return;
        }
        WebDriver.Options options = webDriver.manage();
        for (Map.Entry<String, String> entry : cookies.entrySet()) {
            options.addCookie(new Cookie(entry.getKey(), entry.getValue()));
        }
    }

    /** Types the fixed query into the Cell quick-search box, submits it and waits for the results URL. */
    private void runCellSearch(WebDriver webDriver) {
        webDriver.findElement(By.id("searchText")).sendKeys("Aluminum foil");
        webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button")).submit();
        new WebDriverWait(webDriver, 30).until(ExpectedConditions.urlContains(CELL_SEARCH_URL_PREFIX));
    }

    /** Waits for the redirect to the Elsevier identity page, then enters the e-mail and submits the form. */
    private void submitElsevierLogin(WebDriver webDriver) {
        WebDriverWait wait = new WebDriverWait(webDriver, 60);
        Boolean redirected = wait.until(ExpectedConditions.urlContains(ELSEVIER_ID_URL_PREFIX));
        if (!Boolean.TRUE.equals(redirected)) {
            // NOTE(review): the wait appears to throw on timeout rather than return false,
            // so this branch is presumably never taken - confirm.
            log.info("跳转失败");
            return;
        }
        log.info("currentUrl={}", webDriver.getCurrentUrl());
        // The full page source can be very large; keep it out of INFO-level logs.
        log.debug("{}", webDriver.getPageSource());
        // Type the address into the field; the previous getAttribute(...) call only looked up
        // an attribute named after the address and left the field empty.
        webDriver.findElement(By.xpath("//input[@id='bdd-email']")).sendKeys(ELSEVIER_LOGIN_EMAIL);
        webDriver.findElement(By.xpath("//button[@id='bdd-els-searchBtn']")).submit();
    }

    /**
     * Sleeps for {@code sleepTime} milliseconds when it is positive.
     *
     * @return {@code true} if the sleep was interrupted; the caller is responsible for restoring
     *         the thread's interrupt status
     */
    private boolean pauseForRendering() {
        if (sleepTime <= 0) {
            return false;
        }
        try {
            Thread.sleep(sleepTime);
            return false;
        } catch (InterruptedException e) {
            log.warn("interrupted while waiting for page rendering", e);
            return true;
        }
    }

    /** Quits the driver if present, logging instead of propagating any failure from cleanup. */
    private void quitQuietly(WebDriver webDriver) {
        if (webDriver == null) {
            return;
        }
        try {
            // quit() already closes every window of the session; a separate close() is redundant
            // and, if it threw, would skip quit() and leak the browser process.
            webDriver.quit();
        } catch (RuntimeException e) {
            log.warn("failed to quit web driver", e);
        }
    }
}