SeleniumDownloader.java
3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
package com.canrd.webmagic.processor.download;
import com.canrd.webmagic.config.SeleniumConfig;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import javax.annotation.Resource;
import java.util.Map;
/**
 * @author: xms
 * @description: WebMagic downloader that loads each request in a Selenium-driven browser,
 * waits a configurable time for client-side rendering, and returns the resulting DOM as the
 * page content.
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {

    /** Milliseconds to pause after navigation before the DOM is read; values <= 0 skip the pause. */
    private int sleepTime = 3000;

    @Resource
    private SeleniumConfig config;

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime time to wait after navigation, in milliseconds
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Loads the request URL in a browser and captures the rendered HTML.
     *
     * <p>Any exception raised while driving the browser is logged and reported through
     * {@code onError}; in that case the failed page created up front is returned.
     *
     * @param request request whose URL is loaded
     * @param task    task supplying the {@link Site} (and therefore the cookies) to use
     * @return a page holding the rendered HTML on success, otherwise the failed page
     */
    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        WebDriver webDriver = null;
        try {
            // Key 0 selects the default browser (Google Chrome).
            webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
            Site site = task.getSite();
            // NOTE(review): this only updates the Site; nothing in this class hands the
            // user agent to the WebDriver - confirm SeleniumConfig applies it.
            site.setUserAgent(Agent.getRandom());

            String url = request.getUrl();
            log.info("downloading page {}", url);
            webDriver.get(url);
            // Selenium only accepts cookies for the domain of the currently loaded document,
            // so they are added after the first navigation and the page is then loaded again
            // so that the request actually carries them.
            if (addSiteCookies(webDriver, site)) {
                webDriver.get(url);
            }
            waitForRendering();

            WebElement root = webDriver.findElement(By.xpath("/html"));
            String content = root.getAttribute("outerHTML");
            page.setDownloadSuccess(true);
            page.setRawText(content);
            page.setHtml(new Html(content, url));
            page.setUrl(new PlainText(url));
            page.setRequest(request);
            onSuccess(request, task);
        } catch (Exception e) {
            log.warn("download page {} error", request.getUrl(), e);
            onError(request, task, e);
        } finally {
            closeQuietly(webDriver);
        }
        return page;
    }

    /** No-op: this downloader keeps no thread-count dependent state. */
    @Override
    public void setThread(int i) {
    }

    /**
     * Copies the cookies configured on the site into the browser session.
     *
     * @return {@code true} if at least one cookie was added
     */
    private boolean addSiteCookies(WebDriver webDriver, Site site) {
        Map<String, String> cookies = site.getCookies();
        if (cookies == null || cookies.isEmpty()) {
            return false;
        }
        WebDriver.Options options = webDriver.manage();
        for (Map.Entry<String, String> entry : cookies.entrySet()) {
            options.addCookie(new Cookie(entry.getKey(), entry.getValue()));
        }
        return true;
    }

    /** Pauses for {@code sleepTime} so dynamically rendered data is present before the DOM is read. */
    private void waitForRendering() {
        if (sleepTime <= 0) {
            return;
        }
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            // Restore the flag so the owning thread pool can still observe the interruption.
            Thread.currentThread().interrupt();
            log.warn("interrupted while waiting for page rendering", e);
        }
    }

    /** Closes the current browser window, logging instead of propagating any failure. */
    private void closeQuietly(WebDriver webDriver) {
        if (webDriver == null) {
            return;
        }
        try {
            // close() rather than quit(): the original author deliberately left quit() disabled.
            webDriver.close();
        } catch (Exception e) {
            log.warn("failed to close web driver", e);
        }
    }
}