package com.canrd.webmagic.processor.download; import com.canrd.webmagic.config.SeleniumConfig; import com.canrd.webmagic.processor.config.Agent; import lombok.extern.slf4j.Slf4j; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import javax.annotation.Resource; import java.util.Map; /** * @author: xms * @description: TODO * @date: 2024/4/26 16:36 * @version: 1.0 */ @Slf4j @Component public class SeleniumDownloader extends AbstractDownloader { private int sleepTime = 3000; @Resource private SeleniumConfig config; /** * set sleep time to wait until load success * * @param sleepTime sleepTime * @return this */ public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } @Override public Page download(Request request, Task task) { Page page = Page.fail(); WebDriver webDriver = null; try { webDriver = config.webDriver(); WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); site.setUserAgent(Agent.getRandom()); if (site.getCookies() != null) { for (Map.Entry<String, String> cookieEntry : site.getCookies() .entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } log.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); try { if (sleepTime > 0) { //休眠3秒就是为了动态的数据渲染完成后在进行获取 Thread.sleep(sleepTime); } } catch (InterruptedException e) { e.printStackTrace(); } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); page.setDownloadSuccess(true); page.setRawText(content); page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); onSuccess(request, task); } catch (Exception e) { log.warn("download page {} error", request.getUrl(), e); onError(request, task, e); } finally { if (webDriver != null) { webDriver.close(); webDriver.quit(); webDriver = null; } } return page; } @Override public void setThread(int i) { } }