SeleniumDownloader.java 3.19 KB
package com.canrd.webmagic.processor.download;

import com.canrd.webmagic.config.SeleniumConfig;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import javax.annotation.Resource;
import java.util.Map;

/**
 * WebMagic {@link us.codecraft.webmagic.downloader.Downloader} implementation that loads a page
 * in a Selenium-driven browser and returns the rendered {@code <html>} element as the page body.
 *
 * <p>For each request the downloader obtains a {@link WebDriver} from {@link SeleniumConfig},
 * copies the site's cookies into it, navigates to the request URL, optionally sleeps to let
 * client-side rendering finish, and then captures the {@code outerHTML} of the root element.
 *
 * @author: xms
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {
    /**
     * Milliseconds to sleep after navigation before reading the DOM.
     * Values less than or equal to zero disable the sleep.
     */
    private int sleepTime = 3000;

    @Resource
    private SeleniumConfig config;

    /**
     * Sets how long to wait after navigation so the page can finish loading.
     *
     * @param sleepTime wait time in milliseconds; a value {@code <= 0} skips the wait
     * @return this downloader, for call chaining
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Downloads the page for {@code request} through a Selenium browser.
     *
     * <p>Any exception raised while driving the browser is logged, reported through
     * {@code onError}, and results in the failed page created by {@link Page#fail()} being
     * returned. The driver window is closed in all cases.
     *
     * @param request the request whose URL is loaded
     * @param task    the task supplying the {@link Site} (cookies, user agent)
     * @return a successful page holding the rendered HTML, or a failed page on error
     */
    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        WebDriver webDriver = null;
        try {
            // Key 0 is the default browser (Google Chrome, per the original author's note).
            webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
            WebDriver.Options options = webDriver.manage();
            Site site = task.getSite();
            // NOTE(review): this only updates the shared Site object; nothing in this method
            // hands the user agent to the WebDriver — confirm it is applied elsewhere.
            site.setUserAgent(Agent.getRandom());
            // NOTE(review): Selenium documents that cookies can only be added for the domain the
            // browser is currently on; here they are added before navigation — confirm this
            // works with the configured driver.
            addSiteCookies(options, site);

            log.info("downloading page {}", request.getUrl());

            webDriver.get(request.getUrl());
            waitForRendering();

            WebElement root = webDriver.findElement(By.xpath("/html"));
            String content = root.getAttribute("outerHTML");
            page.setDownloadSuccess(true);
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            onSuccess(request, task);
        } catch (Exception e) {
            log.warn("download page {} error", request.getUrl(), e);
            onError(request, task, e);
        } finally {
            closeQuietly(webDriver, request);
        }
        return page;
    }

    /**
     * Thread count hint from the framework; this implementation ignores it.
     *
     * @param i requested thread count (unused)
     */
    @Override
    public void setThread(int i) {
        // Intentionally empty: no per-thread resources are configured here.
    }

    /** Copies every cookie configured on the site into the browser session. */
    private void addSiteCookies(WebDriver.Options options, Site site) {
        Map<String, String> cookies = site.getCookies();
        if (cookies == null) {
            return;
        }
        for (Map.Entry<String, String> cookieEntry : cookies.entrySet()) {
            options.addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue()));
        }
    }

    /** Sleeps for {@link #sleepTime} ms so dynamically rendered data is present before the DOM is read. */
    private void waitForRendering() {
        if (sleepTime <= 0) {
            return;
        }
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers and the owning executor can observe it.
            Thread.currentThread().interrupt();
            log.warn("interrupted while waiting for page rendering", e);
        }
    }

    /** Closes the driver window, logging rather than propagating any failure so the page result is kept. */
    private void closeQuietly(WebDriver webDriver, Request request) {
        if (webDriver == null) {
            return;
        }
        try {
            webDriver.close();
        } catch (RuntimeException e) {
            log.warn("failed to close web driver after downloading {}", request.getUrl(), e);
        }
    }
}