// SeleniumDownloader.java (6.25 KB)
package com.canrd.webmagic.processor.download;

import com.canrd.webmagic.config.SeleniumConfig;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import javax.annotation.Resource;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author: xms
 * @description: TODO
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {
    private int sleepTime = 3000000;

    @Resource
    private SeleniumConfig config;

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    public static boolean checkUrl(String url) {
        String pattern = "https://id.elsevier.com/as/[a-zA-Z0-9]+/resume/as/";
        Pattern r = Pattern.compile(pattern);
        Matcher m = r.matcher(url);
        return m.find();
    }

    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        WebDriver webDriver = null;
        try {
            //0默认谷歌浏览器
            webDriver = config.getWebDriver(SeleniumConfig.DRIVER_KEYS.get(0));
            WebDriver.Options manage = webDriver.manage();
            Site site = task.getSite();
            site.setUserAgent(Agent.getRandom());
            if (site.getCookies() != null) {
                for (Map.Entry<String, String> cookieEntry : site.getCookies()
                        .entrySet()) {
                    Cookie cookie = new Cookie(cookieEntry.getKey(),
                            cookieEntry.getValue());
                    manage.addCookie(cookie);
                }
            }

            log.info("downloading page " + request.getUrl());
            webDriver.get(request.getUrl());
            if (request.getUrl().equals("https://www.cell.com/matter/home")) {
                WebElement searchText = webDriver.findElement(By.id("searchText"));
                searchText.sendKeys("Aluminum foil");
                WebElement element = webDriver.findElement(By.xpath("//div[@class='quick-search__toggle']/button"));
                element.submit();
                WebDriverWait wait = new WebDriverWait(webDriver, 30);
                wait.until(ExpectedConditions.urlContains("https://www.cell.com/action/doSearch?"));

//                wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//div[@class='cb-c']")));
//                WebElement cloudFlare = webDriver.findElement(By.xpath("//div[@class='cb-c']/label/input"));
//                if (cloudFlare!=null){
//                    cloudFlare.click();
//                }
            }
            if (request.getUrl().equals("https://www.sciencedirect.com/search?qs=battery&pub=Nano%20Today&cid=273494")) {
                WebDriverWait wait = new WebDriverWait(webDriver, 60);
                Boolean until = wait.until(ExpectedConditions.urlContains("https://id.elsevier.com/"));
                if (until) {
                    log.info(webDriver.getCurrentUrl());
//                    if (checkUrl(webDriver.getCurrentUrl())) {
                    String currentUrl = webDriver.getCurrentUrl();
                    log.info("currentUrl=" + currentUrl);
                    String pageSource = webDriver.getPageSource();
                    log.info(pageSource);
//                    WebElement element = webDriver.findElement(By.xpath("//div[@class='form-row']/from/div[@id='jsEnabled']/input"));
                    webDriver.findElement(By.xpath("//a[@class='ot-sdk-show-settings cookie anchor-text']")).click();
                    webDriver.findElement(By.xpath("//input[@id='bdd-email']")).getAttribute("1187551704@qq.com");
                    webDriver.findElement(By.xpath("//button[@id='bdd-els-searchBtn']")).submit();
//                    String text = element.getText();
//                    log.info(text);
//                    }
                } else {
                    log.info("跳转失败");
                }

            }
            try {
                if (sleepTime > 0) {
                    //休眠3秒就是为了动态的数据渲染完成后在进行获取
                    Thread.sleep(sleepTime);
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

//            WebElement targetElement;
//            do {
//                try {
//                    targetElement = webDriver.findElement(By.xpath("//h2[@class=\"h2\"]"));
//                    log.info(String.valueOf(targetElement));
//                    log.info("等待验证中");
//                    Thread.sleep(sleepTime); // 等待一段时间后再检查
//                } catch (NoSuchElementException e) {
//                    targetElement = null; // 如果找不到特定元素,则退出循环
//                }
//            } while (targetElement != null);

            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            page.setDownloadSuccess(true);
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            onSuccess(request, task);
        } catch (Exception e) {
            log.warn("download page {} error", request.getUrl(), e);
            onError(request, task, e);
        } finally {
            if (webDriver != null) {
                webDriver.close();
                webDriver.quit();
                webDriver = null;
            }
        }
        return page;
    }

    @Override
    public void setThread(int i) {

    }
}