package com.canrd.webmagic.processor; import com.canrd.webmagic.processor.config.Agent; import com.canrd.webmagic.processor.download.SeleniumDownloader; import com.canrd.webmagic.processor.pipeline.ArticlePipeline; import lombok.extern.slf4j.Slf4j; import org.apache.logging.log4j.core.util.UuidUtil; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.XpathSelector; import javax.annotation.Resource; import java.util.List; /** * https://www.univie.ac.at/suche/?q=battery * * @author: xms * @description: TODO * @date: 2024/4/1 14:19 * @version: 1.0 */ @Slf4j @Component public class UnivieSearchPageProcessor implements PageProcessor { @Resource private Univie4PhysnanoArticlePageProcessor univie4PhysnanoArticlePageProcessor; @Resource private SeleniumDownloader seleniumDownloader; @Resource private ArticlePipeline articlePipeline; /** * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 */ private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); /** * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 * * @param page */ @Override public void process(Page page) { doSearch(page); } /** * @param page */ private void doSearch(Page page) { String url = page.getUrl().get(); /** * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 * 1、通过$或css()方法获取到该page html下某元素dom */ Selectable selectable = page.getHtml().xpath("//div[@class='univie-search']").xpath("//div[@class='content-element-margin-small yacy-result']"); List<Selectable> nodes = selectable.nodes(); /** * 获取到指定的dom后,从这些dom中提取元素内容。 */ for (int i = 0; i <= nodes.size() - 1; i++) { String link = nodes.get(i).links().get(); String title = nodes.get(i).xpath("//a/h2/text()").get(); if (link.contains(".pdf") || link.contains(".docx")) { continue; } if (link.contains("physnano.univie.ac.at/publications/publication-detail/pure")) { log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); Spider.create(univie4PhysnanoArticlePageProcessor) .addUrl(link) .addPipeline(articlePipeline) // .setDownloader(seleniumDownloader) .setUUID(UuidUtil.getTimeBasedUuid().toString()) // 开启5个线程执行,并开始爬取 .thread(1).run(); } } } @Override public Site getSite() { return site; } }