package com.canrd.webmagic.processor; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; import com.canrd.webmagic.common.utils.StringUtils; import com.canrd.webmagic.domain.ArticleTypeEnum; import com.canrd.webmagic.domain.dto.ArticleDO; import com.canrd.webmagic.processor.config.Agent; import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.XpathSelector; import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; /** * @author: xms * @description: TODO * @date: 2024/4/1 14:19 * @version: 1.0 */ @Slf4j @Component public class NatureSearchPageProcessor implements PageProcessor { private String agent = Agent.getRandom(); // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(100); /** * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 * * @param page */ @Override public void process(Page page) { if (page.getUrl().get().contains("search")) { doArticleList(page); } else if (page.getUrl().get().contains("research-articles")) { doArticleList4ReSearch(page); } else { doArticleContent(page); } } /** * @param page */ private void doArticleList4ReSearch(Page page) { String url = page.getUrl().get(); String[] split = url.split("="); Integer pageIndex = Integer.parseInt(split[split.length - 1]); /** * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 * 1、通过$或css()方法获取到该page html下某元素dom */ Selectable selectable = page.getHtml().$(".app-article-list-row").select( new XpathSelector("li[@class='app-article-list-row__item']") ); List<Selectable> nodes = selectable.nodes(); /** * 获取到指定的dom后,从这些dom中提取元素内容。 */ for (int i = 1; i <= nodes.size() - 1; i++) { Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); String link = node.$("a", "href").get(); page.addTargetRequest(link); String link1 = node.links().get(); String title = node.$("a", "text").get(); System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); } } private void doArticleContent(Page page) { if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) { return; } //解析页面 Html html = page.getHtml(); String[] urlArr = page.getUrl().get().split("/"); String articleCode = urlArr[urlArr.length - 1]; Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); String title = headSelectable.xpath("//div/h1/text()").get(); if (StringUtils.isBlank(title)) { title = headSelectable.xpath("//h1/text()").get(); } String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); List<Selectable> authorNodes = authorSelectable.nodes(); StringBuffer authorName = new StringBuffer(); for (Selectable node : authorNodes) { authorName.append(node.xpath("//a/text()")); } JSONArray authorAddress = new JSONArray(); List<Selectable> authorAddressList = authorAddressSelectable.nodes(); if (CollectionUtils.isNotEmpty(authorAddressList)) { for (Selectable selectable : authorAddressList) { String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); JSONObject object = new JSONObject(); object.put("address", address); object.put("authorNames", authorNames); authorAddress.add(object); } } JSONArray references = new JSONArray(); List<Selectable> referenceList = referencesSelectable.nodes(); if (CollectionUtils.isNotEmpty(referenceList)) { for (Selectable reference : referenceList) { String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); List<String> links = new ArrayList<>(); if (CollectionUtils.isNotEmpty(referenceLinks)) { links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); } JSONObject object = new JSONObject(); object.put("referenceTitle", referenceTitle); object.put("links", links); if (CollectionUtils.isNotEmpty(links)) { page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); } references.add(object); } } JSONArray authorEmail = new JSONArray(); for (Selectable authorEmailSelectable : authorEmailSelectables) { String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); String email = Objects.isNull(split) ? "" : split[split.length - 1]; String authorEmailName = authorEmailSelectable.xpath("//a/text()").get(); JSONObject jsonObject = new JSONObject(); jsonObject.put("authorEmailName", authorEmailName); jsonObject.put("email", email); authorEmail.add(jsonObject); } System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString()); page.putField("article", ArticleDO.builder() .articleType(ArticleTypeEnum.NATURE.getType()) .articleCode(articleCode) .authorName(authorName.toString()) .title(title) .publishTime(publishTime) .emailInfo(authorEmail.toJSONString()) .articleDesc(articleDesc) .authorAddress(authorAddress.toJSONString()) .referenceInfo(references.toJSONString()).build()); } private void doArticleList(Page page) { String url = page.getUrl().get(); String[] split = url.split("="); Integer pageIndex = Integer.parseInt(split[split.length - 1]); /** * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 * 1、通过$或css()方法获取到该page html下某元素dom */ Selectable selectable = page.getHtml().$(".app-article-list-row").select( new XpathSelector("li[@class='app-article-list-row__item']") ); List<Selectable> nodes = selectable.nodes(); /** * 获取到指定的dom后,从这些dom中提取元素内容。 */ for (int i = 1; i <= nodes.size() - 1; i++) { Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); String link = node.$("a", "href").get(); page.addTargetRequest(link); String link1 = node.links().get(); String title = node.$("a", "text").get(); System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); } } @Override public Site getSite() { return site; } public static void main(String[] args) { // 创建一个Spider,并把我们的处理器放进去 Spider.create(new NatureSearchPageProcessor()) // 添加这个Spider要爬取的网页地址 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") .addPipeline(new NatureArticlePipeline()) // 开启5个线程执行,并开始爬取 .thread(5).run(); } }