package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * @author: xms
 * @description: Crawls Science journal article pages and extracts article metadata
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4JournalArticlePageProcessor implements PageProcessor {

    private String agent = Agent.getRandom();

    /**
     * Site configuration for the crawl: retry count, crawl interval, user agent, etc.
     */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());

    /**
     * Core entry point for the custom crawler logic; all extraction happens here.
     *
     * @param page the fetched page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Extracts the article metadata (title, abstract, publish date, authors, emails)
     * from an article page and hands it to the pipeline.
     *
     * @param page the fetched page
     */
    private void doArticleContent(Page page) {
        // Parse the page
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        String title = html.xpath("//div[@class='article-container']/article/header/div")
                .xpath("//div[@class='core-lede']/div/text()").get();
        String articleDesc = html.xpath("//div[@class='article-container']/article")
                .xpath("//section[@id='bodymatter']/div/div/text()").get();
        String publishTime = html.xpath("//div[@class='article-container']/article/header/div")
                .xpath("//div[@class='core-self-citation']")
                .xpath("//div[@class='core-date-published']/span/text()").get();

        // Publish dates are rendered like "01 April 2024", hence the English locale.
        Date publishTimeDateTime = null;
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
        try {
            publishTimeDateTime = formatter.parse(publishTime);
        } catch (ParseException e) {
            log.error("Failed to parse publish time [{}] for article {}", publishTime, articleCode, e);
        }

        // Collect the display names of all contributors.
        List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div")
                .xpath("//div[@class='contributors']/span/span/span").nodes();
        StringBuilder authorName = new StringBuilder();
        for (Selectable node : authorNodes) {
            authorName.append(node.xpath("//a/span/text()").get()).append(" ");
        }

        // Collect each author's name and email address.
        JSONArray authorEmail = new JSONArray();
        List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div")
                .xpath("//div[@class='contributors']")
                .xpath("//span[@class='authors']")
                .xpath("//span[@role='list']")
                .xpath("//span[@property='author']").nodes();
        for (Selectable authorEmailSelectable : authorEmailSelectables) {
            String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get();
            String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get();
            String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get();
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("authorEmailName", givenName + " " + familyName);
            jsonObject.put("email", email);
            authorEmail.add(jsonObject);
        }

        log.info("Article URL: {}, publish time: {}, title: {}, authors: {}, emails: {}",
                articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE.getType())
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null)
                .build());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Create a Spider with this processor,
        Spider.create(new Science4JournalArticlePageProcessor())
                // add the start URL it should crawl,
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // and start crawling with 5 worker threads.
                .thread(5).run();
    }
}