package com.canrd.webmagic.processor; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.canrd.webmagic.common.utils.DateUtil; import com.canrd.webmagic.common.utils.StringUtils; import com.canrd.webmagic.domain.ArticleTypeEnum; import com.canrd.webmagic.domain.dto.ArticleDO; import com.canrd.webmagic.processor.config.Agent; import com.canrd.webmagic.processor.pipeline.ArticlePipeline; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Objects; /** * @author: xms * @description: TODO * @date: 2024/4/1 14:19 * @version: 1.0 */ @Slf4j @Component public class Science4JournalArticlePageProcessor implements PageProcessor { // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); /** * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 * * @param page */ @Override public void process(Page page) { doArticleContent(page); } /** * @param page */ private void doArticleContent(Page page) { //解析页面 Html html = page.getHtml(); String articleCode = page.getUrl().get(); Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']"); Selectable headSelectable = articleSelectable.xpath("//header/div"); String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get(); if (StringUtils.isBlank(title)) { title = headSelectable.xpath("//h1[@property='name']/text()").get(); } String articleDesc = articleSelectable.xpath("//section[@id='bodymatter']/div/div/text()").get(); if (StringUtils.isBlank(articleDesc)) { articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get(); } String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get(); Date publishTimeDateTime = null; SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); try { publishTimeDateTime = formatter.parse(publishTime); } catch (ParseException e) { e.printStackTrace(); } List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes(); StringBuffer authorName = new StringBuffer(); for (Selectable node : authorNodes) { String giveName = node.xpath("//span[@property='givenName']/text()").get(); String familyName = node.xpath("//span[@property='familyName']/text()").get(); if (StringUtils.isBlank(giveName) && StringUtils.isBlank(familyName)) { continue; } authorName.append(giveName).append(" ").append(familyName).append(","); } JSONArray authorEmail = new JSONArray(); List<Selectable> authorEmailSelectables = headSelectable.xpath("//span[@property='author']").nodes(); for (Selectable authorEmailSelectable : authorEmailSelectables) { String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get(); if (StringUtils.isBlank(email)) { continue; } JSONObject jsonObject = new JSONObject(); jsonObject.put("authorEmailName", givenName + " " + familyName); jsonObject.put("email", email); authorEmail.add(jsonObject); } log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); page.putField("article", ArticleDO.builder() .articleType(ArticleTypeEnum.SCIENCE.getType()) .articleCode(articleCode) .authorName(authorName.toString()) .title(title) .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) .emailInfo(authorEmail.toJSONString()) .articleDesc(articleDesc) .authorAddress(null) .referenceInfo(null).build()); } @Override public Site getSite() { return site; } public void setSite(Site site) { this.site = site; } public static void main(String[] args) { // 创建一个Spider,并把我们的处理器放进去 Spider.create(new Science4JournalArticlePageProcessor()) // 添加这个Spider要爬取的网页地址 .addUrl("https://www.science.org/journal/science/insights?startPage=0") .addPipeline(new ArticlePipeline()) // 开启5个线程执行,并开始爬取 .thread(5).run(); } }