package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * @author: xms
 * @description: Page processor that extracts one article (title, description, publish time,
 * author, address and contact details) from an article detail page and publishes it
 * under the {@code "article"} result field as an {@link ArticleDO}.
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Univie4PhysnanoArticlePageProcessor implements PageProcessor {

    /** XPath of the page title. The leading blank inside the class value is intentional. */
    private static final String TITLE_XPATH = "//h1[@class=' content-element-margin']/text()";

    /** XPath of the definition list holding the article metadata. */
    private static final String ROW_XPATH = "//dl[@class='row']";

    /**
     * Variant of {@link #ROW_XPATH} used for the author/address lookup.
     * NOTE(review): the trailing slash is kept exactly as it was; confirm whether it is intended.
     */
    private static final String ROW_XPATH_FOR_AUTHOR = "//dl[@class='row']/";

    /** XPath of the metadata cells that carry the "content-element-margin-small" class. */
    private static final String MARGIN_SMALL_CELL_XPATH = "//dd[@class='col-sm-9 content-element-margin-small']";

    /** XPath of the plain metadata cells. */
    private static final String PLAIN_CELL_XPATH = "//dd[@class='col-sm-9']";

    /** XPath of the text of a metadata cell. */
    private static final String CELL_TEXT_XPATH = "//dd/text()";

    /** XPath of the contact block text in the sidebar. */
    private static final String CONTACT_TEXT_XPATH =
            "//div[@class='col-md-3 sidebar content-element-margin']/aside/address/text()";

    /** XPath of the contact link (outer HTML) in the sidebar. */
    private static final String CONTACT_LINK_XPATH =
            "//div[@class='col-md-3 sidebar content-element-margin']/aside/address/a";

    /**
     * Pattern used to parse the publish time.
     * NOTE(review): "dd-yyyy" reads the first number as a day of month; if the page shows
     * month-year values this should probably be "MM-yyyy" - confirm against a real page.
     */
    private static final String PUBLISH_TIME_PATTERN = "dd-yyyy";

    // Site-level crawl configuration: retry count, sleep time and user agent.
    // NOTE(review): webmagic's sleep time is presumably in milliseconds, so 5 is a very short
    // pause between requests - confirm this is intended.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Entry point called by the crawler for every downloaded page; delegates to the
     * article extraction logic.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Extracts the article fields from the page and stores the resulting {@link ArticleDO}
     * in the page's {@code "article"} field.
     *
     * <p>Extraction is best-effort: a missing node or an unparsable value leaves the
     * corresponding field null/empty instead of aborting the whole page.
     *
     * @param page the downloaded page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        String title = html.xpath(TITLE_XPATH).get();

        List<Selectable> descriptionCells = html.xpath(ROW_XPATH).xpath(MARGIN_SMALL_CELL_XPATH).nodes();
        String articleDesc = textAt(descriptionCells, 1, "//p/text()");

        // The publish time is expected in the 11th plain cell, with the 10th as a fallback.
        List<Selectable> plainCells = html.xpath(ROW_XPATH).xpath(PLAIN_CELL_XPATH).nodes();
        SimpleDateFormat formatter = new SimpleDateFormat(PUBLISH_TIME_PATTERN, Locale.ENGLISH);
        String publishTime = textAt(plainCells, 10, CELL_TEXT_XPATH);
        Date publishTimeDateTime = parsePublishTime(formatter, publishTime, articleCode);
        if (publishTimeDateTime == null) {
            String fallbackPublishTime = textAt(plainCells, 9, CELL_TEXT_XPATH);
            if (fallbackPublishTime != null) {
                // Keep the fallback text as the raw value even if it does not parse either.
                publishTime = fallbackPublishTime;
                publishTimeDateTime = parsePublishTime(formatter, fallbackPublishTime, articleCode);
            }
        }

        List<Selectable> authorCells = html.xpath(ROW_XPATH_FOR_AUTHOR).xpath(MARGIN_SMALL_CELL_XPATH).nodes();
        // Missing values become "" rather than the literal text "null".
        String authorName = nullToEmpty(textAt(authorCells, 0, CELL_TEXT_XPATH));
        String authorAddress = nullToEmpty(textAt(authorCells, 3, CELL_TEXT_XPATH));

        JSONArray authorEmail = new JSONArray();
        authorEmail.add(extractContact(html));

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}",
                articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.UNIVIE_PHYSNANO.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                // Fall back to the raw page text when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime)
                        ? publishTime
                        : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(authorAddress)
                .referenceInfo(null)
                .build());
    }

    /**
     * Builds the contact JSON object from the sidebar: {@code authorEmailName} and
     * {@code telephone} come from the contact text (split on ':'), {@code email} from the
     * contact link. Keys whose source is missing are simply left out.
     *
     * @param html the parsed page
     * @return the contact object, possibly empty, never null
     */
    private JSONObject extractContact(Html html) {
        JSONObject contact = new JSONObject();

        String contactText = html.xpath(CONTACT_TEXT_XPATH).get();
        if (StringUtils.isNotBlank(contactText)) {
            // split() drops trailing empty strings, so both indexes must be guarded.
            String[] contactParts = contactText.split(":");
            if (contactParts.length > 0) {
                contact.put("authorEmailName", contactParts[0]);
            }
            if (contactParts.length > 1) {
                contact.put("telephone", contactParts[1]);
            }
        }

        String linkHtml = html.xpath(CONTACT_LINK_XPATH).get();
        if (linkHtml != null) {
            // Strip the inner markup so that only the opening <a ...> tag precedes the address.
            String stripped = linkHtml.replace("<span>", "")
                    .replace("</span>", "")
                    .replace("</a>", "");
            if (StringUtils.isNotBlank(stripped)) {
                String[] linkParts = stripped.split(">");
                if (linkParts.length > 1) {
                    contact.put("email", linkParts[1]);
                }
            }
        }
        return contact;
    }

    /**
     * Returns the first match of {@code xpath} inside the node at {@code index}.
     *
     * @param nodes candidate nodes, may be null
     * @param index position of the wanted node
     * @param xpath expression evaluated on that node
     * @return the matched text, or null when the node does not exist or nothing matches
     */
    private static String textAt(List<Selectable> nodes, int index, String xpath) {
        if (nodes == null || index < 0 || index >= nodes.size()) {
            return null;
        }
        return nodes.get(index).xpath(xpath).get();
    }

    /**
     * Parses the trimmed raw publish time with the given formatter.
     *
     * @param formatter   formatter to use
     * @param raw         raw text taken from the page, may be null or blank
     * @param articleCode article URL, used only for logging
     * @return the parsed date, or null when the text is blank or does not match the pattern
     */
    private static Date parsePublishTime(SimpleDateFormat formatter, String raw, String articleCode) {
        if (!StringUtils.isNotBlank(raw)) {
            return null;
        }
        try {
            return formatter.parse(raw.trim());
        } catch (ParseException e) {
            log.warn("Unparsable publish time '{}' for article {}", raw, articleCode);
            return null;
        }
    }

    /**
     * Converts null to the empty string.
     *
     * @param value value to normalise
     * @return {@code value}, or "" when it is null
     */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /**
     * @return the site configuration used by the crawler
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Replaces the site configuration.
     *
     * @param site the new site configuration
     */
    public void setSite(Site site) {
        this.site = site;
    }
}