Commit 7182dcd56e75cb672cccf02544806fff6ec0dbc9

Authored by 谢茂盛
1 parent 7b1216da

feat:

1、univie-physnano 爬取
src/main/java/com/canrd/webmagic/controller/UnivieController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.utils.KeywordUtil;
  5 +import com.canrd.webmagic.processor.Science4SpjSearchPageProcessor;
  6 +import com.canrd.webmagic.processor.UnivieSearchPageProcessor;
  7 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  8 +import org.apache.logging.log4j.core.util.UuidUtil;
  9 +import org.springframework.web.bind.annotation.GetMapping;
  10 +import org.springframework.web.bind.annotation.RequestMapping;
  11 +import org.springframework.web.bind.annotation.RequestParam;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +
  15 +import javax.annotation.Resource;
  16 +
  17 +/**
  18 + * nature-文章信息(NatureArticle)表控制层
  19 + *
  20 + * @author makejava
  21 + * @since 2024-04-07 18:39:41
  22 + */
  23 +@RestController
  24 +@RequestMapping("/univie/ac")
  25 +public class UnivieController {
  26 +
  27 + @Resource
  28 + private UnivieSearchPageProcessor univieSearchPageProcessor;
  29 +
  30 + @Resource
  31 + private SeleniumDownloader seleniumDownloader;
  32 +
  33 + /**
  34 + * @return
  35 + */
  36 + @GetMapping("/start")
  37 + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
  38 + for (int i = 0; i <= indexSize; i++) {
  39 + Spider.create(univieSearchPageProcessor)
  40 + .addUrl("https://www.univie.ac.at/suche/?&currentPage=" + i)
  41 + // 开启5个线程执行,并开始爬取
  42 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  43 +// .setDownloader(seleniumDownloader)
  44 + .thread(5).run();
  45 + }
  46 +
  47 + return ServerResult.success();
  48 + }
  49 +
  50 +
  51 + /**
  52 + * @return
  53 + */
  54 + @GetMapping("/search")
  55 + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize) {
  56 + for (int i = 1; i <= indexSize; i++) {
  57 + for (String keyword : KeywordUtil.getKeyWordList()) {
  58 + Spider.create(univieSearchPageProcessor)
  59 + .addUrl("https://www.univie.ac.at/suche/?q=" + keyword + "&currentPage=" + i)
  60 + // 开启5个线程执行,并开始爬取
  61 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  62 +// .setDownloader(seleniumDownloader)
  63 + .thread(5).run();
  64 + }
  65 + }
  66 +
  67 + return ServerResult.success();
  68 + }
  69 +
  70 +}
  71 +
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -17,6 +17,7 @@ public enum ArticleTypeEnum { @@ -17,6 +17,7 @@ public enum ArticleTypeEnum {
17 NATURE("nature", "nature网址"), 17 NATURE("nature", "nature网址"),
18 SCIENCE("science", "science网址"), 18 SCIENCE("science", "science网址"),
19 SCIENCE_SPJ("science-spj", "science网址-spj"), 19 SCIENCE_SPJ("science-spj", "science网址-spj"),
  20 + UNIVIE_PHYSNANO("univie-physnano", "univie网址-physnano"),
20 ; 21 ;
21 private String type; 22 private String type;
22 private String desc; 23 private String desc;
src/main/java/com/canrd/webmagic/processor/Univie4PhysnanoArticlePageProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.canrd.webmagic.common.utils.DateUtil;
  6 +import com.canrd.webmagic.common.utils.StringUtils;
  7 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  8 +import com.canrd.webmagic.domain.dto.ArticleDO;
  9 +import com.canrd.webmagic.processor.config.Agent;
  10 +import lombok.extern.slf4j.Slf4j;
  11 +import org.springframework.stereotype.Component;
  12 +import us.codecraft.webmagic.Page;
  13 +import us.codecraft.webmagic.Site;
  14 +import us.codecraft.webmagic.processor.PageProcessor;
  15 +import us.codecraft.webmagic.selector.Html;
  16 +import us.codecraft.webmagic.selector.Selectable;
  17 +
  18 +import java.text.ParseException;
  19 +import java.text.SimpleDateFormat;
  20 +import java.util.Date;
  21 +import java.util.Locale;
  22 +import java.util.Objects;
  23 +
  24 +/**
  25 + * @author: xms
  26 + * @description: TODO
  27 + * @date: 2024/4/1 14:19
  28 + * @version: 1.0
  29 + */
  30 +@Slf4j
  31 +@Component
  32 +public class Univie4PhysnanoArticlePageProcessor implements PageProcessor {
  33 +
  34 + // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
  35 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
  36 +
  37 + /**
  38 + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
  39 + *
  40 + * @param page
  41 + */
  42 + @Override
  43 + public void process(Page page) {
  44 + doArticleContent(page);
  45 + }
  46 +
  47 + /**
  48 + * @param page
  49 + */
  50 + private void doArticleContent(Page page) {
  51 +
  52 + //解析页面
  53 + Html html = page.getHtml();
  54 + String articleCode = page.getUrl().get();
  55 +
  56 + String title = html.xpath("//h1[@class=' content-element-margin']/text()").get();
  57 +
  58 + String articleDesc = html.xpath("//dl[@class='row']").xpath("//dd[@class='col-sm-9 content-element-margin-small']").nodes().get(1).xpath("//p/text()").get();
  59 +
  60 +
  61 + Date publishTimeDateTime = null;
  62 + SimpleDateFormat formatter = new SimpleDateFormat("dd-yyyy", Locale.ENGLISH);
  63 + String publishTime = html.xpath("//dl[@class='row']").xpath("//dd[@class='col-sm-9']").nodes().get(10).xpath("//dd/text()").get();
  64 + try {
  65 +
  66 + publishTimeDateTime = formatter.parse(publishTime.trim());
  67 + } catch (ParseException e) {
  68 + try {
  69 + publishTime = html.xpath("//dl[@class='row']").xpath("//dd[@class='col-sm-9']").nodes().get(9).xpath("//dd/text()").get();
  70 + publishTimeDateTime = formatter.parse(publishTime.trim());
  71 + }catch (Exception e1) {
  72 +
  73 + }
  74 +
  75 + }
  76 + StringBuffer authorName = new StringBuffer();
  77 + authorName.append(html.xpath("//dl[@class='row']/").xpath("//dd[@class='col-sm-9 content-element-margin-small']").nodes().get(0).xpath("//dd/text()").get());
  78 +
  79 + StringBuffer authorAddress = new StringBuffer();
  80 + authorAddress.append(html.xpath("//dl[@class='row']/").xpath("//dd[@class='col-sm-9 content-element-margin-small']").nodes().get(3).xpath("//dd/text()").get());
  81 +
  82 + JSONArray authorEmail = new JSONArray();
  83 + String contractStr = html.xpath("//div[@class='col-md-3 sidebar content-element-margin']/aside/address/text()").get();
  84 + JSONObject jsonObject = new JSONObject();
  85 +
  86 + if (StringUtils.isNotBlank(contractStr)) {
  87 + String authorEmailName = contractStr.split(":")[0];
  88 + String telephone = contractStr.split(":")[1];
  89 + jsonObject.put("authorEmailName", authorEmailName);
  90 + jsonObject.put("telephone", telephone);
  91 + }
  92 + String splitStr = html.xpath("//div[@class='col-md-3 sidebar content-element-margin']/aside/address/a").get().replaceAll("<span>","")
  93 + .replaceAll("</span>","").replaceAll("</a>","");
  94 + if (StringUtils.isNotBlank(splitStr)) {
  95 + String email = splitStr.split(">")[1];
  96 + jsonObject.put("email", email);
  97 + }
  98 +
  99 + authorEmail.add(jsonObject);
  100 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  101 +
  102 + page.putField("article", ArticleDO.builder()
  103 + .articleType(ArticleTypeEnum.UNIVIE_PHYSNANO.getType())
  104 + .articleCode(articleCode)
  105 + .authorName(authorName.toString())
  106 + .title(title)
  107 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  108 + .emailInfo(authorEmail.toJSONString())
  109 + .articleDesc(articleDesc)
  110 + .authorAddress(authorAddress.toString())
  111 + .referenceInfo(null).build());
  112 + }
  113 +
  114 + @Override
  115 + public Site getSite() {
  116 + return site;
  117 + }
  118 +
  119 + public void setSite(Site site) {
  120 + this.site = site;
  121 + }
  122 +}
0 \ No newline at end of file 123 \ No newline at end of file
src/main/java/com/canrd/webmagic/processor/UnivieSearchPageProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.canrd.webmagic.processor.config.Agent;
  4 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  5 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  6 +import lombok.extern.slf4j.Slf4j;
  7 +import org.apache.logging.log4j.core.util.UuidUtil;
  8 +import org.springframework.stereotype.Component;
  9 +import us.codecraft.webmagic.Page;
  10 +import us.codecraft.webmagic.Site;
  11 +import us.codecraft.webmagic.Spider;
  12 +import us.codecraft.webmagic.processor.PageProcessor;
  13 +import us.codecraft.webmagic.selector.Selectable;
  14 +import us.codecraft.webmagic.selector.XpathSelector;
  15 +
  16 +import javax.annotation.Resource;
  17 +import java.util.List;
  18 +
  19 +/**
  20 + * https://www.univie.ac.at/suche/?q=battery
  21 + *
  22 + * @author: xms
  23 + * @description: TODO
  24 + * @date: 2024/4/1 14:19
  25 + * @version: 1.0
  26 + */
  27 +@Slf4j
  28 +@Component
  29 +public class UnivieSearchPageProcessor implements PageProcessor {
  30 +
  31 + @Resource
  32 + private Univie4PhysnanoArticlePageProcessor univie4PhysnanoArticlePageProcessor;
  33 +
  34 + @Resource
  35 + private SeleniumDownloader seleniumDownloader;
  36 +
  37 + @Resource
  38 + private ArticlePipeline articlePipeline;
  39 +
  40 + /**
  41 + * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
  42 + */
  43 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
  44 +
  45 + /**
  46 + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
  47 + *
  48 + * @param page
  49 + */
  50 + @Override
  51 + public void process(Page page) {
  52 + doSearch(page);
  53 + }
  54 +
  55 + /**
  56 + * @param page
  57 + */
  58 + private void doSearch(Page page) {
  59 + String url = page.getUrl().get();
  60 + /**
  61 + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
  62 + * 1、通过$或css()方法获取到该page html下某元素dom
  63 + */
  64 + Selectable selectable = page.getHtml().xpath("//div[@class='univie-search']").xpath("//div[@class='content-element-margin-small yacy-result']");
  65 + List<Selectable> nodes = selectable.nodes();
  66 +
  67 + /**
  68 + * 获取到指定的dom后,从这些dom中提取元素内容。
  69 + */
  70 + for (int i = 0; i <= nodes.size() - 1; i++) {
  71 + String link = nodes.get(i).links().get();
  72 + String title = nodes.get(i).xpath("//a/h2/text()").get();
  73 + if (link.contains(".pdf") || link.contains(".docx")) {
  74 + continue;
  75 + }
  76 + if (link.contains("physnano.univie.ac.at/publications/publication-detail/pure")) {
  77 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  78 + Spider.create(univie4PhysnanoArticlePageProcessor)
  79 + .addUrl(link)
  80 + .addPipeline(articlePipeline)
  81 +// .setDownloader(seleniumDownloader)
  82 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  83 + // 开启5个线程执行,并开始爬取
  84 + .thread(1).run();
  85 + }
  86 + }
  87 +
  88 + }
  89 +
  90 +
  91 + @Override
  92 + public Site getSite() {
  93 + return site;
  94 + }
  95 +}
0 \ No newline at end of file 96 \ No newline at end of file