Commit f85930c5880cdba04f82dcea44d2407d807fe2d8

Authored by 谢茂盛
1 parent 98eb2cc8

feat:

1、science-spj 爬取
src/main/java/com/canrd/webmagic/controller/Science4SpjController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.utils.KeywordUtil;
  5 +import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
  6 +import com.canrd.webmagic.processor.Science4SpjSearchPageProcessor;
  7 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  8 +import org.apache.logging.log4j.core.util.UuidUtil;
  9 +import org.springframework.web.bind.annotation.GetMapping;
  10 +import org.springframework.web.bind.annotation.RequestMapping;
  11 +import org.springframework.web.bind.annotation.RequestParam;
  12 +import org.springframework.web.bind.annotation.RestController;
  13 +import us.codecraft.webmagic.Spider;
  14 +
  15 +import javax.annotation.Resource;
  16 +
  17 +/**
  18 + * nature-文章信息(NatureArticle)表控制层
  19 + *
  20 + * @author makejava
  21 + * @since 2024-04-07 18:39:41
  22 + */
  23 +@RestController
  24 +@RequestMapping("/science/spj")
  25 +public class Science4SpjController {
  26 +
  27 + @Resource
  28 + private Science4SpjSearchPageProcessor science4SpjSearchPageProcessor;
  29 +
  30 + @Resource
  31 + private SeleniumDownloader seleniumDownloader;
  32 +
  33 + /**
  34 + * @return
  35 + */
  36 + @GetMapping("/start")
  37 + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
  38 + for (int i = 0; i <= indexSize; i++) {
  39 + Spider.create(science4SpjSearchPageProcessor)
  40 + .addUrl("https://www.science.org/journal/science/insights?startPage=" + i)
  41 + // 开启5个线程执行,并开始爬取
  42 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  43 + .setDownloader(seleniumDownloader)
  44 + .thread(5).run();
  45 + }
  46 +
  47 + return ServerResult.success();
  48 + }
  49 +
  50 +
  51 + /**
  52 + * @return
  53 + */
  54 + @GetMapping("/search")
  55 + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize) {
  56 + for (int i = 0; i <= indexSize; i++) {
  57 + for (String keyword : KeywordUtil.getKeyWordList()) {
  58 + Spider.create(science4SpjSearchPageProcessor)
  59 + .addUrl("https://spj.science.org/action/doSearch?AllField=" + keyword + "&pageSize=20&startPage=" + i)
  60 + // 开启5个线程执行,并开始爬取
  61 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  62 + .setDownloader(seleniumDownloader)
  63 + .thread(5).run();
  64 + }
  65 + }
  66 +
  67 + return ServerResult.success();
  68 + }
  69 +
  70 +}
  71 +
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -16,6 +16,7 @@ import lombok.NoArgsConstructor; @@ -16,6 +16,7 @@ import lombok.NoArgsConstructor;
16 public enum ArticleTypeEnum { 16 public enum ArticleTypeEnum {
17 NATURE("nature", "nature网址"), 17 NATURE("nature", "nature网址"),
18 SCIENCE("science", "science网址"), 18 SCIENCE("science", "science网址"),
  19 + SCIENCE_SPJ("science-spj", "science网址-spj"),
19 ; 20 ;
20 private String type; 21 private String type;
21 private String desc; 22 private String desc;
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
@@ -77,7 +77,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { @@ -77,7 +77,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
77 */ 77 */
78 for (int i = 0; i <= nodes.size() - 1; i++) { 78 for (int i = 0; i <= nodes.size() - 1; i++) {
79 String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get(); 79 String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
80 - String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get(); 80 + String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").xpath("//time/text()").get();
81 String link = nodes.get(i).links().get(); 81 String link = nodes.get(i).links().get();
82 SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); 82 SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
83 try { 83 try {
src/main/java/com/canrd/webmagic/processor/Science4SpjArticlePageProcessor.java 0 → 100644
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * Extracts a single spj.science.org article page (title, abstract, publish date,
 * authors, author e-mails) into an {@link ArticleDO} tagged {@code SCIENCE_SPJ}.
 *
 * @author: xms
 * @description: parses SPJ article detail pages
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4SpjArticlePageProcessor implements PageProcessor {

    // Crawler configuration: retries, delay between requests, randomized user agent.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Core extraction hook invoked by WebMagic for every downloaded page.
     *
     * @param page downloaded article page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Parses the article DOM and publishes an {@link ArticleDO} into the page
     * result fields for the pipeline to persist.
     *
     * @param page downloaded article page
     */
    private void doArticleContent(Page page) {

        // Parse the page; the article URL doubles as the article's unique code.
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']");
        Selectable headSelectable = articleSelectable.xpath("//header/div");

        // Title: prefer the lede block, fall back to the h1[property=name] heading.
        String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get();
        if (StringUtils.isBlank(title)) {
            title = headSelectable.xpath("//h1[@property='name']/text()").get();
        }

        // Abstract/description: two possible layouts, try both.
        String articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get();
        if (StringUtils.isBlank(articleDesc)) {
            articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get();
        }

        // Publish date, e.g. "12 April 2024". Guard against a missing node:
        // SimpleDateFormat.parse(null) would throw NPE, not ParseException.
        String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get();
        Date publishTimeDateTime = null;
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
        if (StringUtils.isNotBlank(publishTime)) {
            try {
                publishTimeDateTime = formatter.parse(publishTime);
            } catch (ParseException e) {
                log.warn("无法解析发布时间:{},文章链接:{}", publishTime, articleCode, e);
            }
        }

        // Single pass over the author nodes collects both the display names and
        // the (name, email) pairs — the original data lives on the same nodes.
        List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes();
        StringBuffer authorName = new StringBuffer();
        JSONArray authorEmail = new JSONArray();
        for (Selectable node : authorNodes) {
            String givenName = node.xpath("//span[@property='givenName']/text()").get();
            String familyName = node.xpath("//span[@property='familyName']/text()").get();
            if (!(StringUtils.isBlank(givenName) && StringUtils.isBlank(familyName))) {
                authorName.append(givenName).append(" ").append(familyName).append(",");
            }

            String email = node.xpath("//a[@property='email']/text()").get();
            if (StringUtils.isBlank(email)) {
                continue;
            }
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("authorEmailName", givenName + " " + familyName);
            jsonObject.put("email", email);
            authorEmail.add(jsonObject);
        }

        // Five placeholders require five arguments (authorName was missing originally).
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE_SPJ.getType())
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                // Fall back to the raw string when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null).build());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public void setSite(Site site) {
        this.site = site;
    }

    public static void main(String[] args) {
        // Standalone harness: create a Spider with this processor and crawl one URL.
        Spider.create(new Science4SpjArticlePageProcessor())
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // 5 worker threads; run() blocks until done.
                .thread(5).run();
    }
}
0 \ No newline at end of file 140 \ No newline at end of file
package com.canrd.webmagic.processor;

import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import javax.annotation.Resource;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;

/**
 * Processes spj.science.org listing pages and dispatches each qualifying article
 * link to {@link Science4SpjArticlePageProcessor}.
 *
 * <p>Example search URL:
 * https://spj.science.org/action/doSearch?AllField=Nickel+foam&pageSize=20&startPage=0
 *
 * @author: xms
 * @description: SPJ search/listing page processor
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4SpjSearchPageProcessor implements PageProcessor {

    @Resource
    private Science4SpjArticlePageProcessor science4SpjArticlePageProcessor;

    @Resource
    private SeleniumDownloader seleniumDownloader;

    @Resource
    private ArticlePipeline articlePipeline;

    /**
     * Crawler configuration: retries, delay between requests, randomized user agent.
     */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Core extraction hook: search-result URLs ("doSearch") take every article,
     * plain listing URLs are additionally filtered by keyword.
     *
     * @param page downloaded listing page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().get().contains("doSearch")) {
            doSearch(page);
        } else {
            doArticleList(page);
        }
    }

    /**
     * Handles a keyword-search result page: every article card is crawled
     * (the search itself already applied the keyword filter).
     *
     * @param page downloaded search-result page
     */
    private void doSearch(Page page) {
        String listUrl = page.getUrl().get();
        for (Selectable node : selectArticleCards(page)) {
            String title = extractTitle(node);
            String time = extractTime(node);
            String link = node.links().get();
            crawlArticleIfRecent(listUrl, title, time, link);
        }
    }

    /**
     * Handles a journal listing page: only articles whose title matches a
     * configured keyword are crawled.
     *
     * @param page downloaded listing page
     */
    private void doArticleList(Page page) {
        String listUrl = page.getUrl().get();
        for (Selectable node : selectArticleCards(page)) {
            String title = extractTitle(node);
            String time = extractTime(node);
            String link = node.links().get();
            if (KeywordUtil.containKeywordsInTitle(title)) {
                crawlArticleIfRecent(listUrl, title, time, link);
            }
        }
    }

    /**
     * Selects the article "card" nodes shared by search and listing layouts.
     */
    private List<Selectable> selectArticleCards(Page page) {
        return page.getHtml()
                .xpath("//div[@class=' search-result__body titles-results ']")
                .select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"))
                .nodes();
    }

    /**
     * Extracts the article title from one card node.
     */
    private String extractTitle(Selectable node) {
        return node.xpath("//div[@class='card pb-3 mb-4 border-bottom']/div")
                .xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
    }

    /**
     * Extracts the publish-time text (e.g. "12 April 2024") from one card node.
     */
    private String extractTime(Selectable node) {
        return node.xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span")
                .xpath("//time/text()").get();
    }

    /**
     * Parses the publish time and, when it is on or after 2000-01-01, launches a
     * single-threaded Spider to crawl the article detail page. {@code run()}
     * blocks, so articles are crawled one at a time.
     *
     * @param listUrl listing page the article was found on (for logging)
     * @param title   article title
     * @param time    publish-time text, "dd MMMM yyyy"
     * @param link    article detail URL
     */
    private void crawlArticleIfRecent(String listUrl, String title, String time, String link) {
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
        try {
            Date publishTimeDateTime = formatter.parse(time);
            if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
                Spider.create(science4SpjArticlePageProcessor)
                        .addUrl(link)
                        .addPipeline(articlePipeline)
                        .setDownloader(seleniumDownloader)
                        .setUUID(UuidUtil.getTimeBasedUuid().toString())
                        // Single worker thread; run() blocks until the article is crawled.
                        .thread(1).run();
                log.info("关键字文章列表链接:{},标题:{},文章链接:{}", listUrl, title, link);
            }
        } catch (ParseException e) {
            log.warn("无法解析发布时间:{},文章链接:{}", time, link, e);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Standalone harness: create a Spider with this processor and crawl one URL.
        Spider.create(new Science4SpjSearchPageProcessor())
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // 5 worker threads; run() blocks until done.
                .thread(5).run();
    }
}
0 \ No newline at end of file 161 \ No newline at end of file