Commit a24924df1d1fad46ddee4053e191bf6ac4731396

Authored by 谢茂盛
1 parent 8f304c93

feat:

1、selenium 整合
2、science 网站
@@ -47,6 +47,7 @@ @@ -47,6 +47,7 @@
47 <jjwt.version>0.10.6</jjwt.version> 47 <jjwt.version>0.10.6</jjwt.version>
48 <easyexcel.version>2.2.3</easyexcel.version> 48 <easyexcel.version>2.2.3</easyexcel.version>
49 <webmagic.version>0.10.0</webmagic.version> 49 <webmagic.version>0.10.0</webmagic.version>
  50 + <selenium.version>3.4.0</selenium.version>
50 </properties> 51 </properties>
51 52
52 <dependencies> 53 <dependencies>
@@ -78,6 +79,12 @@ @@ -78,6 +79,12 @@
78 <version>${webmagic.version}</version> 79 <version>${webmagic.version}</version>
79 </dependency> 80 </dependency>
80 81
  82 + <!-- selenium -->
  83 + <dependency>
  84 + <groupId>org.seleniumhq.selenium</groupId>
  85 + <artifactId>selenium-java</artifactId>
  86 + <version>${selenium.version}</version>
  87 + </dependency>
81 88
82 <!-- Lombok 依赖--> 89 <!-- Lombok 依赖-->
83 <dependency> 90 <dependency>
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java 0 → 100644
  1 +package com.canrd.webmagic.config;
  2 +
  3 +import com.canrd.webmagic.processor.config.Agent;
  4 +import org.openqa.selenium.WebDriver;
  5 +import org.openqa.selenium.chrome.ChromeDriver;
  6 +import org.openqa.selenium.chrome.ChromeOptions;
  7 +import org.springframework.context.annotation.Bean;
  8 +import org.springframework.context.annotation.Configuration;
  9 +
  10 +/**
  11 + * @author: xms
  12 + * @description: TODO
  13 + * @date: 2024/4/26 14:37
  14 + * @version: 1.0
  15 + */
  16 +@Configuration
  17 +public class SeleniumConfig {
  18 +
  19 + @Bean
  20 + public WebDriver webDriver() {
  21 + System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
  22 + return new ChromeDriver();
  23 + }
  24 +}
src/main/java/com/canrd/webmagic/controller/ArticleController.java
@@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; @@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 import com.canrd.webmagic.processor.NatureSearchPageProcessor; 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
8 -import com.canrd.webmagic.processor.config.Downloader;  
9 -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; 8 +import com.canrd.webmagic.processor.download.Downloader;
  9 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
10 import com.canrd.webmagic.service.ArticleService; 10 import com.canrd.webmagic.service.ArticleService;
11 import org.springframework.validation.annotation.Validated; 11 import org.springframework.validation.annotation.Validated;
12 import org.springframework.web.bind.annotation.*; 12 import org.springframework.web.bind.annotation.*;
@@ -33,7 +33,7 @@ public class ArticleController { @@ -33,7 +33,7 @@ public class ArticleController {
33 private NatureSearchPageProcessor natureSearchPageProcessor; 33 private NatureSearchPageProcessor natureSearchPageProcessor;
34 34
35 @Resource 35 @Resource
36 - private NatureArticlePipeline articlePipeline; 36 + private ArticlePipeline articlePipeline;
37 37
38 @Resource 38 @Resource
39 private Downloader downloader; 39 private Downloader downloader;
@@ -49,7 +49,7 @@ public class ArticleController { @@ -49,7 +49,7 @@ public class ArticleController {
49 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) 49 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
50 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) 50 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
51 .addPipeline(articlePipeline) 51 .addPipeline(articlePipeline)
52 - .setDownloader(downloader.newIpDownloader()) 52 +// .setDownloader(downloader.newIpDownloader())
53 // 开启5个线程执行,并开始爬取 53 // 开启5个线程执行,并开始爬取
54 .thread(5).run(); 54 .thread(5).run();
55 } 55 }
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.jsr303.OperateGroup;
  5 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  6 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
  7 +import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
  8 +import com.canrd.webmagic.service.ArticleService;
  9 +import org.springframework.validation.annotation.Validated;
  10 +import org.springframework.web.bind.annotation.*;
  11 +import us.codecraft.webmagic.Spider;
  12 +
  13 +import javax.annotation.Resource;
  14 +
  15 +/**
  16 + * nature-文章信息(NatureArticle)表控制层
  17 + *
  18 + * @author makejava
  19 + * @since 2024-04-07 18:39:41
  20 + */
  21 +@RestController
  22 +@RequestMapping("/science/journal")
  23 +public class Science4JournalController {
  24 + /**
  25 + * 服务对象
  26 + */
  27 + @Resource
  28 + private ArticleService articleService;
  29 +
  30 + @Resource
  31 + private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor;
  32 +
  33 + /**
  34 + * @return
  35 + */
  36 + @GetMapping("/start")
  37 + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
  38 + for (int i = 0; i <= indexSize; i++) {
  39 + Spider.create(science4JournalSearchPageProcessor)
  40 + .addUrl("http://www.science.org/journal/science/insights?startPage=" + i)
  41 + // 开启5个线程执行,并开始爬取
  42 + .thread(5).run();
  43 + }
  44 +
  45 + return ServerResult.success();
  46 + }
  47 +
  48 + /**
  49 + * 分页查询
  50 + *
  51 + * @param natureArticleQueryVO 查询条件
  52 + * @return 查询结果
  53 + */
  54 + @PostMapping("/list")
  55 + public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) {
  56 + return articleService.list(natureArticleQueryVO);
  57 + }
  58 +
  59 + /**
  60 + * 通过主键查询单条数据
  61 + *
  62 + * @param natureArticleQueryVO 查询条件
  63 + * @return 单条数据
  64 + */
  65 + @PostMapping("/query_by_id")
  66 + public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) {
  67 + return articleService.queryById(natureArticleQueryVO);
  68 + }
  69 +
  70 + /**
  71 + * 新增数据
  72 + *
  73 + * @param natureArticleVO 数据VO
  74 + * @return 新增结果
  75 + */
  76 + @PostMapping("/add")
  77 + public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) {
  78 + return articleService.add(natureArticleVO);
  79 + }
  80 +
  81 + /**
  82 + * 编辑数据
  83 + *
  84 + * @param natureArticleVO 数据VO
  85 + * @return 编辑结果
  86 + */
  87 + @PostMapping("/edit")
  88 + public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) {
  89 + return articleService.edit(natureArticleVO);
  90 + }
  91 +
  92 + /**
  93 + * 删除数据
  94 + *
  95 + * @param natureArticleQueryVO 查询条件
  96 + * @return 删除是否成功
  97 + */
  98 + @PostMapping("/delete_by_id")
  99 + public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) {
  100 + return articleService.deleteById(natureArticleQueryVO);
  101 + }
  102 +
  103 +}
  104 +
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -15,6 +15,7 @@ import lombok.NoArgsConstructor; @@ -15,6 +15,7 @@ import lombok.NoArgsConstructor;
15 @NoArgsConstructor 15 @NoArgsConstructor
16 public enum ArticleTypeEnum { 16 public enum ArticleTypeEnum {
17 NATURE("nature", "nature网址"), 17 NATURE("nature", "nature网址"),
  18 + SCIENCE("science", "science网址"),
18 ; 19 ;
19 private String type; 20 private String type;
20 private String desc; 21 private String desc;
src/main/java/com/canrd/webmagic/job/NatureJob.java 0 → 100644
  1 +package com.canrd.webmagic.job;
  2 +
  3 +import com.canrd.webmagic.common.utils.KeywordUtil;
  4 +import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  5 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  6 +import org.springframework.scheduling.annotation.Scheduled;
  7 +import org.springframework.stereotype.Component;
  8 +import us.codecraft.webmagic.Spider;
  9 +
  10 +import javax.annotation.Resource;
  11 +
  12 +/**
  13 + * @author: xms
  14 + * @description: TODO
  15 + * @date: 2024/4/26 10:06
  16 + * @version: 1.0
  17 + */
  18 +@Component
  19 +public class NatureJob {
  20 +
  21 + @Resource
  22 + private NatureSearchPageProcessor natureSearchPageProcessor;
  23 +
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + /**
  28 + * 每天凌晨执行一次
  29 + */
  30 +// @Scheduled(cron = "*/20 * * * * ?")
  31 + @Scheduled(cron = "0 0 0 * * ?")
  32 + public void executeByDay() {
  33 + for (String keyword : KeywordUtil.getKeyWordList()) {
  34 + Spider.create(natureSearchPageProcessor)
  35 + // 添加这个Spider要爬取的网页地址
  36 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1)
  37 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2)
  38 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3)
  39 + .addPipeline(articlePipeline)
  40 +// .setDownloader(downloader.newIpDownloader())
  41 + // 开启5个线程执行,并开始爬取
  42 + .thread(5).run();
  43 + }
  44 + }
  45 +}
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -3,12 +3,13 @@ package com.canrd.webmagic.processor; @@ -3,12 +3,13 @@ package com.canrd.webmagic.processor;
3 import com.alibaba.fastjson.JSONArray; 3 import com.alibaba.fastjson.JSONArray;
4 import com.alibaba.fastjson.JSONObject; 4 import com.alibaba.fastjson.JSONObject;
5 import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; 5 import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
6 import com.canrd.webmagic.common.utils.KeywordUtil; 7 import com.canrd.webmagic.common.utils.KeywordUtil;
7 import com.canrd.webmagic.common.utils.StringUtils; 8 import com.canrd.webmagic.common.utils.StringUtils;
8 import com.canrd.webmagic.domain.ArticleTypeEnum; 9 import com.canrd.webmagic.domain.ArticleTypeEnum;
9 import com.canrd.webmagic.domain.dto.ArticleDO; 10 import com.canrd.webmagic.domain.dto.ArticleDO;
10 import com.canrd.webmagic.processor.config.Agent; 11 import com.canrd.webmagic.processor.config.Agent;
11 -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; 12 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
12 import lombok.extern.slf4j.Slf4j; 13 import lombok.extern.slf4j.Slf4j;
13 import org.springframework.stereotype.Component; 14 import org.springframework.stereotype.Component;
14 import us.codecraft.webmagic.Page; 15 import us.codecraft.webmagic.Page;
@@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html; @@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html;
19 import us.codecraft.webmagic.selector.Selectable; 20 import us.codecraft.webmagic.selector.Selectable;
20 import us.codecraft.webmagic.selector.XpathSelector; 21 import us.codecraft.webmagic.selector.XpathSelector;
21 22
22 -import java.util.ArrayList;  
23 -import java.util.List;  
24 -import java.util.Objects; 23 +import java.text.ParseException;
  24 +import java.text.SimpleDateFormat;
  25 +import java.util.*;
25 import java.util.stream.Collectors; 26 import java.util.stream.Collectors;
26 27
27 /** 28 /**
@@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
104 } 105 }
105 String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); 106 String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
106 String publishTime; 107 String publishTime;
  108 + Date publishTimeDateTime = null;
107 try { 109 try {
108 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); 110 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
109 } catch (Exception e) { 111 } catch (Exception e) {
@@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor {
113 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); 115 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
114 } 116 }
115 } 117 }
  118 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  119 +
  120 + try {
  121 + publishTimeDateTime = formatter.parse(publishTime);
  122 + } catch (ParseException e) {
  123 + e.printStackTrace();
  124 + }
116 Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); 125 Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
117 List<Selectable> authorNodes = authorSelectable.nodes(); 126 List<Selectable> authorNodes = authorSelectable.nodes();
118 StringBuffer authorName = new StringBuffer(); 127 StringBuffer authorName = new StringBuffer();
@@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
170 .articleCode(articleCode) 179 .articleCode(articleCode)
171 .authorName(authorName.toString()) 180 .authorName(authorName.toString())
172 .title(title) 181 .title(title)
173 - .publishTime(publishTime) 182 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
174 .emailInfo(authorEmail.toJSONString()) 183 .emailInfo(authorEmail.toJSONString())
175 .articleDesc(articleDesc) 184 .articleDesc(articleDesc)
176 .authorAddress(authorAddress.toJSONString()) 185 .authorAddress(authorAddress.toJSONString())
@@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor {
191 /** 200 /**
192 * 获取到指定的dom后,从这些dom中提取元素内容。 201 * 获取到指定的dom后,从这些dom中提取元素内容。
193 */ 202 */
194 - for (int i = 1; i <= nodes.size() - 1; i++) { 203 + for (int i = 0; i <= nodes.size() - 1; i++) {
195 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); 204 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
196 - String link = node.$("a", "href").get(); 205 + String link = node.links().get();
197 String title = node.$("a", "text").get(); 206 String title = node.$("a", "text").get();
198 if (KeywordUtil.containKeywordsInTitle(title)) { 207 if (KeywordUtil.containKeywordsInTitle(title)) {
199 - page.addTargetRequest(link);  
200 - log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); 208 + String publishTime = nodes.get(i).xpath("//div[@class='c-card__section c-meta']/time/text()").get();
  209 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  210 + try {
  211 + Date publishTimeDateTime = formatter.parse(publishTime);
  212 + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
  213 + page.addTargetRequest(link);
  214 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  215 + }
  216 + } catch (ParseException e) {
  217 + e.printStackTrace();
  218 + }
  219 +
201 } 220 }
  221 +
202 } 222 }
203 } 223 }
204 224
@@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
212 Spider.create(new NatureSearchPageProcessor()) 232 Spider.create(new NatureSearchPageProcessor())
213 // 添加这个Spider要爬取的网页地址 233 // 添加这个Spider要爬取的网页地址
214 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") 234 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1")
215 - .addPipeline(new NatureArticlePipeline()) 235 + .addPipeline(new ArticlePipeline())
216 // 开启5个线程执行,并开始爬取 236 // 开启5个线程执行,并开始爬取
217 .thread(5).run(); 237 .thread(5).run();
218 } 238 }
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.alibaba.fastjson.JSONArray;
  4 +import com.alibaba.fastjson.JSONObject;
  5 +import com.canrd.webmagic.common.utils.DateUtil;
  6 +import com.canrd.webmagic.domain.ArticleTypeEnum;
  7 +import com.canrd.webmagic.domain.dto.ArticleDO;
  8 +import com.canrd.webmagic.processor.config.Agent;
  9 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  10 +import lombok.extern.slf4j.Slf4j;
  11 +import org.springframework.stereotype.Component;
  12 +import us.codecraft.webmagic.Page;
  13 +import us.codecraft.webmagic.Site;
  14 +import us.codecraft.webmagic.Spider;
  15 +import us.codecraft.webmagic.processor.PageProcessor;
  16 +import us.codecraft.webmagic.selector.Html;
  17 +import us.codecraft.webmagic.selector.Selectable;
  18 +
  19 +import java.text.ParseException;
  20 +import java.text.SimpleDateFormat;
  21 +import java.util.Date;
  22 +import java.util.List;
  23 +import java.util.Locale;
  24 +import java.util.Objects;
  25 +
  26 +/**
  27 + * @author: xms
  28 + * @description: TODO
  29 + * @date: 2024/4/1 14:19
  30 + * @version: 1.0
  31 + */
  32 +@Slf4j
  33 +@Component
  34 +public class Science4JournalArticlePageProcessor implements PageProcessor {
  35 + private String agent = Agent.getRandom();
  36 +
  37 + // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
  38 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());
  39 +
  40 + /**
  41 + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
  42 + *
  43 + * @param page
  44 + */
  45 + @Override
  46 + public void process(Page page) {
  47 + doArticleContent(page);
  48 + }
  49 +
  50 + /**
  51 + * @param page
  52 + */
  53 + private void doArticleContent(Page page) {
  54 +
  55 + //解析页面
  56 + Html html = page.getHtml();
  57 + String articleCode = page.getUrl().get();
  58 + Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header");
  59 +
  60 + String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get();
  61 +
  62 + String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get();
  63 +
  64 + String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get();
  65 + Date publishTimeDateTime = null;
  66 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  67 +
  68 + try {
  69 + publishTimeDateTime = formatter.parse(publishTime);
  70 + } catch (ParseException e) {
  71 + e.printStackTrace();
  72 + }
  73 + List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes();
  74 + StringBuffer authorName = new StringBuffer();
  75 + for (Selectable node : authorNodes) {
  76 + authorName.append(node.xpath("//a/span/text()").get()).append(" ");
  77 + }
  78 +
  79 +
  80 + JSONArray authorEmail = new JSONArray();
  81 + List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes();
  82 + for (Selectable authorEmailSelectable : authorEmailSelectables) {
  83 + String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get();
  84 + String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get();
  85 + String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get();
  86 +
  87 + JSONObject jsonObject = new JSONObject();
  88 + jsonObject.put("authorEmailName", givenName + "" + familyName);
  89 + jsonObject.put("email", email);
  90 + authorEmail.add(jsonObject);
  91 + }
  92 + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString());
  93 +
  94 + page.putField("article", ArticleDO.builder()
  95 + .articleType(ArticleTypeEnum.SCIENCE.getType())
  96 + .articleCode(articleCode)
  97 + .authorName(authorName.toString())
  98 + .title(title)
  99 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
  100 + .emailInfo(authorEmail.toJSONString())
  101 + .articleDesc(articleDesc)
  102 + .authorAddress(null)
  103 + .referenceInfo(null).build());
  104 + }
  105 +
  106 + @Override
  107 + public Site getSite() {
  108 + return site;
  109 + }
  110 +
  111 + public static void main(String[] args) {
  112 + // 创建一个Spider,并把我们的处理器放进去
  113 + Spider.create(new Science4JournalArticlePageProcessor())
  114 + // 添加这个Spider要爬取的网页地址
  115 + .addUrl("https://www.science.org/journal/science/insights?startPage=0")
  116 + .addPipeline(new ArticlePipeline())
  117 + // 开启5个线程执行,并开始爬取
  118 + .thread(5).run();
  119 + }
  120 +}
0 \ No newline at end of file 121 \ No newline at end of file
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.canrd.webmagic.common.utils.DateUtil;
  4 +import com.canrd.webmagic.common.utils.KeywordUtil;
  5 +import com.canrd.webmagic.processor.config.Agent;
  6 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  7 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  8 +import lombok.extern.slf4j.Slf4j;
  9 +import org.springframework.stereotype.Component;
  10 +import us.codecraft.webmagic.Page;
  11 +import us.codecraft.webmagic.Site;
  12 +import us.codecraft.webmagic.Spider;
  13 +import us.codecraft.webmagic.processor.PageProcessor;
  14 +import us.codecraft.webmagic.selector.Selectable;
  15 +import us.codecraft.webmagic.selector.XpathSelector;
  16 +
  17 +import javax.annotation.Resource;
  18 +import java.text.ParseException;
  19 +import java.text.SimpleDateFormat;
  20 +import java.util.Date;
  21 +import java.util.List;
  22 +import java.util.Locale;
  23 +
  24 +/**
  25 + * @author: xms
  26 + * @description: TODO
  27 + * @date: 2024/4/1 14:19
  28 + * @version: 1.0
  29 + */
  30 +@Slf4j
  31 +@Component
  32 +public class Science4JournalSearchPageProcessor implements PageProcessor {
  33 +
  34 + @Resource
  35 + private Science4JournalArticlePageProcessor science4JournalArticlePageProcessor;
  36 +
  37 + @Resource
  38 + private SeleniumDownloader seleniumDownloader;
  39 +
  40 + @Resource
  41 + private ArticlePipeline articlePipeline;
  42 +
  43 + /**
  44 + * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
  45 + */
  46 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());
  47 +
  48 + /**
  49 + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
  50 + *
  51 + * @param page
  52 + */
  53 + @Override
  54 + public void process(Page page) {
  55 + doArticleList(page);
  56 + }
  57 +
  58 + /**
  59 + * @param page
  60 + */
  61 + private void doArticleList(Page page) {
  62 + String url = page.getUrl().get();
  63 + /**
  64 + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
  65 + * 1、通过$或css()方法获取到该page html下某元素dom
  66 + */
  67 + Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
  68 + List<Selectable> nodes = selectable.nodes();
  69 +
  70 + /**
  71 + * 获取到指定的dom后,从这些dom中提取元素内容。
  72 + */
  73 + for (int i = 0; i <= nodes.size() - 1; i++) {
  74 + String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
  75 + String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get();
  76 + String link = nodes.get(0).links().get();
  77 + if (!KeywordUtil.containKeywordsInTitle(title)) {
  78 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  79 + try {
  80 + Date publishTimeDateTime = formatter.parse(time);
  81 + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
  82 +// page.addTargetRequest(link);
  83 + Spider.create(science4JournalArticlePageProcessor)
  84 + .addUrl(link)
  85 + .addPipeline(articlePipeline)
  86 + .setDownloader(seleniumDownloader)
  87 + // 开启5个线程执行,并开始爬取
  88 + .thread(1).run();
  89 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  90 + }
  91 + } catch (ParseException e) {
  92 + e.printStackTrace();
  93 + }
  94 +
  95 + }
  96 + }
  97 +
  98 + }
  99 +
  100 + @Override
  101 + public Site getSite() {
  102 + return site;
  103 + }
  104 +
  105 + public static void main(String[] args) {
  106 + // 创建一个Spider,并把我们的处理器放进去
  107 + Spider.create(new Science4JournalSearchPageProcessor())
  108 + // 添加这个Spider要爬取的网页地址
  109 + .addUrl("https://www.science.org/journal/science/insights?startPage=0")
  110 + .addPipeline(new ArticlePipeline())
  111 + // 开启5个线程执行,并开始爬取
  112 + .thread(5).run();
  113 + }
  114 +}
0 \ No newline at end of file 115 \ No newline at end of file
src/main/java/com/canrd/webmagic/processor/config/Downloader.java renamed to src/main/java/com/canrd/webmagic/processor/download/Downloader.java
1 -package com.canrd.webmagic.processor.config; 1 +package com.canrd.webmagic.processor.download;
2 2
3 import lombok.extern.slf4j.Slf4j; 3 import lombok.extern.slf4j.Slf4j;
4 import org.springframework.beans.factory.annotation.Autowired; 4 import org.springframework.beans.factory.annotation.Autowired;
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java 0 → 100644
  1 +package com.canrd.webmagic.processor.download;
  2 +
  3 +import lombok.extern.slf4j.Slf4j;
  4 +import org.openqa.selenium.By;
  5 +import org.openqa.selenium.Cookie;
  6 +import org.openqa.selenium.WebDriver;
  7 +import org.openqa.selenium.WebElement;
  8 +import org.springframework.stereotype.Component;
  9 +import us.codecraft.webmagic.Page;
  10 +import us.codecraft.webmagic.Request;
  11 +import us.codecraft.webmagic.Site;
  12 +import us.codecraft.webmagic.Task;
  13 +import us.codecraft.webmagic.downloader.AbstractDownloader;
  14 +import us.codecraft.webmagic.selector.Html;
  15 +import us.codecraft.webmagic.selector.PlainText;
  16 +
  17 +import javax.annotation.Resource;
  18 +import java.util.Map;
  19 +
  20 +/**
  21 + * @author: xms
  22 + * @description: TODO
  23 + * @date: 2024/4/26 16:36
  24 + * @version: 1.0
  25 + */
  26 +@Slf4j
  27 +@Component
  28 +public class SeleniumDownloader extends AbstractDownloader {
  29 + private int sleepTime = 0;
  30 +
  31 + @Resource
  32 + private WebDriver webDriver;
  33 +
  34 + /**
  35 + * set sleep time to wait until load success
  36 + *
  37 + * @param sleepTime sleepTime
  38 + * @return this
  39 + */
  40 + public SeleniumDownloader setSleepTime(int sleepTime) {
  41 + this.sleepTime = sleepTime;
  42 + return this;
  43 + }
  44 +
  45 + @Override
  46 + public Page download(Request request, Task task) {
  47 + Page page = Page.fail();
  48 + try {
  49 +
  50 +
  51 + log.info("downloading page " + request.getUrl());
  52 + webDriver.get(request.getUrl());
  53 + try {
  54 + if (sleepTime > 0) {
  55 + Thread.sleep(sleepTime);
  56 + }
  57 + } catch (InterruptedException e) {
  58 + e.printStackTrace();
  59 + }
  60 + WebDriver.Options manage = webDriver.manage();
  61 + Site site = task.getSite();
  62 + if (site.getCookies() != null) {
  63 + for (Map.Entry<String, String> cookieEntry : site.getCookies()
  64 + .entrySet()) {
  65 + Cookie cookie = new Cookie(cookieEntry.getKey(),
  66 + cookieEntry.getValue());
  67 + manage.addCookie(cookie);
  68 + }
  69 + }
  70 +
  71 + /*
  72 + * TODO You can add mouse event or other processes
  73 + *
  74 + * @author: bob.li.0718@gmail.com
  75 + */
  76 + try {
  77 + //休眠3秒就是为了动态的数据渲染完成后在进行获取
  78 + Thread.sleep(30000);
  79 + } catch (InterruptedException e) {
  80 + throw new RuntimeException(e);
  81 + }
  82 + WebElement webElement = webDriver.findElement(By.xpath("/html"));
  83 + String content = webElement.getAttribute("outerHTML");
  84 + page.setDownloadSuccess(true);
  85 + page.setRawText(content);
  86 + page.setHtml(new Html(content, request.getUrl()));
  87 + page.setUrl(new PlainText(request.getUrl()));
  88 + page.setRequest(request);
  89 + onSuccess(request, task);
  90 + } catch (Exception e) {
  91 + log.warn("download page {} error", request.getUrl(), e);
  92 + onError(request, task, e);
  93 + } finally {
  94 +
  95 + }
  96 + return page;
  97 + }
  98 +
  99 + @Override
  100 + public void setThread(int i) {
  101 +
  102 + }
  103 +}
src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
@@ -14,7 +14,7 @@ import java.util.List; @@ -14,7 +14,7 @@ import java.util.List;
14 import java.util.Objects; 14 import java.util.Objects;
15 15
16 @Component 16 @Component
17 -public class NatureArticlePipeline implements Pipeline { 17 +public class ArticlePipeline implements Pipeline {
18 18
19 private ArticleService articleService; 19 private ArticleService articleService;
20 20
src/main/resources/user-agent/User-Agents.txt
1 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 1 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
2 -Opera/8.0 (Windows NT 5.1; U; en)  
3 Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50 2 Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
4 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 3 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
5 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 4 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
6 -Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10  
7 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 5 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
8 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 6 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
9 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 7 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
@@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C @@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C
19 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 17 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
20 Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) 18 Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
21 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 19 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
22 -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36  
23 \ No newline at end of file 20 \ No newline at end of file
  21 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
  22 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36
  23 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0
24 \ No newline at end of file 24 \ No newline at end of file