Commit a24924df1d1fad46ddee4053e191bf6ac4731396

Authored by 谢茂盛
1 parent 8f304c93

feat:

1、selenium 整合
2、science 网站
... ... @@ -47,6 +47,7 @@
47 47 <jjwt.version>0.10.6</jjwt.version>
48 48 <easyexcel.version>2.2.3</easyexcel.version>
49 49 <webmagic.version>0.10.0</webmagic.version>
  50 + <selenium.version>3.4.0</selenium.version>
50 51 </properties>
51 52  
52 53 <dependencies>
... ... @@ -78,6 +79,12 @@
78 79 <version>${webmagic.version}</version>
79 80 </dependency>
80 81  
  82 + <!-- selenium -->
  83 + <dependency>
  84 + <groupId>org.seleniumhq.selenium</groupId>
  85 + <artifactId>selenium-java</artifactId>
  86 + <version>${selenium.version}</version>
  87 + </dependency>
81 88  
82 89 <!-- Lombok 依赖-->
83 90 <dependency>
... ...
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java 0 → 100644
  1 +package com.canrd.webmagic.config;
  2 +
  3 +import com.canrd.webmagic.processor.config.Agent;
  4 +import org.openqa.selenium.WebDriver;
  5 +import org.openqa.selenium.chrome.ChromeDriver;
  6 +import org.openqa.selenium.chrome.ChromeOptions;
  7 +import org.springframework.context.annotation.Bean;
  8 +import org.springframework.context.annotation.Configuration;
  9 +
  10 +/**
  11 + * @author: xms
  12 + * @description: TODO
  13 + * @date: 2024/4/26 14:37
  14 + * @version: 1.0
  15 + */
  16 +@Configuration
  17 +public class SeleniumConfig {
  18 +
  19 + @Bean
  20 + public WebDriver webDriver() {
  21 + System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
  22 + return new ChromeDriver();
  23 + }
  24 +}
... ...
src/main/java/com/canrd/webmagic/controller/ArticleController.java
... ... @@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
8   -import com.canrd.webmagic.processor.config.Downloader;
9   -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
  8 +import com.canrd.webmagic.processor.download.Downloader;
  9 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
10 10 import com.canrd.webmagic.service.ArticleService;
11 11 import org.springframework.validation.annotation.Validated;
12 12 import org.springframework.web.bind.annotation.*;
... ... @@ -33,7 +33,7 @@ public class ArticleController {
33 33 private NatureSearchPageProcessor natureSearchPageProcessor;
34 34  
35 35 @Resource
36   - private NatureArticlePipeline articlePipeline;
  36 + private ArticlePipeline articlePipeline;
37 37  
38 38 @Resource
39 39 private Downloader downloader;
... ... @@ -49,7 +49,7 @@ public class ArticleController {
49 49 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
50 50 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
51 51 .addPipeline(articlePipeline)
52   - .setDownloader(downloader.newIpDownloader())
  52 +// .setDownloader(downloader.newIpDownloader())
53 53 // 开启5个线程执行,并开始爬取
54 54 .thread(5).run();
55 55 }
... ...
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.jsr303.OperateGroup;
  5 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  6 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
  7 +import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
  8 +import com.canrd.webmagic.service.ArticleService;
  9 +import org.springframework.validation.annotation.Validated;
  10 +import org.springframework.web.bind.annotation.*;
  11 +import us.codecraft.webmagic.Spider;
  12 +
  13 +import javax.annotation.Resource;
  14 +
  15 +/**
  16 + * nature-文章信息(NatureArticle)表控制层
  17 + *
  18 + * @author makejava
  19 + * @since 2024-04-07 18:39:41
  20 + */
  21 +@RestController
  22 +@RequestMapping("/science/journal")
  23 +public class Science4JournalController {
  24 + /**
  25 + * 服务对象
  26 + */
  27 + @Resource
  28 + private ArticleService articleService;
  29 +
  30 + @Resource
  31 + private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor;
  32 +
  33 + /**
  34 + * @return
  35 + */
  36 + @GetMapping("/start")
  37 + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
  38 + for (int i = 0; i <= indexSize; i++) {
  39 + Spider.create(science4JournalSearchPageProcessor)
  40 + .addUrl("http://www.science.org/journal/science/insights?startPage=" + i)
  41 + // 开启5个线程执行,并开始爬取
  42 + .thread(5).run();
  43 + }
  44 +
  45 + return ServerResult.success();
  46 + }
  47 +
  48 + /**
  49 + * 分页查询
  50 + *
  51 + * @param natureArticleQueryVO 查询条件
  52 + * @return 查询结果
  53 + */
  54 + @PostMapping("/list")
  55 + public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) {
  56 + return articleService.list(natureArticleQueryVO);
  57 + }
  58 +
  59 + /**
  60 + * 通过主键查询单条数据
  61 + *
  62 + * @param natureArticleQueryVO 查询条件
  63 + * @return 单条数据
  64 + */
  65 + @PostMapping("/query_by_id")
  66 + public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) {
  67 + return articleService.queryById(natureArticleQueryVO);
  68 + }
  69 +
  70 + /**
  71 + * 新增数据
  72 + *
  73 + * @param natureArticleVO 数据VO
  74 + * @return 新增结果
  75 + */
  76 + @PostMapping("/add")
  77 + public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) {
  78 + return articleService.add(natureArticleVO);
  79 + }
  80 +
  81 + /**
  82 + * 编辑数据
  83 + *
  84 + * @param natureArticleVO 数据VO
  85 + * @return 编辑结果
  86 + */
  87 + @PostMapping("/edit")
  88 + public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) {
  89 + return articleService.edit(natureArticleVO);
  90 + }
  91 +
  92 + /**
  93 + * 删除数据
  94 + *
  95 + * @param natureArticleQueryVO 查询条件
  96 + * @return 删除是否成功
  97 + */
  98 + @PostMapping("/delete_by_id")
  99 + public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) {
  100 + return articleService.deleteById(natureArticleQueryVO);
  101 + }
  102 +
  103 +}
  104 +
... ...
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
... ... @@ -15,6 +15,7 @@ import lombok.NoArgsConstructor;
15 15 @NoArgsConstructor
16 16 public enum ArticleTypeEnum {
17 17 NATURE("nature", "nature网址"),
  18 + SCIENCE("science", "science网址"),
18 19 ;
19 20 private String type;
20 21 private String desc;
... ...
src/main/java/com/canrd/webmagic/job/NatureJob.java 0 → 100644
  1 +package com.canrd.webmagic.job;
  2 +
  3 +import com.canrd.webmagic.common.utils.KeywordUtil;
  4 +import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  5 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  6 +import org.springframework.scheduling.annotation.Scheduled;
  7 +import org.springframework.stereotype.Component;
  8 +import us.codecraft.webmagic.Spider;
  9 +
  10 +import javax.annotation.Resource;
  11 +
  12 +/**
  13 + * @author: xms
  14 + * @description: TODO
  15 + * @date: 2024/4/26 10:06
  16 + * @version: 1.0
  17 + */
  18 +@Component
  19 +public class NatureJob {
  20 +
  21 + @Resource
  22 + private NatureSearchPageProcessor natureSearchPageProcessor;
  23 +
  24 + @Resource
  25 + private ArticlePipeline articlePipeline;
  26 +
  27 + /**
  28 + * 每天凌晨执行一次
  29 + */
  30 +// @Scheduled(cron = "*/20 * * * * ?")
  31 + @Scheduled(cron = "0 0 0 * * ?")
  32 + public void executeByDay() {
  33 + for (String keyword : KeywordUtil.getKeyWordList()) {
  34 + Spider.create(natureSearchPageProcessor)
  35 + // 添加这个Spider要爬取的网页地址
  36 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1)
  37 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2)
  38 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3)
  39 + .addPipeline(articlePipeline)
  40 +// .setDownloader(downloader.newIpDownloader())
  41 + // 开启5个线程执行,并开始爬取
  42 + .thread(5).run();
  43 + }
  44 + }
  45 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -3,12 +3,13 @@ package com.canrd.webmagic.processor;
3 3 import com.alibaba.fastjson.JSONArray;
4 4 import com.alibaba.fastjson.JSONObject;
5 5 import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
  6 +import com.canrd.webmagic.common.utils.DateUtil;
6 7 import com.canrd.webmagic.common.utils.KeywordUtil;
7 8 import com.canrd.webmagic.common.utils.StringUtils;
8 9 import com.canrd.webmagic.domain.ArticleTypeEnum;
9 10 import com.canrd.webmagic.domain.dto.ArticleDO;
10 11 import com.canrd.webmagic.processor.config.Agent;
11   -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
  12 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
12 13 import lombok.extern.slf4j.Slf4j;
13 14 import org.springframework.stereotype.Component;
14 15 import us.codecraft.webmagic.Page;
... ... @@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html;
19 20 import us.codecraft.webmagic.selector.Selectable;
20 21 import us.codecraft.webmagic.selector.XpathSelector;
21 22  
22   -import java.util.ArrayList;
23   -import java.util.List;
24   -import java.util.Objects;
  23 +import java.text.ParseException;
  24 +import java.text.SimpleDateFormat;
  25 +import java.util.*;
25 26 import java.util.stream.Collectors;
26 27  
27 28 /**
... ... @@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
104 105 }
105 106 String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get();
106 107 String publishTime;
  108 + Date publishTimeDateTime = null;
107 109 try {
108 110 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get();
109 111 } catch (Exception e) {
... ... @@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor {
113 115 publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get();
114 116 }
115 117 }
  118 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  119 +
  120 + try {
  121 + publishTimeDateTime = formatter.parse(publishTime);
  122 + } catch (ParseException e) {
  123 + e.printStackTrace();
  124 + }
116 125 Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']"));
117 126 List<Selectable> authorNodes = authorSelectable.nodes();
118 127 StringBuffer authorName = new StringBuffer();
... ... @@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
170 179 .articleCode(articleCode)
171 180 .authorName(authorName.toString())
172 181 .title(title)
173   - .publishTime(publishTime)
  182 + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
174 183 .emailInfo(authorEmail.toJSONString())
175 184 .articleDesc(articleDesc)
176 185 .authorAddress(authorAddress.toJSONString())
... ... @@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor {
191 200 /**
192 201 * 获取到指定的dom后,从这些dom中提取元素内容。
193 202 */
194   - for (int i = 1; i <= nodes.size() - 1; i++) {
  203 + for (int i = 0; i <= nodes.size() - 1; i++) {
195 204 Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0);
196   - String link = node.$("a", "href").get();
  205 + String link = node.links().get();
197 206 String title = node.$("a", "text").get();
198 207 if (KeywordUtil.containKeywordsInTitle(title)) {
199   - page.addTargetRequest(link);
200   - log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  208 + String publishTime = nodes.get(i).xpath("//div[@class='c-card__section c-meta']/time/text()").get();
  209 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  210 + try {
  211 + Date publishTimeDateTime = formatter.parse(publishTime);
  212 + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
  213 + page.addTargetRequest(link);
  214 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  215 + }
  216 + } catch (ParseException e) {
  217 + e.printStackTrace();
  218 + }
  219 +
201 220 }
  221 +
202 222 }
203 223 }
204 224  
... ... @@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
212 232 Spider.create(new NatureSearchPageProcessor())
213 233 // 添加这个Spider要爬取的网页地址
214 234 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1")
215   - .addPipeline(new NatureArticlePipeline())
  235 + .addPipeline(new ArticlePipeline())
216 236 // 开启5个线程执行,并开始爬取
217 237 .thread(5).run();
218 238 }
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java 0 → 100644
package com.canrd.webmagic.processor;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;

/**
 * Extracts a single science.org article page (title, abstract, publish date,
 * authors, author emails) into an {@link ArticleDO} for the pipeline.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4JournalArticlePageProcessor implements PageProcessor {

    // Crawl configuration: encoding, retry count, politeness delay, random UA.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());

    /**
     * Core extraction entry point invoked by webmagic for each downloaded page.
     *
     * @param page downloaded article page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Parses the article DOM and pushes an {@link ArticleDO} into the page's
     * result fields.
     *
     * @param page downloaded article page
     */
    private void doArticleContent(Page page) {

        // Parse the page; the article URL doubles as the article code.
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header");

        String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get();

        String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get();

        String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get();
        Date publishTimeDateTime = null;
        // science.org renders dates like "26 April 2024"; SimpleDateFormat is not
        // thread-safe, so keep it local to the call.
        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);

        try {
            publishTimeDateTime = formatter.parse(publishTime);
        } catch (ParseException e) {
            // Fall back to storing the raw string below instead of failing the page.
            log.warn("unparseable publish time '{}' for article {}", publishTime, articleCode, e);
        }
        List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes();
        StringBuffer authorName = new StringBuffer();
        for (Selectable node : authorNodes) {
            authorName.append(node.xpath("//a/span/text()").get()).append(" ");
        }


        JSONArray authorEmail = new JSONArray();
        List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes();
        for (Selectable authorEmailSelectable : authorEmailSelectables) {
            String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get();
            String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get();
            String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get();

            JSONObject jsonObject = new JSONObject();
            // Separate given and family name with a space (original concatenated
            // them with an empty string, producing "JaneDoe").
            jsonObject.put("authorEmailName", givenName + " " + familyName);
            jsonObject.put("email", email);
            authorEmail.add(jsonObject);
        }
        // Fixed: the format string had 5 placeholders but only 4 arguments, so the
        // author list was silently dropped from the log line.
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE.getType())
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                // Store the normalized date when parsing succeeded, else the raw text.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null).build());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Create a Spider with this processor and start crawling.
        Spider.create(new Science4JournalArticlePageProcessor())
                // Start URL for the Spider.
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // Crawl with 5 worker threads.
                .thread(5).run();
    }
}
0 121 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java 0 → 100644
package com.canrd.webmagic.processor;

import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import javax.annotation.Resource;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;

/**
 * Parses a science.org journal listing page and, for each matching article
 * card, launches a Selenium-backed crawl of the article detail page.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4JournalSearchPageProcessor implements PageProcessor {

    @Resource
    private Science4JournalArticlePageProcessor science4JournalArticlePageProcessor;

    @Resource
    private SeleniumDownloader seleniumDownloader;

    @Resource
    private ArticlePipeline articlePipeline;

    /**
     * Crawl configuration: encoding, retry count, politeness delay, random UA.
     */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());

    /**
     * Core extraction entry point invoked by webmagic for each downloaded page.
     *
     * @param page downloaded listing page
     */
    @Override
    public void process(Page page) {
        doArticleList(page);
    }

    /**
     * Walks the result cards on the listing page and crawls each qualifying
     * article (keyword match, published on/after 2000-01-01).
     *
     * @param page downloaded listing page
     */
    private void doArticleList(Page page) {
        String url = page.getUrl().get();
        // Select the per-article cards inside the search-results container.
        Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
        List<Selectable> nodes = selectable.nodes();

        // Extract title / publish date / link from each card.
        for (int i = 0; i <= nodes.size() - 1; i++) {
            String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
            String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get();
            // Fixed: original read nodes.get(0), so every iteration extracted the
            // FIRST card's link regardless of which card was being processed.
            String link = nodes.get(i).links().get();
            // Fixed: original negated this check, crawling only articles whose
            // title did NOT contain a keyword — the opposite of
            // NatureSearchPageProcessor's filter.
            if (KeywordUtil.containKeywordsInTitle(title)) {
                SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
                try {
                    Date publishTimeDateTime = formatter.parse(time);
                    // Skip anything published before 2000-01-01.
                    if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
                        // NOTE(review): spawning a blocking nested Spider per article
                        // serializes the crawl; consider page.addTargetRequest(link)
                        // with a Selenium downloader on the outer spider instead.
                        Spider.create(science4JournalArticlePageProcessor)
                                .addUrl(link)
                                .addPipeline(articlePipeline)
                                .setDownloader(seleniumDownloader)
                                .thread(1).run();
                        log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
                    }
                } catch (ParseException e) {
                    log.warn("unparseable publish time '{}' on listing {}", time, url, e);
                }

            }
        }

    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Create a Spider with this processor and start crawling.
        Spider.create(new Science4JournalSearchPageProcessor())
                // Start URL for the Spider.
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // Crawl with 5 worker threads.
                .thread(5).run();
    }
}
0 115 \ No newline at end of file
... ...
src/main/java/com/canrd/webmagic/processor/config/Downloader.java renamed to src/main/java/com/canrd/webmagic/processor/download/Downloader.java
1   -package com.canrd.webmagic.processor.config;
  1 +package com.canrd.webmagic.processor.download;
2 2  
3 3 import lombok.extern.slf4j.Slf4j;
4 4 import org.springframework.beans.factory.annotation.Autowired;
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java 0 → 100644
package com.canrd.webmagic.processor.download;

import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import javax.annotation.Resource;
import java.util.Map;

/**
 * webmagic downloader that renders pages through a shared Selenium WebDriver
 * so JavaScript-generated content is present in the returned HTML.
 *
 * @author: xms
 * @date: 2024/4/26 16:36
 * @version: 1.0
 */
@Slf4j
@Component
public class SeleniumDownloader extends AbstractDownloader {
    // Extra wait (ms) after navigation before reading the DOM; 0 disables it.
    private int sleepTime = 0;

    @Resource
    private WebDriver webDriver;

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    @Override
    public Page download(Request request, Task task) {
        Page page = Page.fail();
        // The injected ChromeDriver is a singleton and not thread-safe; serialize
        // access so a multi-threaded Spider cannot interleave navigations.
        synchronized (webDriver) {
            try {
                log.info("downloading page " + request.getUrl());
                webDriver.get(request.getUrl());
                try {
                    if (sleepTime > 0) {
                        Thread.sleep(sleepTime);
                    }
                } catch (InterruptedException e) {
                    // Restore the interrupt flag instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                WebDriver.Options manage = webDriver.manage();
                Site site = task.getSite();
                if (site.getCookies() != null) {
                    for (Map.Entry<String, String> cookieEntry : site.getCookies()
                            .entrySet()) {
                        Cookie cookie = new Cookie(cookieEntry.getKey(),
                                cookieEntry.getValue());
                        manage.addCookie(cookie);
                    }
                }

                /*
                 * TODO You can add mouse event or other processes
                 *
                 * @author: bob.li.0718@gmail.com
                 */
                try {
                    // Wait 30 seconds for dynamically rendered content to settle
                    // before scraping. (Original comment claimed 3 seconds; consider
                    // replacing this fixed sleep with an explicit WebDriverWait.)
                    Thread.sleep(30000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
                WebElement webElement = webDriver.findElement(By.xpath("/html"));
                String content = webElement.getAttribute("outerHTML");
                page.setDownloadSuccess(true);
                page.setRawText(content);
                page.setHtml(new Html(content, request.getUrl()));
                page.setUrl(new PlainText(request.getUrl()));
                page.setRequest(request);
                onSuccess(request, task);
            } catch (Exception e) {
                log.warn("download page {} error", request.getUrl(), e);
                onError(request, task, e);
            }
        }
        return page;
    }

    @Override
    public void setThread(int i) {
        // Thread count is ignored: the single shared WebDriver forces serial downloads.
    }
}
... ...
src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
... ... @@ -14,7 +14,7 @@ import java.util.List;
14 14 import java.util.Objects;
15 15  
16 16 @Component
17   -public class NatureArticlePipeline implements Pipeline {
  17 +public class ArticlePipeline implements Pipeline {
18 18  
19 19 private ArticleService articleService;
20 20  
... ...
src/main/resources/user-agent/User-Agents.txt
1 1 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
2   -Opera/8.0 (Windows NT 5.1; U; en)
3 2 Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
4 3 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
5 4 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
6   -Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
7 5 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
8 6 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
9 7 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
... ... @@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C
19 17 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
20 18 Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
21 19 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
22   -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
23 20 \ No newline at end of file
  21 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
  22 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36
  23 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0
24 24 \ No newline at end of file
... ...