Commit a24924df1d1fad46ddee4053e191bf6ac4731396
1 parent
8f304c93
feat:
1、selenium 整合 2、science 网站
Showing
13 changed files
with
557 additions
and
19 deletions
pom.xml
@@ -47,6 +47,7 @@ | @@ -47,6 +47,7 @@ | ||
47 | <jjwt.version>0.10.6</jjwt.version> | 47 | <jjwt.version>0.10.6</jjwt.version> |
48 | <easyexcel.version>2.2.3</easyexcel.version> | 48 | <easyexcel.version>2.2.3</easyexcel.version> |
49 | <webmagic.version>0.10.0</webmagic.version> | 49 | <webmagic.version>0.10.0</webmagic.version> |
50 | + <selenium.version>3.4.0</selenium.version> | ||
50 | </properties> | 51 | </properties> |
51 | 52 | ||
52 | <dependencies> | 53 | <dependencies> |
@@ -78,6 +79,12 @@ | @@ -78,6 +79,12 @@ | ||
78 | <version>${webmagic.version}</version> | 79 | <version>${webmagic.version}</version> |
79 | </dependency> | 80 | </dependency> |
80 | 81 | ||
82 | + <!-- selenium --> | ||
83 | + <dependency> | ||
84 | + <groupId>org.seleniumhq.selenium</groupId> | ||
85 | + <artifactId>selenium-java</artifactId> | ||
86 | + <version>${selenium.version}</version> | ||
87 | + </dependency> | ||
81 | 88 | ||
82 | <!-- Lombok 依赖--> | 89 | <!-- Lombok 依赖--> |
83 | <dependency> | 90 | <dependency> |
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
0 → 100644
1 | +package com.canrd.webmagic.config; | ||
2 | + | ||
3 | +import com.canrd.webmagic.processor.config.Agent; | ||
4 | +import org.openqa.selenium.WebDriver; | ||
5 | +import org.openqa.selenium.chrome.ChromeDriver; | ||
6 | +import org.openqa.selenium.chrome.ChromeOptions; | ||
7 | +import org.springframework.context.annotation.Bean; | ||
8 | +import org.springframework.context.annotation.Configuration; | ||
9 | + | ||
10 | +/** | ||
11 | + * @author: xms | ||
12 | + * @description: TODO | ||
13 | + * @date: 2024/4/26 14:37 | ||
14 | + * @version: 1.0 | ||
15 | + */ | ||
16 | +@Configuration | ||
17 | +public class SeleniumConfig { | ||
18 | + | ||
19 | + @Bean | ||
20 | + public WebDriver webDriver() { | ||
21 | + System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); | ||
22 | + return new ChromeDriver(); | ||
23 | + } | ||
24 | +} |
src/main/java/com/canrd/webmagic/controller/ArticleController.java
@@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | @@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | ||
5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | -import com.canrd.webmagic.processor.config.Downloader; | ||
9 | -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | 8 | +import com.canrd.webmagic.processor.download.Downloader; |
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
10 | import com.canrd.webmagic.service.ArticleService; | 10 | import com.canrd.webmagic.service.ArticleService; |
11 | import org.springframework.validation.annotation.Validated; | 11 | import org.springframework.validation.annotation.Validated; |
12 | import org.springframework.web.bind.annotation.*; | 12 | import org.springframework.web.bind.annotation.*; |
@@ -33,7 +33,7 @@ public class ArticleController { | @@ -33,7 +33,7 @@ public class ArticleController { | ||
33 | private NatureSearchPageProcessor natureSearchPageProcessor; | 33 | private NatureSearchPageProcessor natureSearchPageProcessor; |
34 | 34 | ||
35 | @Resource | 35 | @Resource |
36 | - private NatureArticlePipeline articlePipeline; | 36 | + private ArticlePipeline articlePipeline; |
37 | 37 | ||
38 | @Resource | 38 | @Resource |
39 | private Downloader downloader; | 39 | private Downloader downloader; |
@@ -49,7 +49,7 @@ public class ArticleController { | @@ -49,7 +49,7 @@ public class ArticleController { | ||
49 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) | 49 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) |
50 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) | 50 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
51 | .addPipeline(articlePipeline) | 51 | .addPipeline(articlePipeline) |
52 | - .setDownloader(downloader.newIpDownloader()) | 52 | +// .setDownloader(downloader.newIpDownloader()) |
53 | // 开启5个线程执行,并开始爬取 | 53 | // 开启5个线程执行,并开始爬取 |
54 | .thread(5).run(); | 54 | .thread(5).run(); |
55 | } | 55 | } |
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.constant.ServerResult; | ||
4 | +import com.canrd.webmagic.common.jsr303.OperateGroup; | ||
5 | +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | ||
6 | +import com.canrd.webmagic.domain.vo.NatureArticleVO; | ||
7 | +import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; | ||
8 | +import com.canrd.webmagic.service.ArticleService; | ||
9 | +import org.springframework.validation.annotation.Validated; | ||
10 | +import org.springframework.web.bind.annotation.*; | ||
11 | +import us.codecraft.webmagic.Spider; | ||
12 | + | ||
13 | +import javax.annotation.Resource; | ||
14 | + | ||
15 | +/** | ||
16 | + * nature-文章信息(NatureArticle)表控制层 | ||
17 | + * | ||
18 | + * @author makejava | ||
19 | + * @since 2024-04-07 18:39:41 | ||
20 | + */ | ||
21 | +@RestController | ||
22 | +@RequestMapping("/science/journal") | ||
23 | +public class Science4JournalController { | ||
24 | + /** | ||
25 | + * 服务对象 | ||
26 | + */ | ||
27 | + @Resource | ||
28 | + private ArticleService articleService; | ||
29 | + | ||
30 | + @Resource | ||
31 | + private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; | ||
32 | + | ||
33 | + /** | ||
34 | + * @return | ||
35 | + */ | ||
36 | + @GetMapping("/start") | ||
37 | + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | ||
38 | + for (int i = 0; i <= indexSize; i++) { | ||
39 | + Spider.create(science4JournalSearchPageProcessor) | ||
40 | + .addUrl("http://www.science.org/journal/science/insights?startPage=" + i) | ||
41 | + // 开启5个线程执行,并开始爬取 | ||
42 | + .thread(5).run(); | ||
43 | + } | ||
44 | + | ||
45 | + return ServerResult.success(); | ||
46 | + } | ||
47 | + | ||
48 | + /** | ||
49 | + * 分页查询 | ||
50 | + * | ||
51 | + * @param natureArticleQueryVO 查询条件 | ||
52 | + * @return 查询结果 | ||
53 | + */ | ||
54 | + @PostMapping("/list") | ||
55 | + public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) { | ||
56 | + return articleService.list(natureArticleQueryVO); | ||
57 | + } | ||
58 | + | ||
59 | + /** | ||
60 | + * 通过主键查询单条数据 | ||
61 | + * | ||
62 | + * @param natureArticleQueryVO 查询条件 | ||
63 | + * @return 单条数据 | ||
64 | + */ | ||
65 | + @PostMapping("/query_by_id") | ||
66 | + public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | ||
67 | + return articleService.queryById(natureArticleQueryVO); | ||
68 | + } | ||
69 | + | ||
70 | + /** | ||
71 | + * 新增数据 | ||
72 | + * | ||
73 | + * @param natureArticleVO 数据VO | ||
74 | + * @return 新增结果 | ||
75 | + */ | ||
76 | + @PostMapping("/add") | ||
77 | + public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) { | ||
78 | + return articleService.add(natureArticleVO); | ||
79 | + } | ||
80 | + | ||
81 | + /** | ||
82 | + * 编辑数据 | ||
83 | + * | ||
84 | + * @param natureArticleVO 数据VO | ||
85 | + * @return 编辑结果 | ||
86 | + */ | ||
87 | + @PostMapping("/edit") | ||
88 | + public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) { | ||
89 | + return articleService.edit(natureArticleVO); | ||
90 | + } | ||
91 | + | ||
92 | + /** | ||
93 | + * 删除数据 | ||
94 | + * | ||
95 | + * @param natureArticleQueryVO 查询条件 | ||
96 | + * @return 删除是否成功 | ||
97 | + */ | ||
98 | + @PostMapping("/delete_by_id") | ||
99 | + public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | ||
100 | + return articleService.deleteById(natureArticleQueryVO); | ||
101 | + } | ||
102 | + | ||
103 | +} | ||
104 | + |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
@@ -15,6 +15,7 @@ import lombok.NoArgsConstructor; | @@ -15,6 +15,7 @@ import lombok.NoArgsConstructor; | ||
15 | @NoArgsConstructor | 15 | @NoArgsConstructor |
16 | public enum ArticleTypeEnum { | 16 | public enum ArticleTypeEnum { |
17 | NATURE("nature", "nature网址"), | 17 | NATURE("nature", "nature网址"), |
18 | + SCIENCE("science", "science网址"), | ||
18 | ; | 19 | ; |
19 | private String type; | 20 | private String type; |
20 | private String desc; | 21 | private String desc; |
src/main/java/com/canrd/webmagic/job/NatureJob.java
0 → 100644
1 | +package com.canrd.webmagic.job; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.utils.KeywordUtil; | ||
4 | +import com.canrd.webmagic.processor.NatureSearchPageProcessor; | ||
5 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
6 | +import org.springframework.scheduling.annotation.Scheduled; | ||
7 | +import org.springframework.stereotype.Component; | ||
8 | +import us.codecraft.webmagic.Spider; | ||
9 | + | ||
10 | +import javax.annotation.Resource; | ||
11 | + | ||
12 | +/** | ||
13 | + * @author: xms | ||
14 | + * @description: TODO | ||
15 | + * @date: 2024/4/26 10:06 | ||
16 | + * @version: 1.0 | ||
17 | + */ | ||
18 | +@Component | ||
19 | +public class NatureJob { | ||
20 | + | ||
21 | + @Resource | ||
22 | + private NatureSearchPageProcessor natureSearchPageProcessor; | ||
23 | + | ||
24 | + @Resource | ||
25 | + private ArticlePipeline articlePipeline; | ||
26 | + | ||
27 | + /** | ||
28 | + * 每天凌晨执行一次 | ||
29 | + */ | ||
30 | +// @Scheduled(cron = "*/20 * * * * ?") | ||
31 | + @Scheduled(cron = "0 0 0 * * ?") | ||
32 | + public void executeByDay() { | ||
33 | + for (String keyword : KeywordUtil.getKeyWordList()) { | ||
34 | + Spider.create(natureSearchPageProcessor) | ||
35 | + // 添加这个Spider要爬取的网页地址 | ||
36 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1) | ||
37 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2) | ||
38 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3) | ||
39 | + .addPipeline(articlePipeline) | ||
40 | +// .setDownloader(downloader.newIpDownloader()) | ||
41 | + // 开启5个线程执行,并开始爬取 | ||
42 | + .thread(5).run(); | ||
43 | + } | ||
44 | + } | ||
45 | +} |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -3,12 +3,13 @@ package com.canrd.webmagic.processor; | @@ -3,12 +3,13 @@ package com.canrd.webmagic.processor; | ||
3 | import com.alibaba.fastjson.JSONArray; | 3 | import com.alibaba.fastjson.JSONArray; |
4 | import com.alibaba.fastjson.JSONObject; | 4 | import com.alibaba.fastjson.JSONObject; |
5 | import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | 5 | import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
6 | import com.canrd.webmagic.common.utils.KeywordUtil; | 7 | import com.canrd.webmagic.common.utils.KeywordUtil; |
7 | import com.canrd.webmagic.common.utils.StringUtils; | 8 | import com.canrd.webmagic.common.utils.StringUtils; |
8 | import com.canrd.webmagic.domain.ArticleTypeEnum; | 9 | import com.canrd.webmagic.domain.ArticleTypeEnum; |
9 | import com.canrd.webmagic.domain.dto.ArticleDO; | 10 | import com.canrd.webmagic.domain.dto.ArticleDO; |
10 | import com.canrd.webmagic.processor.config.Agent; | 11 | import com.canrd.webmagic.processor.config.Agent; |
11 | -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | 12 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
12 | import lombok.extern.slf4j.Slf4j; | 13 | import lombok.extern.slf4j.Slf4j; |
13 | import org.springframework.stereotype.Component; | 14 | import org.springframework.stereotype.Component; |
14 | import us.codecraft.webmagic.Page; | 15 | import us.codecraft.webmagic.Page; |
@@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html; | @@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html; | ||
19 | import us.codecraft.webmagic.selector.Selectable; | 20 | import us.codecraft.webmagic.selector.Selectable; |
20 | import us.codecraft.webmagic.selector.XpathSelector; | 21 | import us.codecraft.webmagic.selector.XpathSelector; |
21 | 22 | ||
22 | -import java.util.ArrayList; | ||
23 | -import java.util.List; | ||
24 | -import java.util.Objects; | 23 | +import java.text.ParseException; |
24 | +import java.text.SimpleDateFormat; | ||
25 | +import java.util.*; | ||
25 | import java.util.stream.Collectors; | 26 | import java.util.stream.Collectors; |
26 | 27 | ||
27 | /** | 28 | /** |
@@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
104 | } | 105 | } |
105 | String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | 106 | String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); |
106 | String publishTime; | 107 | String publishTime; |
108 | + Date publishTimeDateTime = null; | ||
107 | try { | 109 | try { |
108 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | 110 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); |
109 | } catch (Exception e) { | 111 | } catch (Exception e) { |
@@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
113 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); | 115 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); |
114 | } | 116 | } |
115 | } | 117 | } |
118 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
119 | + | ||
120 | + try { | ||
121 | + publishTimeDateTime = formatter.parse(publishTime); | ||
122 | + } catch (ParseException e) { | ||
123 | + e.printStackTrace(); | ||
124 | + } | ||
116 | Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | 125 | Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); |
117 | List<Selectable> authorNodes = authorSelectable.nodes(); | 126 | List<Selectable> authorNodes = authorSelectable.nodes(); |
118 | StringBuffer authorName = new StringBuffer(); | 127 | StringBuffer authorName = new StringBuffer(); |
@@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
170 | .articleCode(articleCode) | 179 | .articleCode(articleCode) |
171 | .authorName(authorName.toString()) | 180 | .authorName(authorName.toString()) |
172 | .title(title) | 181 | .title(title) |
173 | - .publishTime(publishTime) | 182 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) |
174 | .emailInfo(authorEmail.toJSONString()) | 183 | .emailInfo(authorEmail.toJSONString()) |
175 | .articleDesc(articleDesc) | 184 | .articleDesc(articleDesc) |
176 | .authorAddress(authorAddress.toJSONString()) | 185 | .authorAddress(authorAddress.toJSONString()) |
@@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
191 | /** | 200 | /** |
192 | * 获取到指定的dom后,从这些dom中提取元素内容。 | 201 | * 获取到指定的dom后,从这些dom中提取元素内容。 |
193 | */ | 202 | */ |
194 | - for (int i = 1; i <= nodes.size() - 1; i++) { | 203 | + for (int i = 0; i <= nodes.size() - 1; i++) { |
195 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); | 204 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
196 | - String link = node.$("a", "href").get(); | 205 | + String link = node.links().get(); |
197 | String title = node.$("a", "text").get(); | 206 | String title = node.$("a", "text").get(); |
198 | if (KeywordUtil.containKeywordsInTitle(title)) { | 207 | if (KeywordUtil.containKeywordsInTitle(title)) { |
199 | - page.addTargetRequest(link); | ||
200 | - log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | 208 | + String publishTime = nodes.get(i).xpath("//div[@class='c-card__section c-meta']/time/text()").get(); |
209 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
210 | + try { | ||
211 | + Date publishTimeDateTime = formatter.parse(publishTime); | ||
212 | + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) { | ||
213 | + page.addTargetRequest(link); | ||
214 | + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | ||
215 | + } | ||
216 | + } catch (ParseException e) { | ||
217 | + e.printStackTrace(); | ||
218 | + } | ||
219 | + | ||
201 | } | 220 | } |
221 | + | ||
202 | } | 222 | } |
203 | } | 223 | } |
204 | 224 | ||
@@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
212 | Spider.create(new NatureSearchPageProcessor()) | 232 | Spider.create(new NatureSearchPageProcessor()) |
213 | // 添加这个Spider要爬取的网页地址 | 233 | // 添加这个Spider要爬取的网页地址 |
214 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") | 234 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") |
215 | - .addPipeline(new NatureArticlePipeline()) | 235 | + .addPipeline(new ArticlePipeline()) |
216 | // 开启5个线程执行,并开始爬取 | 236 | // 开启5个线程执行,并开始爬取 |
217 | .thread(5).run(); | 237 | .thread(5).run(); |
218 | } | 238 | } |
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.alibaba.fastjson.JSONArray; | ||
4 | +import com.alibaba.fastjson.JSONObject; | ||
5 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
6 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | ||
7 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
8 | +import com.canrd.webmagic.processor.config.Agent; | ||
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
10 | +import lombok.extern.slf4j.Slf4j; | ||
11 | +import org.springframework.stereotype.Component; | ||
12 | +import us.codecraft.webmagic.Page; | ||
13 | +import us.codecraft.webmagic.Site; | ||
14 | +import us.codecraft.webmagic.Spider; | ||
15 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
16 | +import us.codecraft.webmagic.selector.Html; | ||
17 | +import us.codecraft.webmagic.selector.Selectable; | ||
18 | + | ||
19 | +import java.text.ParseException; | ||
20 | +import java.text.SimpleDateFormat; | ||
21 | +import java.util.Date; | ||
22 | +import java.util.List; | ||
23 | +import java.util.Locale; | ||
24 | +import java.util.Objects; | ||
25 | + | ||
26 | +/** | ||
27 | + * @author: xms | ||
28 | + * @description: TODO | ||
29 | + * @date: 2024/4/1 14:19 | ||
30 | + * @version: 1.0 | ||
31 | + */ | ||
32 | +@Slf4j | ||
33 | +@Component | ||
34 | +public class Science4JournalArticlePageProcessor implements PageProcessor { | ||
35 | + private String agent = Agent.getRandom(); | ||
36 | + | ||
37 | + // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | ||
38 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | ||
39 | + | ||
40 | + /** | ||
41 | + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | ||
42 | + * | ||
43 | + * @param page | ||
44 | + */ | ||
45 | + @Override | ||
46 | + public void process(Page page) { | ||
47 | + doArticleContent(page); | ||
48 | + } | ||
49 | + | ||
50 | + /** | ||
51 | + * @param page | ||
52 | + */ | ||
53 | + private void doArticleContent(Page page) { | ||
54 | + | ||
55 | + //解析页面 | ||
56 | + Html html = page.getHtml(); | ||
57 | + String articleCode = page.getUrl().get(); | ||
58 | + Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header"); | ||
59 | + | ||
60 | + String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get(); | ||
61 | + | ||
62 | + String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get(); | ||
63 | + | ||
64 | + String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get(); | ||
65 | + Date publishTimeDateTime = null; | ||
66 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
67 | + | ||
68 | + try { | ||
69 | + publishTimeDateTime = formatter.parse(publishTime); | ||
70 | + } catch (ParseException e) { | ||
71 | + e.printStackTrace(); | ||
72 | + } | ||
73 | + List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes(); | ||
74 | + StringBuffer authorName = new StringBuffer(); | ||
75 | + for (Selectable node : authorNodes) { | ||
76 | + authorName.append(node.xpath("//a/span/text()").get()).append(" "); | ||
77 | + } | ||
78 | + | ||
79 | + | ||
80 | + JSONArray authorEmail = new JSONArray(); | ||
81 | + List<Selectable> authorEmailSelectables = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes(); | ||
82 | + for (Selectable authorEmailSelectable : authorEmailSelectables) { | ||
83 | + String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); | ||
84 | + String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); | ||
85 | + String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get(); | ||
86 | + | ||
87 | + JSONObject jsonObject = new JSONObject(); | ||
88 | + jsonObject.put("authorEmailName", givenName + "" + familyName); | ||
89 | + jsonObject.put("email", email); | ||
90 | + authorEmail.add(jsonObject); | ||
91 | + } | ||
92 | + log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorEmail.toJSONString()); | ||
93 | + | ||
94 | + page.putField("article", ArticleDO.builder() | ||
95 | + .articleType(ArticleTypeEnum.SCIENCE.getType()) | ||
96 | + .articleCode(articleCode) | ||
97 | + .authorName(authorName.toString()) | ||
98 | + .title(title) | ||
99 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | ||
100 | + .emailInfo(authorEmail.toJSONString()) | ||
101 | + .articleDesc(articleDesc) | ||
102 | + .authorAddress(null) | ||
103 | + .referenceInfo(null).build()); | ||
104 | + } | ||
105 | + | ||
106 | + @Override | ||
107 | + public Site getSite() { | ||
108 | + return site; | ||
109 | + } | ||
110 | + | ||
111 | + public static void main(String[] args) { | ||
112 | + // 创建一个Spider,并把我们的处理器放进去 | ||
113 | + Spider.create(new Science4JournalArticlePageProcessor()) | ||
114 | + // 添加这个Spider要爬取的网页地址 | ||
115 | + .addUrl("https://www.science.org/journal/science/insights?startPage=0") | ||
116 | + .addPipeline(new ArticlePipeline()) | ||
117 | + // 开启5个线程执行,并开始爬取 | ||
118 | + .thread(5).run(); | ||
119 | + } | ||
120 | +} | ||
0 | \ No newline at end of file | 121 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | ||
2 | + | ||
3 | +import com.canrd.webmagic.common.utils.DateUtil; | ||
4 | +import com.canrd.webmagic.common.utils.KeywordUtil; | ||
5 | +import com.canrd.webmagic.processor.config.Agent; | ||
6 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
7 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | ||
8 | +import lombok.extern.slf4j.Slf4j; | ||
9 | +import org.springframework.stereotype.Component; | ||
10 | +import us.codecraft.webmagic.Page; | ||
11 | +import us.codecraft.webmagic.Site; | ||
12 | +import us.codecraft.webmagic.Spider; | ||
13 | +import us.codecraft.webmagic.processor.PageProcessor; | ||
14 | +import us.codecraft.webmagic.selector.Selectable; | ||
15 | +import us.codecraft.webmagic.selector.XpathSelector; | ||
16 | + | ||
17 | +import javax.annotation.Resource; | ||
18 | +import java.text.ParseException; | ||
19 | +import java.text.SimpleDateFormat; | ||
20 | +import java.util.Date; | ||
21 | +import java.util.List; | ||
22 | +import java.util.Locale; | ||
23 | + | ||
24 | +/** | ||
25 | + * @author: xms | ||
26 | + * @description: TODO | ||
27 | + * @date: 2024/4/1 14:19 | ||
28 | + * @version: 1.0 | ||
29 | + */ | ||
30 | +@Slf4j | ||
31 | +@Component | ||
32 | +public class Science4JournalSearchPageProcessor implements PageProcessor { | ||
33 | + | ||
34 | + @Resource | ||
35 | + private Science4JournalArticlePageProcessor science4JournalArticlePageProcessor; | ||
36 | + | ||
37 | + @Resource | ||
38 | + private SeleniumDownloader seleniumDownloader; | ||
39 | + | ||
40 | + @Resource | ||
41 | + private ArticlePipeline articlePipeline; | ||
42 | + | ||
43 | + /** | ||
44 | + * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | ||
45 | + */ | ||
46 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | ||
47 | + | ||
48 | + /** | ||
49 | + * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | ||
50 | + * | ||
51 | + * @param page | ||
52 | + */ | ||
53 | + @Override | ||
54 | + public void process(Page page) { | ||
55 | + doArticleList(page); | ||
56 | + } | ||
57 | + | ||
58 | + /** | ||
59 | + * @param page | ||
60 | + */ | ||
61 | + private void doArticleList(Page page) { | ||
62 | + String url = page.getUrl().get(); | ||
63 | + /** | ||
64 | + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容 | ||
65 | + * 1、通过$或css()方法获取到该page html下某元素dom | ||
66 | + */ | ||
67 | + Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']")); | ||
68 | + List<Selectable> nodes = selectable.nodes(); | ||
69 | + | ||
70 | + /** | ||
71 | + * 获取到指定的dom后,从这些dom中提取元素内容。 | ||
72 | + */ | ||
73 | + for (int i = 0; i <= nodes.size() - 1; i++) { | ||
74 | + String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get(); | ||
75 | + String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get(); | ||
76 | + String link = nodes.get(0).links().get(); | ||
77 | + if (!KeywordUtil.containKeywordsInTitle(title)) { | ||
78 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | ||
79 | + try { | ||
80 | + Date publishTimeDateTime = formatter.parse(time); | ||
81 | + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) { | ||
82 | +// page.addTargetRequest(link); | ||
83 | + Spider.create(science4JournalArticlePageProcessor) | ||
84 | + .addUrl(link) | ||
85 | + .addPipeline(articlePipeline) | ||
86 | + .setDownloader(seleniumDownloader) | ||
87 | + // 开启5个线程执行,并开始爬取 | ||
88 | + .thread(1).run(); | ||
89 | + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | ||
90 | + } | ||
91 | + } catch (ParseException e) { | ||
92 | + e.printStackTrace(); | ||
93 | + } | ||
94 | + | ||
95 | + } | ||
96 | + } | ||
97 | + | ||
98 | + } | ||
99 | + | ||
100 | + @Override | ||
101 | + public Site getSite() { | ||
102 | + return site; | ||
103 | + } | ||
104 | + | ||
105 | + public static void main(String[] args) { | ||
106 | + // 创建一个Spider,并把我们的处理器放进去 | ||
107 | + Spider.create(new Science4JournalSearchPageProcessor()) | ||
108 | + // 添加这个Spider要爬取的网页地址 | ||
109 | + .addUrl("https://www.science.org/journal/science/insights?startPage=0") | ||
110 | + .addPipeline(new ArticlePipeline()) | ||
111 | + // 开启5个线程执行,并开始爬取 | ||
112 | + .thread(5).run(); | ||
113 | + } | ||
114 | +} | ||
0 | \ No newline at end of file | 115 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/processor/config/Downloader.java renamed to src/main/java/com/canrd/webmagic/processor/download/Downloader.java
1 | -package com.canrd.webmagic.processor.config; | 1 | +package com.canrd.webmagic.processor.download; |
2 | 2 | ||
3 | import lombok.extern.slf4j.Slf4j; | 3 | import lombok.extern.slf4j.Slf4j; |
4 | import org.springframework.beans.factory.annotation.Autowired; | 4 | import org.springframework.beans.factory.annotation.Autowired; |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
0 → 100644
1 | +package com.canrd.webmagic.processor.download; | ||
2 | + | ||
3 | +import lombok.extern.slf4j.Slf4j; | ||
4 | +import org.openqa.selenium.By; | ||
5 | +import org.openqa.selenium.Cookie; | ||
6 | +import org.openqa.selenium.WebDriver; | ||
7 | +import org.openqa.selenium.WebElement; | ||
8 | +import org.springframework.stereotype.Component; | ||
9 | +import us.codecraft.webmagic.Page; | ||
10 | +import us.codecraft.webmagic.Request; | ||
11 | +import us.codecraft.webmagic.Site; | ||
12 | +import us.codecraft.webmagic.Task; | ||
13 | +import us.codecraft.webmagic.downloader.AbstractDownloader; | ||
14 | +import us.codecraft.webmagic.selector.Html; | ||
15 | +import us.codecraft.webmagic.selector.PlainText; | ||
16 | + | ||
17 | +import javax.annotation.Resource; | ||
18 | +import java.util.Map; | ||
19 | + | ||
20 | +/** | ||
21 | + * @author: xms | ||
22 | + * @description: TODO | ||
23 | + * @date: 2024/4/26 16:36 | ||
24 | + * @version: 1.0 | ||
25 | + */ | ||
26 | +@Slf4j | ||
27 | +@Component | ||
28 | +public class SeleniumDownloader extends AbstractDownloader { | ||
29 | + private int sleepTime = 0; | ||
30 | + | ||
31 | + @Resource | ||
32 | + private WebDriver webDriver; | ||
33 | + | ||
34 | + /** | ||
35 | + * set sleep time to wait until load success | ||
36 | + * | ||
37 | + * @param sleepTime sleepTime | ||
38 | + * @return this | ||
39 | + */ | ||
40 | + public SeleniumDownloader setSleepTime(int sleepTime) { | ||
41 | + this.sleepTime = sleepTime; | ||
42 | + return this; | ||
43 | + } | ||
44 | + | ||
45 | + @Override | ||
46 | + public Page download(Request request, Task task) { | ||
47 | + Page page = Page.fail(); | ||
48 | + try { | ||
49 | + | ||
50 | + | ||
51 | + log.info("downloading page " + request.getUrl()); | ||
52 | + webDriver.get(request.getUrl()); | ||
53 | + try { | ||
54 | + if (sleepTime > 0) { | ||
55 | + Thread.sleep(sleepTime); | ||
56 | + } | ||
57 | + } catch (InterruptedException e) { | ||
58 | + e.printStackTrace(); | ||
59 | + } | ||
60 | + WebDriver.Options manage = webDriver.manage(); | ||
61 | + Site site = task.getSite(); | ||
62 | + if (site.getCookies() != null) { | ||
63 | + for (Map.Entry<String, String> cookieEntry : site.getCookies() | ||
64 | + .entrySet()) { | ||
65 | + Cookie cookie = new Cookie(cookieEntry.getKey(), | ||
66 | + cookieEntry.getValue()); | ||
67 | + manage.addCookie(cookie); | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + /* | ||
72 | + * TODO You can add mouse event or other processes | ||
73 | + * | ||
74 | + * @author: bob.li.0718@gmail.com | ||
75 | + */ | ||
76 | + try { | ||
77 | + //休眠3秒就是为了动态的数据渲染完成后在进行获取 | ||
78 | + Thread.sleep(30000); | ||
79 | + } catch (InterruptedException e) { | ||
80 | + throw new RuntimeException(e); | ||
81 | + } | ||
82 | + WebElement webElement = webDriver.findElement(By.xpath("/html")); | ||
83 | + String content = webElement.getAttribute("outerHTML"); | ||
84 | + page.setDownloadSuccess(true); | ||
85 | + page.setRawText(content); | ||
86 | + page.setHtml(new Html(content, request.getUrl())); | ||
87 | + page.setUrl(new PlainText(request.getUrl())); | ||
88 | + page.setRequest(request); | ||
89 | + onSuccess(request, task); | ||
90 | + } catch (Exception e) { | ||
91 | + log.warn("download page {} error", request.getUrl(), e); | ||
92 | + onError(request, task, e); | ||
93 | + } finally { | ||
94 | + | ||
95 | + } | ||
96 | + return page; | ||
97 | + } | ||
98 | + | ||
99 | + @Override | ||
100 | + public void setThread(int i) { | ||
101 | + | ||
102 | + } | ||
103 | +} |
src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
@@ -14,7 +14,7 @@ import java.util.List; | @@ -14,7 +14,7 @@ import java.util.List; | ||
14 | import java.util.Objects; | 14 | import java.util.Objects; |
15 | 15 | ||
16 | @Component | 16 | @Component |
17 | -public class NatureArticlePipeline implements Pipeline { | 17 | +public class ArticlePipeline implements Pipeline { |
18 | 18 | ||
19 | private ArticleService articleService; | 19 | private ArticleService articleService; |
20 | 20 |
src/main/resources/user-agent/User-Agents.txt
1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 | 1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 |
2 | -Opera/8.0 (Windows NT 5.1; U; en) | ||
3 | Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50 | 2 | Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50 |
4 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 | 3 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 |
5 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 | 4 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 |
6 | -Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10 | ||
7 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 | 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 |
8 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 | 6 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 |
9 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 | 7 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 |
@@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C | @@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C | ||
19 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 | 17 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 |
20 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) | 18 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) |
21 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 | 19 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 |
22 | -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | ||
23 | \ No newline at end of file | 20 | \ No newline at end of file |
21 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | ||
22 | +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 | ||
23 | +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0 | ||
24 | \ No newline at end of file | 24 | \ No newline at end of file |