Commit a24924df1d1fad46ddee4053e191bf6ac4731396
1 parent
8f304c93
feat:
1、selenium 整合 2、science 网站
Showing
13 changed files
with
557 additions
and
19 deletions
pom.xml
... | ... | @@ -47,6 +47,7 @@ |
47 | 47 | <jjwt.version>0.10.6</jjwt.version> |
48 | 48 | <easyexcel.version>2.2.3</easyexcel.version> |
49 | 49 | <webmagic.version>0.10.0</webmagic.version> |
50 | + <selenium.version>3.4.0</selenium.version> | |
50 | 51 | </properties> |
51 | 52 | |
52 | 53 | <dependencies> |
... | ... | @@ -78,6 +79,12 @@ |
78 | 79 | <version>${webmagic.version}</version> |
79 | 80 | </dependency> |
80 | 81 | |
82 | + <!-- selenium --> | |
83 | + <dependency> | |
84 | + <groupId>org.seleniumhq.selenium</groupId> | |
85 | + <artifactId>selenium-java</artifactId> | |
86 | + <version>${selenium.version}</version> | |
87 | + </dependency> | |
81 | 88 | |
82 | 89 | <!-- Lombok 依赖--> |
83 | 90 | <dependency> | ... | ... |
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
0 → 100644
1 | +package com.canrd.webmagic.config; | |
2 | + | |
3 | +import com.canrd.webmagic.processor.config.Agent; | |
4 | +import org.openqa.selenium.WebDriver; | |
5 | +import org.openqa.selenium.chrome.ChromeDriver; | |
6 | +import org.openqa.selenium.chrome.ChromeOptions; | |
7 | +import org.springframework.context.annotation.Bean; | |
8 | +import org.springframework.context.annotation.Configuration; | |
9 | + | |
10 | +/** | |
11 | + * Spring configuration that provides the Selenium {@link WebDriver} bean. | |
12 | + * | |
13 | + * @author: xms | |
14 | + * @date: 2024/4/26 14:37 | |
15 | + * @version: 1.0 | |
16 | + */ | |
17 | +@Configuration | |
18 | +public class SeleniumConfig { | |
19 | + | |
20 | +    /** System property that Selenium reads to locate the chromedriver executable. */ | |
21 | +    private static final String DRIVER_PROPERTY = "webdriver.chrome.driver"; | |
22 | + | |
23 | +    /** Fallback driver location, applied only when the property has not been set already. */ | |
24 | +    private static final String DEFAULT_DRIVER_PATH = "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"; | |
25 | + | |
26 | +    /** | |
27 | +     * Creates the Chrome-backed driver. | |
28 | +     * <p> | |
29 | +     * A value supplied from outside (for example {@code -Dwebdriver.chrome.driver=...}) wins over the | |
30 | +     * built-in default path, and {@code quit} is registered as the destroy method so the browser | |
31 | +     * session is ended when the Spring context shuts down. | |
32 | +     * | |
33 | +     * @return a new {@link ChromeDriver} | |
34 | +     */ | |
35 | +    @Bean(destroyMethod = "quit") | |
36 | +    public WebDriver webDriver() { | |
37 | +        if (System.getProperty(DRIVER_PROPERTY) == null) { | |
38 | +            System.setProperty(DRIVER_PROPERTY, DEFAULT_DRIVER_PATH); | |
39 | +        } | |
40 | +        return new ChromeDriver(); | |
41 | +    } | |
42 | +} | ... | ... |
src/main/java/com/canrd/webmagic/controller/ArticleController.java
... | ... | @@ -5,8 +5,8 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | -import com.canrd.webmagic.processor.config.Downloader; | |
9 | -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | |
8 | +import com.canrd.webmagic.processor.download.Downloader; | |
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
10 | 10 | import com.canrd.webmagic.service.ArticleService; |
11 | 11 | import org.springframework.validation.annotation.Validated; |
12 | 12 | import org.springframework.web.bind.annotation.*; |
... | ... | @@ -33,7 +33,7 @@ public class ArticleController { |
33 | 33 | private NatureSearchPageProcessor natureSearchPageProcessor; |
34 | 34 | |
35 | 35 | @Resource |
36 | - private NatureArticlePipeline articlePipeline; | |
36 | + private ArticlePipeline articlePipeline; | |
37 | 37 | |
38 | 38 | @Resource |
39 | 39 | private Downloader downloader; |
... | ... | @@ -49,7 +49,7 @@ public class ArticleController { |
49 | 49 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) |
50 | 50 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
51 | 51 | .addPipeline(articlePipeline) |
52 | - .setDownloader(downloader.newIpDownloader()) | |
52 | +// .setDownloader(downloader.newIpDownloader()) | |
53 | 53 | // 开启5个线程执行,并开始爬取 |
54 | 54 | .thread(5).run(); |
55 | 55 | } | ... | ... |
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
0 → 100644
1 | +package com.canrd.webmagic.controller; | |
2 | + | |
3 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | +import com.canrd.webmagic.common.jsr303.OperateGroup; | |
5 | +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | |
6 | +import com.canrd.webmagic.domain.vo.NatureArticleVO; | |
7 | +import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; | |
8 | +import com.canrd.webmagic.service.ArticleService; | |
9 | +import org.springframework.validation.annotation.Validated; | |
10 | +import org.springframework.web.bind.annotation.*; | |
11 | +import us.codecraft.webmagic.Spider; | |
12 | + | |
13 | +import javax.annotation.Resource; | |
14 | + | |
15 | +/** | |
16 | + * REST endpoints for the Science journal crawler plus CRUD access to stored articles. | |
17 | + * | |
18 | + * @author makejava | |
19 | + * @since 2024-04-07 18:39:41 | |
20 | + */ | |
21 | +@RestController | |
22 | +@RequestMapping("/science/journal") | |
23 | +public class Science4JournalController { | |
24 | + | |
25 | +    /** Listing URL prefix; the page index is appended. Uses HTTPS like the processors' own entry points. */ | |
26 | +    private static final String INSIGHTS_URL = "https://www.science.org/journal/science/insights?startPage="; | |
27 | + | |
28 | +    /** Service that the CRUD endpoints delegate to. */ | |
29 | +    @Resource | |
30 | +    private ArticleService articleService; | |
31 | + | |
32 | +    /** Page processor handed to every spider created by {@link #start}. */ | |
33 | +    @Resource | |
34 | +    private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; | |
35 | + | |
36 | +    /** | |
37 | +     * Crawls the listing pages {@code 0..indexSize} (both inclusive), one spider run per page. | |
38 | +     * | |
39 | +     * @param keyword   required request parameter; currently not used by the crawl | |
40 | +     * @param indexSize index of the last listing page to crawl | |
41 | +     * @return {@code ServerResult.success()} once the loop over all pages has finished | |
42 | +     */ | |
43 | +    @GetMapping("/start") | |
44 | +    public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | |
45 | +        for (int i = 0; i <= indexSize; i++) { | |
46 | +            Spider.create(science4JournalSearchPageProcessor) | |
47 | +                    .addUrl(INSIGHTS_URL + i) | |
48 | +                    // use 5 threads and start crawling | |
49 | +                    .thread(5).run(); | |
50 | +        } | |
51 | + | |
52 | +        return ServerResult.success(); | |
53 | +    } | |
54 | + | |
55 | +    /** | |
56 | +     * Paged query. | |
57 | +     * | |
58 | +     * @param natureArticleQueryVO query conditions | |
59 | +     * @return query result | |
60 | +     */ | |
61 | +    @PostMapping("/list") | |
62 | +    public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) { | |
63 | +        return articleService.list(natureArticleQueryVO); | |
64 | +    } | |
65 | + | |
66 | +    /** | |
67 | +     * Looks up a single record by primary key. | |
68 | +     * | |
69 | +     * @param natureArticleQueryVO query conditions | |
70 | +     * @return the single record | |
71 | +     */ | |
72 | +    @PostMapping("/query_by_id") | |
73 | +    public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | |
74 | +        return articleService.queryById(natureArticleQueryVO); | |
75 | +    } | |
76 | + | |
77 | +    /** | |
78 | +     * Adds a record. | |
79 | +     * | |
80 | +     * @param natureArticleVO data VO | |
81 | +     * @return result of the insert | |
82 | +     */ | |
83 | +    @PostMapping("/add") | |
84 | +    public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) { | |
85 | +        return articleService.add(natureArticleVO); | |
86 | +    } | |
87 | + | |
88 | +    /** | |
89 | +     * Edits a record. | |
90 | +     * | |
91 | +     * @param natureArticleVO data VO | |
92 | +     * @return result of the edit | |
93 | +     */ | |
94 | +    @PostMapping("/edit") | |
95 | +    public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) { | |
96 | +        return articleService.edit(natureArticleVO); | |
97 | +    } | |
98 | + | |
99 | +    /** | |
100 | +     * Deletes a record. | |
101 | +     * | |
102 | +     * @param natureArticleQueryVO query conditions | |
103 | +     * @return whether the delete succeeded | |
104 | +     */ | |
105 | +    @PostMapping("/delete_by_id") | |
106 | +    public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | |
107 | +        return articleService.deleteById(natureArticleQueryVO); | |
108 | +    } | |
109 | + | |
110 | +} | |
111 | + | ... | ... |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
src/main/java/com/canrd/webmagic/job/NatureJob.java
0 → 100644
1 | +package com.canrd.webmagic.job; | |
2 | + | |
3 | +import com.canrd.webmagic.common.utils.KeywordUtil; | |
4 | +import com.canrd.webmagic.processor.NatureSearchPageProcessor; | |
5 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
6 | +import org.springframework.scheduling.annotation.Scheduled; | |
7 | +import org.springframework.stereotype.Component; | |
8 | +import us.codecraft.webmagic.Spider; | |
9 | + | |
10 | +import javax.annotation.Resource; | |
11 | + | |
12 | +/** | |
13 | + * @author: xms | |
14 | + * @description: TODO | |
15 | + * @date: 2024/4/26 10:06 | |
16 | + * @version: 1.0 | |
17 | + */ | |
18 | +@Component | |
19 | +public class NatureJob { | |
20 | + | |
21 | + @Resource | |
22 | + private NatureSearchPageProcessor natureSearchPageProcessor; | |
23 | + | |
24 | + @Resource | |
25 | + private ArticlePipeline articlePipeline; | |
26 | + | |
27 | + /** | |
28 | + * 每天凌晨执行一次 | |
29 | + */ | |
30 | +// @Scheduled(cron = "*/20 * * * * ?") | |
31 | + @Scheduled(cron = "0 0 0 * * ?") | |
32 | + public void executeByDay() { | |
33 | + for (String keyword : KeywordUtil.getKeyWordList()) { | |
34 | + Spider.create(natureSearchPageProcessor) | |
35 | + // 添加这个Spider要爬取的网页地址 | |
36 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 1) | |
37 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 2) | |
38 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + 3) | |
39 | + .addPipeline(articlePipeline) | |
40 | +// .setDownloader(downloader.newIpDownloader()) | |
41 | + // 开启5个线程执行,并开始爬取 | |
42 | + .thread(5).run(); | |
43 | + } | |
44 | + } | |
45 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -3,12 +3,13 @@ package com.canrd.webmagic.processor; |
3 | 3 | import com.alibaba.fastjson.JSONArray; |
4 | 4 | import com.alibaba.fastjson.JSONObject; |
5 | 5 | import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; |
6 | +import com.canrd.webmagic.common.utils.DateUtil; | |
6 | 7 | import com.canrd.webmagic.common.utils.KeywordUtil; |
7 | 8 | import com.canrd.webmagic.common.utils.StringUtils; |
8 | 9 | import com.canrd.webmagic.domain.ArticleTypeEnum; |
9 | 10 | import com.canrd.webmagic.domain.dto.ArticleDO; |
10 | 11 | import com.canrd.webmagic.processor.config.Agent; |
11 | -import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | |
12 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
12 | 13 | import lombok.extern.slf4j.Slf4j; |
13 | 14 | import org.springframework.stereotype.Component; |
14 | 15 | import us.codecraft.webmagic.Page; |
... | ... | @@ -19,9 +20,9 @@ import us.codecraft.webmagic.selector.Html; |
19 | 20 | import us.codecraft.webmagic.selector.Selectable; |
20 | 21 | import us.codecraft.webmagic.selector.XpathSelector; |
21 | 22 | |
22 | -import java.util.ArrayList; | |
23 | -import java.util.List; | |
24 | -import java.util.Objects; | |
23 | +import java.text.ParseException; | |
24 | +import java.text.SimpleDateFormat; | |
25 | +import java.util.*; | |
25 | 26 | import java.util.stream.Collectors; |
26 | 27 | |
27 | 28 | /** |
... | ... | @@ -104,6 +105,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
104 | 105 | } |
105 | 106 | String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); |
106 | 107 | String publishTime; |
108 | + Date publishTimeDateTime = null; | |
107 | 109 | try { |
108 | 110 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); |
109 | 111 | } catch (Exception e) { |
... | ... | @@ -113,6 +115,13 @@ public class NatureSearchPageProcessor implements PageProcessor { |
113 | 115 | publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(0).xpath("//li/time/text()").get(); |
114 | 116 | } |
115 | 117 | } |
118 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
119 | + | |
120 | + try { | |
121 | + publishTimeDateTime = formatter.parse(publishTime); | |
122 | + } catch (ParseException e) { | |
123 | + e.printStackTrace(); | |
124 | + } | |
116 | 125 | Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); |
117 | 126 | List<Selectable> authorNodes = authorSelectable.nodes(); |
118 | 127 | StringBuffer authorName = new StringBuffer(); |
... | ... | @@ -170,7 +179,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
170 | 179 | .articleCode(articleCode) |
171 | 180 | .authorName(authorName.toString()) |
172 | 181 | .title(title) |
173 | - .publishTime(publishTime) | |
182 | + .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
174 | 183 | .emailInfo(authorEmail.toJSONString()) |
175 | 184 | .articleDesc(articleDesc) |
176 | 185 | .authorAddress(authorAddress.toJSONString()) |
... | ... | @@ -191,14 +200,25 @@ public class NatureSearchPageProcessor implements PageProcessor { |
191 | 200 | /** |
192 | 201 | * 获取到指定的dom后,从这些dom中提取元素内容。 |
193 | 202 | */ |
194 | - for (int i = 1; i <= nodes.size() - 1; i++) { | |
203 | + for (int i = 0; i <= nodes.size() - 1; i++) { | |
195 | 204 | Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0).$(".u-full-height").select(new XpathSelector("a[@class='c-card__link u-link-inherit']")).nodes().get(0); |
196 | - String link = node.$("a", "href").get(); | |
205 | + String link = node.links().get(); | |
197 | 206 | String title = node.$("a", "text").get(); |
198 | 207 | if (KeywordUtil.containKeywordsInTitle(title)) { |
199 | - page.addTargetRequest(link); | |
200 | - log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | |
208 | + String publishTime = nodes.get(i).xpath("//div[@class='c-card__section c-meta']/time/text()").get(); | |
209 | + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
210 | + try { | |
211 | + Date publishTimeDateTime = formatter.parse(publishTime); | |
212 | + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) { | |
213 | + page.addTargetRequest(link); | |
214 | + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | |
215 | + } | |
216 | + } catch (ParseException e) { | |
217 | + e.printStackTrace(); | |
218 | + } | |
219 | + | |
201 | 220 | } |
221 | + | |
202 | 222 | } |
203 | 223 | } |
204 | 224 | |
... | ... | @@ -212,7 +232,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
212 | 232 | Spider.create(new NatureSearchPageProcessor()) |
213 | 233 | // 添加这个Spider要爬取的网页地址 |
214 | 234 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") |
215 | - .addPipeline(new NatureArticlePipeline()) | |
235 | + .addPipeline(new ArticlePipeline()) | |
216 | 236 | // 开启5个线程执行,并开始爬取 |
217 | 237 | .thread(5).run(); |
218 | 238 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSONArray; | |
4 | +import com.alibaba.fastjson.JSONObject; | |
5 | +import com.canrd.webmagic.common.utils.DateUtil; | |
6 | +import com.canrd.webmagic.domain.ArticleTypeEnum; | |
7 | +import com.canrd.webmagic.domain.dto.ArticleDO; | |
8 | +import com.canrd.webmagic.processor.config.Agent; | |
9 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
10 | +import lombok.extern.slf4j.Slf4j; | |
11 | +import org.springframework.stereotype.Component; | |
12 | +import us.codecraft.webmagic.Page; | |
13 | +import us.codecraft.webmagic.Site; | |
14 | +import us.codecraft.webmagic.Spider; | |
15 | +import us.codecraft.webmagic.processor.PageProcessor; | |
16 | +import us.codecraft.webmagic.selector.Html; | |
17 | +import us.codecraft.webmagic.selector.Selectable; | |
18 | + | |
19 | +import java.text.ParseException; | |
20 | +import java.text.SimpleDateFormat; | |
21 | +import java.util.Date; | |
22 | +import java.util.List; | |
23 | +import java.util.Locale; | |
24 | +import java.util.Objects; | |
25 | + | |
26 | +/** | |
27 | + * Page processor for a single Science article page. | |
28 | + * <p> | |
29 | + * Reads title, body text, publish date, authors and author e-mails from the page and stores them | |
30 | + * under the {@code article} result field as an {@link ArticleDO}. | |
31 | + * | |
32 | + * @author: xms | |
33 | + * @date: 2024/4/1 14:19 | |
34 | + * @version: 1.0 | |
35 | + */ | |
36 | +@Slf4j | |
37 | +@Component | |
38 | +public class Science4JournalArticlePageProcessor implements PageProcessor { | |
39 | + | |
40 | +    /** XPath of the header container that the title, date and contributor fields are read from. */ | |
41 | +    private static final String HEADER_XPATH = "//div[@class='article-container']/article/header/div"; | |
42 | + | |
43 | +    /** Crawl settings: 3 retries, sleep time 100, user agent taken from Agent.getRandom(). */ | |
44 | +    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | |
45 | + | |
46 | +    /** | |
47 | +     * Entry point called for each downloaded page; delegates to the article extraction. | |
48 | +     * | |
49 | +     * @param page the downloaded article page | |
50 | +     */ | |
51 | +    @Override | |
52 | +    public void process(Page page) { | |
53 | +        doArticleContent(page); | |
54 | +    } | |
55 | + | |
56 | +    /** | |
57 | +     * Extracts the article fields from the page and publishes them as the {@code article} result field. | |
58 | +     * | |
59 | +     * @param page the downloaded article page | |
60 | +     */ | |
61 | +    private void doArticleContent(Page page) { | |
62 | +        Html html = page.getHtml(); | |
63 | +        String articleCode = page.getUrl().get(); | |
64 | +        Selectable header = html.xpath(HEADER_XPATH); | |
65 | + | |
66 | +        String title = header.xpath("//div[@class='core-lede']/div/text()").get(); | |
67 | +        String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get(); | |
68 | +        String publishTime = header.xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get(); | |
69 | +        Date publishTimeDateTime = parsePublishTime(publishTime, articleCode); | |
70 | + | |
71 | +        StringBuilder authorName = new StringBuilder(); | |
72 | +        for (Selectable node : header.xpath("//div[@class='contributors']/span/span/span").nodes()) { | |
73 | +            authorName.append(node.xpath("//a/span/text()").get()).append(" "); | |
74 | +        } | |
75 | + | |
76 | +        JSONArray authorEmail = new JSONArray(); | |
77 | +        List<Selectable> authorEmailSelectables = header.xpath("//div[@class='contributors']").xpath("//span[@class='authors']").xpath("//span[@role='list']").xpath("//span[@property='author']").nodes(); | |
78 | +        for (Selectable authorEmailSelectable : authorEmailSelectables) { | |
79 | +            String givenName = authorEmailSelectable.xpath("//span[@property='givenName']/text()").get(); | |
80 | +            String familyName = authorEmailSelectable.xpath("//span[@property='familyName']/text()").get(); | |
81 | +            String email = authorEmailSelectable.xpath("//a[@property='email']/text()").get(); | |
82 | + | |
83 | +            JSONObject jsonObject = new JSONObject(); | |
84 | +            // NOTE(review): given and family name are joined without a separator - confirm this is intended | |
85 | +            jsonObject.put("authorEmailName", givenName + "" + familyName); | |
86 | +            jsonObject.put("email", email); | |
87 | +            authorEmail.add(jsonObject); | |
88 | +        } | |
89 | +        // the five arguments line up one-to-one with the five placeholders | |
90 | +        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, authorEmail.toJSONString()); | |
91 | + | |
92 | +        page.putField("article", ArticleDO.builder() | |
93 | +                .articleType(ArticleTypeEnum.SCIENCE.getType()) | |
94 | +                .articleCode(articleCode) | |
95 | +                .authorName(authorName.toString()) | |
96 | +                .title(title) | |
97 | +                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE)) | |
98 | +                .emailInfo(authorEmail.toJSONString()) | |
99 | +                .articleDesc(articleDesc) | |
100 | +                .authorAddress(null) | |
101 | +                .referenceInfo(null).build()); | |
102 | +    } | |
103 | + | |
104 | +    /** | |
105 | +     * Parses a date written as English {@code dd MMMM yyyy}. | |
106 | +     * | |
107 | +     * @param publishTime raw date text; may be null when the element was not found | |
108 | +     * @param articleCode article URL, used for logging only | |
109 | +     * @return the parsed date, or null when the text is missing or cannot be parsed | |
110 | +     */ | |
111 | +    private Date parsePublishTime(String publishTime, String articleCode) { | |
112 | +        if (publishTime == null) { | |
113 | +            return null; | |
114 | +        } | |
115 | +        try { | |
116 | +            return new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH).parse(publishTime); | |
117 | +        } catch (ParseException e) { | |
118 | +            log.warn("unparseable publish time '{}' on {}", publishTime, articleCode, e); | |
119 | +            return null; | |
120 | +        } | |
121 | +    } | |
122 | + | |
123 | +    @Override | |
124 | +    public Site getSite() { | |
125 | +        return site; | |
126 | +    } | |
127 | + | |
128 | +    /** Stand-alone entry point that runs this processor outside the Spring context. */ | |
129 | +    public static void main(String[] args) { | |
130 | +        // create a spider around this processor | |
131 | +        Spider.create(new Science4JournalArticlePageProcessor()) | |
132 | +                // seed URL for the spider | |
133 | +                .addUrl("https://www.science.org/journal/science/insights?startPage=0") | |
134 | +                .addPipeline(new ArticlePipeline()) | |
135 | +                // use 5 threads and start crawling | |
136 | +                .thread(5).run(); | |
137 | +    } | |
138 | +} | |
0 | 139 | \ No newline at end of file | ... | ... |
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.canrd.webmagic.common.utils.DateUtil; | |
4 | +import com.canrd.webmagic.common.utils.KeywordUtil; | |
5 | +import com.canrd.webmagic.processor.config.Agent; | |
6 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
7 | +import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | |
8 | +import lombok.extern.slf4j.Slf4j; | |
9 | +import org.springframework.stereotype.Component; | |
10 | +import us.codecraft.webmagic.Page; | |
11 | +import us.codecraft.webmagic.Site; | |
12 | +import us.codecraft.webmagic.Spider; | |
13 | +import us.codecraft.webmagic.processor.PageProcessor; | |
14 | +import us.codecraft.webmagic.selector.Selectable; | |
15 | +import us.codecraft.webmagic.selector.XpathSelector; | |
16 | + | |
17 | +import javax.annotation.Resource; | |
18 | +import java.text.ParseException; | |
19 | +import java.text.SimpleDateFormat; | |
20 | +import java.util.Date; | |
21 | +import java.util.List; | |
22 | +import java.util.Locale; | |
23 | + | |
24 | +/** | |
25 | + * Page processor for the Science "insights" listing page. | |
26 | + * <p> | |
27 | + * For every result card whose title matches the configured keywords and whose date is not before | |
28 | + * 2000-01-01, a nested spider is run on the card's link using the Selenium downloader. | |
29 | + * | |
30 | + * @author: xms | |
31 | + * @date: 2024/4/1 14:19 | |
32 | + * @version: 1.0 | |
33 | + */ | |
34 | +@Slf4j | |
35 | +@Component | |
36 | +public class Science4JournalSearchPageProcessor implements PageProcessor { | |
37 | + | |
38 | +    @Resource | |
39 | +    private Science4JournalArticlePageProcessor science4JournalArticlePageProcessor; | |
40 | + | |
41 | +    @Resource | |
42 | +    private SeleniumDownloader seleniumDownloader; | |
43 | + | |
44 | +    @Resource | |
45 | +    private ArticlePipeline articlePipeline; | |
46 | + | |
47 | +    /** Crawl settings: 3 retries, sleep time 100, user agent taken from Agent.getRandom(). */ | |
48 | +    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | |
49 | + | |
50 | +    /** | |
51 | +     * Entry point called for each downloaded page; delegates to the listing extraction. | |
52 | +     * | |
53 | +     * @param page the downloaded listing page | |
54 | +     */ | |
55 | +    @Override | |
56 | +    public void process(Page page) { | |
57 | +        doArticleList(page); | |
58 | +    } | |
59 | + | |
60 | +    /** | |
61 | +     * Walks the result cards of the listing page and crawls the article behind each accepted card. | |
62 | +     * | |
63 | +     * @param page the downloaded listing page | |
64 | +     */ | |
65 | +    private void doArticleList(Page page) { | |
66 | +        String url = page.getUrl().get(); | |
67 | +        // select every result card inside the search-result container | |
68 | +        Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']")); | |
69 | +        List<Selectable> nodes = selectable.nodes(); | |
70 | + | |
71 | +        // loop-invariant: date pattern used by the listing and the earliest accepted publish date | |
72 | +        SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH); | |
73 | +        Date earliestAccepted = DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)); | |
74 | + | |
75 | +        for (Selectable card : nodes) { | |
76 | +            String title = card.xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get(); | |
77 | +            if (!KeywordUtil.containKeywordsInTitle(title)) { | |
78 | +                continue; | |
79 | +            } | |
80 | +            List<Selectable> metaSpans = card.xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes(); | |
81 | +            // the publish date is read from the third meta span; cards without it are skipped below | |
82 | +            String time = metaSpans.size() > 2 ? metaSpans.get(2).xpath("//time/text()").get() : null; | |
83 | +            // each card contributes its own link | |
84 | +            String link = card.links().get(); | |
85 | +            if (time == null || link == null) { | |
86 | +                log.warn("skip card without publish time or link, list url:{}, title:{}", url, title); | |
87 | +                continue; | |
88 | +            } | |
89 | +            try { | |
90 | +                Date publishTimeDateTime = formatter.parse(time); | |
91 | +                if (!publishTimeDateTime.before(earliestAccepted)) { | |
92 | +                    Spider.create(science4JournalArticlePageProcessor) | |
93 | +                            .addUrl(link) | |
94 | +                            .addPipeline(articlePipeline) | |
95 | +                            .setDownloader(seleniumDownloader) | |
96 | +                            // one thread per nested article spider | |
97 | +                            .thread(1).run(); | |
98 | +                    log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | |
99 | +                } | |
100 | +            } catch (ParseException e) { | |
101 | +                log.warn("unparseable publish time '{}' for {}", time, link, e); | |
102 | +            } | |
103 | +        } | |
104 | +    } | |
105 | + | |
106 | +    @Override | |
107 | +    public Site getSite() { | |
108 | +        return site; | |
109 | +    } | |
110 | + | |
111 | +    /** | |
112 | +     * Stand-alone entry point that runs this processor outside the Spring context. | |
113 | +     * NOTE(review): the instance created here is not injected, so the @Resource fields are presumably null - confirm before relying on it. | |
114 | +     */ | |
115 | +    public static void main(String[] args) { | |
116 | +        // create a spider around this processor | |
117 | +        Spider.create(new Science4JournalSearchPageProcessor()) | |
118 | +                // seed URL for the spider | |
119 | +                .addUrl("https://www.science.org/journal/science/insights?startPage=0") | |
120 | +                .addPipeline(new ArticlePipeline()) | |
121 | +                // use 5 threads and start crawling | |
122 | +                .thread(5).run(); | |
123 | +    } | |
124 | +} | |
0 | 125 | \ No newline at end of file | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/Downloader.java renamed to src/main/java/com/canrd/webmagic/processor/download/Downloader.java
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
0 → 100644
1 | +package com.canrd.webmagic.processor.download; | |
2 | + | |
3 | +import lombok.extern.slf4j.Slf4j; | |
4 | +import org.openqa.selenium.By; | |
5 | +import org.openqa.selenium.Cookie; | |
6 | +import org.openqa.selenium.WebDriver; | |
7 | +import org.openqa.selenium.WebElement; | |
8 | +import org.springframework.stereotype.Component; | |
9 | +import us.codecraft.webmagic.Page; | |
10 | +import us.codecraft.webmagic.Request; | |
11 | +import us.codecraft.webmagic.Site; | |
12 | +import us.codecraft.webmagic.Task; | |
13 | +import us.codecraft.webmagic.downloader.AbstractDownloader; | |
14 | +import us.codecraft.webmagic.selector.Html; | |
15 | +import us.codecraft.webmagic.selector.PlainText; | |
16 | + | |
17 | +import javax.annotation.Resource; | |
18 | +import java.util.Map; | |
19 | + | |
20 | +/** | |
21 | + * WebMagic downloader that loads pages through the injected Selenium {@link WebDriver}. | |
22 | + * | |
23 | + * @author: xms | |
24 | + * @date: 2024/4/26 16:36 | |
25 | + * @version: 1.0 | |
26 | + */ | |
27 | +@Slf4j | |
28 | +@Component | |
29 | +public class SeleniumDownloader extends AbstractDownloader { | |
30 | + | |
31 | +    /** Fixed wait of 30 seconds after navigation so dynamically rendered content is present before the DOM is read. */ | |
32 | +    private static final long RENDER_WAIT_MILLIS = 30000L; | |
33 | + | |
34 | +    /** Optional extra wait in milliseconds applied right after navigation; 0 disables it. */ | |
35 | +    private int sleepTime = 0; | |
36 | + | |
37 | +    @Resource | |
38 | +    private WebDriver webDriver; | |
39 | + | |
40 | +    /** | |
41 | +     * Sets the extra wait applied after navigation. | |
42 | +     * | |
43 | +     * @param sleepTime wait in milliseconds | |
44 | +     * @return this | |
45 | +     */ | |
46 | +    public SeleniumDownloader setSleepTime(int sleepTime) { | |
47 | +        this.sleepTime = sleepTime; | |
48 | +        return this; | |
49 | +    } | |
50 | + | |
51 | +    /** | |
52 | +     * Navigates the browser to the request URL, waits for rendering and returns the page's outer HTML. | |
53 | +     * <p> | |
54 | +     * Synchronized because every call drives the same injected driver instance. | |
55 | +     * | |
56 | +     * @param request request whose URL is loaded | |
57 | +     * @param task    task whose site supplies the cookies | |
58 | +     * @return the downloaded page, or the {@code Page.fail()} result when loading failed or was interrupted | |
59 | +     */ | |
60 | +    @Override | |
61 | +    public synchronized Page download(Request request, Task task) { | |
62 | +        Page page = Page.fail(); | |
63 | +        try { | |
64 | +            log.info("downloading page {}", request.getUrl()); | |
65 | +            webDriver.get(request.getUrl()); | |
66 | +            if (sleepTime > 0) { | |
67 | +                Thread.sleep(sleepTime); | |
68 | +            } | |
69 | +            addSiteCookies(task.getSite()); | |
70 | +            // give client-side rendering time to finish before reading the DOM | |
71 | +            Thread.sleep(RENDER_WAIT_MILLIS); | |
72 | +            WebElement webElement = webDriver.findElement(By.xpath("/html")); | |
73 | +            String content = webElement.getAttribute("outerHTML"); | |
74 | +            page.setDownloadSuccess(true); | |
75 | +            page.setRawText(content); | |
76 | +            page.setHtml(new Html(content, request.getUrl())); | |
77 | +            page.setUrl(new PlainText(request.getUrl())); | |
78 | +            page.setRequest(request); | |
79 | +            onSuccess(request, task); | |
80 | +        } catch (InterruptedException e) { | |
81 | +            // restore the interrupt flag so the calling thread can still observe the interruption | |
82 | +            Thread.currentThread().interrupt(); | |
83 | +            log.warn("download page {} interrupted", request.getUrl(), e); | |
84 | +            onError(request, task, e); | |
85 | +        } catch (Exception e) { | |
86 | +            log.warn("download page {} error", request.getUrl(), e); | |
87 | +            onError(request, task, e); | |
88 | +        } | |
89 | +        return page; | |
90 | +    } | |
91 | + | |
92 | +    /** | |
93 | +     * Copies the cookies configured on the site into the browser session. | |
94 | +     * NOTE(review): this runs after the page was requested and the page is not reloaded, so the cookies cannot affect the current response - confirm intended. | |
95 | +     * | |
96 | +     * @param site site whose cookies are copied; a null cookie map is skipped | |
97 | +     */ | |
98 | +    private void addSiteCookies(Site site) { | |
99 | +        if (site.getCookies() == null) { | |
100 | +            return; | |
101 | +        } | |
102 | +        WebDriver.Options manage = webDriver.manage(); | |
103 | +        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) { | |
104 | +            manage.addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue())); | |
105 | +        } | |
106 | +    } | |
107 | + | |
108 | +    /** | |
109 | +     * Does nothing; the thread count is ignored by this downloader. | |
110 | +     * | |
111 | +     * @param i ignored | |
112 | +     */ | |
113 | +    @Override | |
114 | +    public void setThread(int i) { | |
115 | +        // nothing to configure | |
116 | +    } | |
117 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/ArticlePipeline.java
src/main/resources/user-agent/User-Agents.txt
1 | 1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 |
2 | -Opera/8.0 (Windows NT 5.1; U; en) | |
3 | 2 | Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50 |
4 | 3 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 |
5 | 4 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 |
6 | -Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10 | |
7 | 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 |
8 | 6 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 |
9 | 7 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 |
... | ... | @@ -19,4 +17,6 @@ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C |
19 | 17 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 |
20 | 18 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) |
21 | 19 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 |
22 | -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | |
23 | 20 | \ No newline at end of file |
21 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | |
22 | +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 | |
23 | +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0 | |
24 | 24 | \ No newline at end of file | ... | ... |