Commit af0bbdcecf9eb4a2de40475ab8709eaf61ae82d4
1 parent: 47bdaf78
feat: nature article爬取
Showing 11 changed files with 157 additions and 61 deletions
sql/webmagic.sql
1 | -DROP TABLE IF EXISTS `nature_article`; | ||
2 | -CREATE TABLE `nature_article` ( | 1 | +DROP TABLE IF EXISTS `article`; |
2 | +CREATE TABLE `article` ( | ||
3 | `id` bigint NOT NULL AUTO_INCREMENT, | 3 | `id` bigint NOT NULL AUTO_INCREMENT, |
4 | + `article_type` varchar(32) DEFAULT NULL COMMENT '文章类型', | ||
4 | `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识', | 5 | `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识', |
5 | `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称', | 6 | `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称', |
6 | `title` varchar(256) DEFAULT NULL COMMENT '文章标题', | 7 | `title` varchar(256) DEFAULT NULL COMMENT '文章标题', |
7 | `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间', | 8 | `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间', |
8 | `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息', | 9 | `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息', |
10 | + `article_desc` text DEFAULT NULL COMMENT '文章摘要', | ||
11 | + `author_address` text DEFAULT NULL COMMENT '作者地址信息', | ||
12 | + `reference_info` text DEFAULT NULL COMMENT '相关文章引用信息', | ||
9 | PRIMARY KEY (`id`) | 13 | PRIMARY KEY (`id`) |
10 | -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息'; | ||
11 | - | ||
12 | - | ||
13 | -alter table `nature_article` add column `article_code` varchar(256) DEFAULT NULL COMMENT '文章标识'; | ||
14 | \ No newline at end of file | 14 | \ No newline at end of file |
15 | +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='文章信息'; | ||
15 | \ No newline at end of file | 16 | \ No newline at end of file |
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java renamed to src/main/java/com/canrd/webmagic/controller/ArticleController.java
@@ -6,7 +6,7 @@ import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | @@ -6,7 +6,7 @@ import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | ||
6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | 8 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; |
9 | -import com.canrd.webmagic.service.NatureArticleService; | 9 | +import com.canrd.webmagic.service.ArticleService; |
10 | import org.springframework.validation.annotation.Validated; | 10 | import org.springframework.validation.annotation.Validated; |
11 | import org.springframework.web.bind.annotation.*; | 11 | import org.springframework.web.bind.annotation.*; |
12 | import us.codecraft.webmagic.Spider; | 12 | import us.codecraft.webmagic.Spider; |
@@ -21,12 +21,12 @@ import javax.annotation.Resource; | @@ -21,12 +21,12 @@ import javax.annotation.Resource; | ||
21 | */ | 21 | */ |
22 | @RestController | 22 | @RestController |
23 | @RequestMapping("/nature/article") | 23 | @RequestMapping("/nature/article") |
24 | -public class NatureArticleController { | 24 | +public class ArticleController { |
25 | /** | 25 | /** |
26 | * 服务对象 | 26 | * 服务对象 |
27 | */ | 27 | */ |
28 | @Resource | 28 | @Resource |
29 | - private NatureArticleService natureArticleService; | 29 | + private ArticleService articleService; |
30 | 30 | ||
31 | @Resource | 31 | @Resource |
32 | private NatureSearchPageProcessor natureSearchPageProcessor; | 32 | private NatureSearchPageProcessor natureSearchPageProcessor; |
@@ -60,7 +60,7 @@ public class NatureArticleController { | @@ -60,7 +60,7 @@ public class NatureArticleController { | ||
60 | */ | 60 | */ |
61 | @PostMapping("/list") | 61 | @PostMapping("/list") |
62 | public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) { | 62 | public ServerResult list(@RequestBody @Validated({OperateGroup.List.class}) NatureArticleQueryVO natureArticleQueryVO) { |
63 | - return natureArticleService.list(natureArticleQueryVO); | 63 | + return articleService.list(natureArticleQueryVO); |
64 | } | 64 | } |
65 | 65 | ||
66 | /** | 66 | /** |
@@ -71,7 +71,7 @@ public class NatureArticleController { | @@ -71,7 +71,7 @@ public class NatureArticleController { | ||
71 | */ | 71 | */ |
72 | @PostMapping("/query_by_id") | 72 | @PostMapping("/query_by_id") |
73 | public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | 73 | public ServerResult queryById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { |
74 | - return natureArticleService.queryById(natureArticleQueryVO); | 74 | + return articleService.queryById(natureArticleQueryVO); |
75 | } | 75 | } |
76 | 76 | ||
77 | /** | 77 | /** |
@@ -82,7 +82,7 @@ public class NatureArticleController { | @@ -82,7 +82,7 @@ public class NatureArticleController { | ||
82 | */ | 82 | */ |
83 | @PostMapping("/add") | 83 | @PostMapping("/add") |
84 | public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) { | 84 | public ServerResult add(@RequestBody NatureArticleVO natureArticleVO) { |
85 | - return natureArticleService.add(natureArticleVO); | 85 | + return articleService.add(natureArticleVO); |
86 | } | 86 | } |
87 | 87 | ||
88 | /** | 88 | /** |
@@ -93,7 +93,7 @@ public class NatureArticleController { | @@ -93,7 +93,7 @@ public class NatureArticleController { | ||
93 | */ | 93 | */ |
94 | @PostMapping("/edit") | 94 | @PostMapping("/edit") |
95 | public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) { | 95 | public ServerResult edit(@RequestBody NatureArticleVO natureArticleVO) { |
96 | - return natureArticleService.edit(natureArticleVO); | 96 | + return articleService.edit(natureArticleVO); |
97 | } | 97 | } |
98 | 98 | ||
99 | /** | 99 | /** |
@@ -104,7 +104,7 @@ public class NatureArticleController { | @@ -104,7 +104,7 @@ public class NatureArticleController { | ||
104 | */ | 104 | */ |
105 | @PostMapping("/delete_by_id") | 105 | @PostMapping("/delete_by_id") |
106 | public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { | 106 | public ServerResult deleteById(@RequestBody NatureArticleQueryVO natureArticleQueryVO) { |
107 | - return natureArticleService.deleteById(natureArticleQueryVO); | 107 | + return articleService.deleteById(natureArticleQueryVO); |
108 | } | 108 | } |
109 | 109 | ||
110 | } | 110 | } |
src/main/java/com/canrd/webmagic/domain/ArticleTypeEnum.java
0 → 100644
1 | +package com.canrd.webmagic.domain; | ||
2 | + | ||
3 | +import lombok.AllArgsConstructor; | ||
4 | +import lombok.Getter; | ||
5 | +import lombok.NoArgsConstructor; | ||
6 | + | ||
7 | +/** | ||
8 | + * @author: xms | ||
9 | + * @description: TODO | ||
10 | + * @date: 2024/4/11 16:52 | ||
11 | + * @version: 1.0 | ||
12 | + */ | ||
13 | +@Getter | ||
14 | +@AllArgsConstructor | ||
15 | +@NoArgsConstructor | ||
16 | +public enum ArticleTypeEnum { | ||
17 | + NATURE("nature", "nature网址"), | ||
18 | + ; | ||
19 | + private String type; | ||
20 | + private String desc; | ||
21 | + | ||
22 | +} |
src/main/java/com/canrd/webmagic/domain/dto/NatureArticleDO.java renamed to src/main/java/com/canrd/webmagic/domain/dto/ArticleDO.java
@@ -12,17 +12,23 @@ import java.io.Serializable; | @@ -12,17 +12,23 @@ import java.io.Serializable; | ||
12 | * @author makejava | 12 | * @author makejava |
13 | * @since 2024-04-07 18:39:38 | 13 | * @since 2024-04-07 18:39:38 |
14 | */ | 14 | */ |
15 | -@TableName("nature_article") | 15 | +@TableName("article") |
16 | @Data | 16 | @Data |
17 | @AllArgsConstructor | 17 | @AllArgsConstructor |
18 | @ToString | 18 | @ToString |
19 | @NoArgsConstructor | 19 | @NoArgsConstructor |
20 | @EqualsAndHashCode(callSuper = false) | 20 | @EqualsAndHashCode(callSuper = false) |
21 | @SuperBuilder | 21 | @SuperBuilder |
22 | -public class NatureArticleDO implements Serializable { | 22 | +public class ArticleDO implements Serializable { |
23 | private static final long serialVersionUID = 890672868109538541L; | 23 | private static final long serialVersionUID = 890672868109538541L; |
24 | 24 | ||
25 | private Long id; | 25 | private Long id; |
26 | + | ||
27 | + /** | ||
28 | + * 文章类型:ArticleTypeEnum | ||
29 | + */ | ||
30 | + private String articleType; | ||
31 | + | ||
26 | /** | 32 | /** |
27 | * 作者名称 | 33 | * 作者名称 |
28 | */ | 34 | */ |
@@ -48,4 +54,19 @@ public class NatureArticleDO implements Serializable { | @@ -48,4 +54,19 @@ public class NatureArticleDO implements Serializable { | ||
48 | */ | 54 | */ |
49 | private String emailInfo; | 55 | private String emailInfo; |
50 | 56 | ||
57 | + /** | ||
58 | + * 文章摘要 | ||
59 | + */ | ||
60 | + private String articleDesc; | ||
61 | + | ||
62 | + /** | ||
63 | + * 作者地址 | ||
64 | + */ | ||
65 | + private String authorAddress; | ||
66 | + | ||
67 | + /** | ||
68 | + * 相关文章引用信息 | ||
69 | + */ | ||
70 | + private String referenceInfo; | ||
71 | + | ||
51 | } | 72 | } |
src/main/java/com/canrd/webmagic/mapper/NatureArticleMapper.java renamed to src/main/java/com/canrd/webmagic/mapper/ArticleMapper.java
1 | package com.canrd.webmagic.mapper; | 1 | package com.canrd.webmagic.mapper; |
2 | 2 | ||
3 | import com.baomidou.mybatisplus.core.mapper.BaseMapper; | 3 | import com.baomidou.mybatisplus.core.mapper.BaseMapper; |
4 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | 4 | +import com.canrd.webmagic.domain.dto.ArticleDO; |
5 | 5 | ||
6 | /** | 6 | /** |
7 | * nature-文章信息(NatureArticle)表数据库访问层 | 7 | * nature-文章信息(NatureArticle)表数据库访问层 |
@@ -9,7 +9,7 @@ import com.canrd.webmagic.domain.dto.NatureArticleDO; | @@ -9,7 +9,7 @@ import com.canrd.webmagic.domain.dto.NatureArticleDO; | ||
9 | * @author makejava | 9 | * @author makejava |
10 | * @since 2024-04-07 18:39:47 | 10 | * @since 2024-04-07 18:39:47 |
11 | */ | 11 | */ |
12 | -public interface NatureArticleMapper extends BaseMapper<NatureArticleDO> { | 12 | +public interface ArticleMapper extends BaseMapper<ArticleDO> { |
13 | 13 | ||
14 | 14 | ||
15 | } | 15 | } |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -2,10 +2,13 @@ package com.canrd.webmagic.processor; | @@ -2,10 +2,13 @@ package com.canrd.webmagic.processor; | ||
2 | 2 | ||
3 | import com.alibaba.fastjson.JSONArray; | 3 | import com.alibaba.fastjson.JSONArray; |
4 | import com.alibaba.fastjson.JSONObject; | 4 | import com.alibaba.fastjson.JSONObject; |
5 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; | ||
5 | import com.canrd.webmagic.common.utils.StringUtils; | 6 | import com.canrd.webmagic.common.utils.StringUtils; |
6 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | 7 | +import com.canrd.webmagic.domain.ArticleTypeEnum; |
8 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
7 | import com.canrd.webmagic.processor.config.Agent; | 9 | import com.canrd.webmagic.processor.config.Agent; |
8 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | 10 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; |
11 | +import lombok.extern.slf4j.Slf4j; | ||
9 | import org.springframework.stereotype.Component; | 12 | import org.springframework.stereotype.Component; |
10 | import us.codecraft.webmagic.Page; | 13 | import us.codecraft.webmagic.Page; |
11 | import us.codecraft.webmagic.Site; | 14 | import us.codecraft.webmagic.Site; |
@@ -15,8 +18,10 @@ import us.codecraft.webmagic.selector.Html; | @@ -15,8 +18,10 @@ import us.codecraft.webmagic.selector.Html; | ||
15 | import us.codecraft.webmagic.selector.Selectable; | 18 | import us.codecraft.webmagic.selector.Selectable; |
16 | import us.codecraft.webmagic.selector.XpathSelector; | 19 | import us.codecraft.webmagic.selector.XpathSelector; |
17 | 20 | ||
21 | +import java.util.ArrayList; | ||
18 | import java.util.List; | 22 | import java.util.List; |
19 | import java.util.Objects; | 23 | import java.util.Objects; |
24 | +import java.util.stream.Collectors; | ||
20 | 25 | ||
21 | /** | 26 | /** |
22 | * @author: xms | 27 | * @author: xms |
@@ -24,12 +29,13 @@ import java.util.Objects; | @@ -24,12 +29,13 @@ import java.util.Objects; | ||
24 | * @date: 2024/4/1 14:19 | 29 | * @date: 2024/4/1 14:19 |
25 | * @version: 1.0 | 30 | * @version: 1.0 |
26 | */ | 31 | */ |
32 | +@Slf4j | ||
27 | @Component | 33 | @Component |
28 | public class NatureSearchPageProcessor implements PageProcessor { | 34 | public class NatureSearchPageProcessor implements PageProcessor { |
29 | private String agent = Agent.getRandom(); | 35 | private String agent = Agent.getRandom(); |
30 | 36 | ||
31 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | 37 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
32 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(agent); | 38 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); |
33 | 39 | ||
34 | /** | 40 | /** |
35 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | 41 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 |
@@ -42,17 +48,16 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -42,17 +48,16 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
42 | doArticleList(page); | 48 | doArticleList(page); |
43 | } else if (page.getUrl().get().contains("research-articles")) { | 49 | } else if (page.getUrl().get().contains("research-articles")) { |
44 | doArticleList4ReSearch(page); | 50 | doArticleList4ReSearch(page); |
45 | - }else { | 51 | + } else { |
46 | doArticleContent(page); | 52 | doArticleContent(page); |
47 | } | 53 | } |
48 | 54 | ||
49 | } | 55 | } |
50 | 56 | ||
51 | /** | 57 | /** |
52 | - * | ||
53 | * @param page | 58 | * @param page |
54 | */ | 59 | */ |
55 | - private void doArticleList4ReSearch(Page page){ | 60 | + private void doArticleList4ReSearch(Page page) { |
56 | String url = page.getUrl().get(); | 61 | String url = page.getUrl().get(); |
57 | String[] split = url.split("="); | 62 | String[] split = url.split("="); |
58 | Integer pageIndex = Integer.parseInt(split[split.length - 1]); | 63 | Integer pageIndex = Integer.parseInt(split[split.length - 1]); |
@@ -79,17 +84,23 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -79,17 +84,23 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
79 | } | 84 | } |
80 | 85 | ||
81 | private void doArticleContent(Page page) { | 86 | private void doArticleContent(Page page) { |
87 | + if (page.getUrl().get().contains("redirect") || !page.getUrl().get().contains("nature")) { | ||
88 | + return; | ||
89 | + } | ||
82 | //解析页面 | 90 | //解析页面 |
83 | Html html = page.getHtml(); | 91 | Html html = page.getHtml(); |
84 | String[] urlArr = page.getUrl().get().split("/"); | 92 | String[] urlArr = page.getUrl().get().split("/"); |
85 | String articleCode = urlArr[urlArr.length - 1]; | 93 | String articleCode = urlArr[urlArr.length - 1]; |
86 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); | 94 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); |
87 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); | 95 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); |
96 | + Selectable referencesSelectable = html.xpath("//ol[@class='c-article-references']").select(new XpathSelector("li[@class='c-article-references__item js-c-reading-companion-references-item']")); | ||
97 | + Selectable authorAddressSelectable = html.xpath("//ol[@class='c-article-author-affiliation__list']").select(new XpathSelector("li")); | ||
88 | 98 | ||
89 | String title = headSelectable.xpath("//div/h1/text()").get(); | 99 | String title = headSelectable.xpath("//div/h1/text()").get(); |
90 | if (StringUtils.isBlank(title)) { | 100 | if (StringUtils.isBlank(title)) { |
91 | title = headSelectable.xpath("//h1/text()").get(); | 101 | title = headSelectable.xpath("//h1/text()").get(); |
92 | } | 102 | } |
103 | + String articleDesc = html.xpath("//div[@class='c-article-section__content']/p/text()").get(); | ||
93 | String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); | 104 | String publishTime = headSelectable.xpath("//ul").nodes().get(0).xpath("//li").nodes().get(2).xpath("//li/time/text()").get(); |
94 | Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); | 105 | Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1).select(new XpathSelector("li[@class='c-article-author-list__item']")); |
95 | List<Selectable> authorNodes = authorSelectable.nodes(); | 106 | List<Selectable> authorNodes = authorSelectable.nodes(); |
@@ -97,7 +108,41 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -97,7 +108,41 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
97 | for (Selectable node : authorNodes) { | 108 | for (Selectable node : authorNodes) { |
98 | authorName.append(node.xpath("//a/text()")); | 109 | authorName.append(node.xpath("//a/text()")); |
99 | } | 110 | } |
100 | - JSONArray array = new JSONArray(); | 111 | + |
112 | + JSONArray authorAddress = new JSONArray(); | ||
113 | + List<Selectable> authorAddressList = authorAddressSelectable.nodes(); | ||
114 | + if (CollectionUtils.isNotEmpty(authorAddressList)) { | ||
115 | + for (Selectable selectable : authorAddressList) { | ||
116 | + String address = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__address']/text()").get(); | ||
117 | + String authorNames = selectable.xpath("//p").xpath("//p[@class='c-article-author-affiliation__authors-list']/text()").get(); | ||
118 | + JSONObject object = new JSONObject(); | ||
119 | + object.put("address", address); | ||
120 | + object.put("authorNames", authorNames); | ||
121 | + authorAddress.add(object); | ||
122 | + } | ||
123 | + } | ||
124 | + | ||
125 | + JSONArray references = new JSONArray(); | ||
126 | + List<Selectable> referenceList = referencesSelectable.nodes(); | ||
127 | + if (CollectionUtils.isNotEmpty(referenceList)) { | ||
128 | + for (Selectable reference : referenceList) { | ||
129 | + String referenceTitle = reference.xpath("//p").xpath("//p[@class='c-article-references__text']/text()").get(); | ||
130 | + List<Selectable> referenceLinks = reference.xpath("//p").xpath("//p[@class='c-article-references__links u-hide-print']").links().nodes(); | ||
131 | + List<String> links = new ArrayList<>(); | ||
132 | + if (CollectionUtils.isNotEmpty(referenceLinks)) { | ||
133 | + links = referenceLinks.stream().map(x -> x.get()).collect(Collectors.toList()); | ||
134 | + } | ||
135 | + JSONObject object = new JSONObject(); | ||
136 | + object.put("referenceTitle", referenceTitle); | ||
137 | + object.put("links", links); | ||
138 | + if (CollectionUtils.isNotEmpty(links)) { | ||
139 | + page.addTargetRequests(links.stream().filter(x -> x.contains("nature")).collect(Collectors.toList())); | ||
140 | + } | ||
141 | + references.add(object); | ||
142 | + } | ||
143 | + } | ||
144 | + | ||
145 | + JSONArray authorEmail = new JSONArray(); | ||
101 | for (Selectable authorEmailSelectable : authorEmailSelectables) { | 146 | for (Selectable authorEmailSelectable : authorEmailSelectables) { |
102 | String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); | 147 | String[] split = authorEmailSelectable.xpath("//a").links().get().split(":"); |
103 | String email = Objects.isNull(split) ? "" : split[split.length - 1]; | 148 | String email = Objects.isNull(split) ? "" : split[split.length - 1]; |
@@ -105,16 +150,21 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -105,16 +150,21 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
105 | JSONObject jsonObject = new JSONObject(); | 150 | JSONObject jsonObject = new JSONObject(); |
106 | jsonObject.put("authorEmailName", authorEmailName); | 151 | jsonObject.put("authorEmailName", authorEmailName); |
107 | jsonObject.put("email", email); | 152 | jsonObject.put("email", email); |
108 | - array.add(jsonObject); | 153 | + authorEmail.add(jsonObject); |
109 | } | 154 | } |
110 | - System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); | 155 | + System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + authorEmail.toJSONString()); |
156 | + | ||
111 | 157 | ||
112 | - page.putField("article", NatureArticleDO.builder() | 158 | + page.putField("article", ArticleDO.builder() |
159 | + .articleType(ArticleTypeEnum.NATURE.getType()) | ||
113 | .articleCode(articleCode) | 160 | .articleCode(articleCode) |
114 | .authorName(authorName.toString()) | 161 | .authorName(authorName.toString()) |
115 | .title(title) | 162 | .title(title) |
116 | .publishTime(publishTime) | 163 | .publishTime(publishTime) |
117 | - .emailInfo(array.toJSONString()).build()); | 164 | + .emailInfo(authorEmail.toJSONString()) |
165 | + .articleDesc(articleDesc) | ||
166 | + .authorAddress(authorAddress.toJSONString()) | ||
167 | + .referenceInfo(references.toJSONString()).build()); | ||
118 | } | 168 | } |
119 | 169 | ||
120 | private void doArticleList(Page page) { | 170 | private void doArticleList(Page page) { |
@@ -152,7 +202,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | @@ -152,7 +202,7 @@ public class NatureSearchPageProcessor implements PageProcessor { | ||
152 | // 创建一个Spider,并把我们的处理器放进去 | 202 | // 创建一个Spider,并把我们的处理器放进去 |
153 | Spider.create(new NatureSearchPageProcessor()) | 203 | Spider.create(new NatureSearchPageProcessor()) |
154 | // 添加这个Spider要爬取的网页地址 | 204 | // 添加这个Spider要爬取的网页地址 |
155 | - .addUrl("https://www.nature.com/search?q=battery&page=1") | 205 | + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1") |
156 | .addPipeline(new NatureArticlePipeline()) | 206 | .addPipeline(new NatureArticlePipeline()) |
157 | // 开启5个线程执行,并开始爬取 | 207 | // 开启5个线程执行,并开始爬取 |
158 | .thread(5).run(); | 208 | .thread(5).run(); |
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
@@ -31,7 +31,7 @@ public class UpdateIp { | @@ -31,7 +31,7 @@ public class UpdateIp { | ||
31 | @Autowired | 31 | @Autowired |
32 | private RedisTemplate redisTemplate; | 32 | private RedisTemplate redisTemplate; |
33 | 33 | ||
34 | - @Scheduled(cron = "*/20 * * * * ?") | 34 | +// @Scheduled(cron = "*/20 * * * * ?") |
35 | void update() { | 35 | void update() { |
36 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); | 36 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); |
37 | for (String ip : range) { | 37 | for (String ip : range) { |
@@ -42,7 +42,7 @@ public class UpdateIp { | @@ -42,7 +42,7 @@ public class UpdateIp { | ||
42 | } | 42 | } |
43 | } | 43 | } |
44 | 44 | ||
45 | - @Scheduled(cron = "*/15 * * * * ?") | 45 | +// @Scheduled(cron = "*/15 * * * * ?") |
46 | void ips() { | 46 | void ips() { |
47 | String string = null; | 47 | String string = null; |
48 | try { | 48 | try { |
src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java
1 | package com.canrd.webmagic.processor.pipeline; | 1 | package com.canrd.webmagic.processor.pipeline; |
2 | 2 | ||
3 | import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; | 3 | import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; |
4 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | ||
5 | -import com.canrd.webmagic.service.NatureArticleService; | 4 | +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; |
5 | +import com.canrd.webmagic.domain.dto.ArticleDO; | ||
6 | +import com.canrd.webmagic.service.ArticleService; | ||
6 | import org.springframework.beans.factory.annotation.Autowired; | 7 | import org.springframework.beans.factory.annotation.Autowired; |
7 | import org.springframework.stereotype.Component; | 8 | import org.springframework.stereotype.Component; |
8 | import us.codecraft.webmagic.ResultItems; | 9 | import us.codecraft.webmagic.ResultItems; |
9 | import us.codecraft.webmagic.Task; | 10 | import us.codecraft.webmagic.Task; |
10 | import us.codecraft.webmagic.pipeline.Pipeline; | 11 | import us.codecraft.webmagic.pipeline.Pipeline; |
11 | 12 | ||
13 | +import java.util.List; | ||
12 | import java.util.Objects; | 14 | import java.util.Objects; |
13 | 15 | ||
14 | @Component | 16 | @Component |
15 | public class NatureArticlePipeline implements Pipeline { | 17 | public class NatureArticlePipeline implements Pipeline { |
16 | 18 | ||
17 | - private NatureArticleService natureArticleService; | 19 | + private ArticleService articleService; |
18 | 20 | ||
19 | @Autowired | 21 | @Autowired |
20 | - public void setNatureArticleService(NatureArticleService natureArticleService) { | ||
21 | - this.natureArticleService = natureArticleService; | 22 | + public void setNatureArticleService(ArticleService articleService) { |
23 | + this.articleService = articleService; | ||
22 | } | 24 | } |
23 | 25 | ||
24 | @Override | 26 | @Override |
25 | public void process(ResultItems resultItems, Task task) { | 27 | public void process(ResultItems resultItems, Task task) { |
26 | - NatureArticleDO articleDO = resultItems.get("article"); | 28 | + ArticleDO articleDO = resultItems.get("article"); |
27 | if (Objects.nonNull(articleDO)) { | 29 | if (Objects.nonNull(articleDO)) { |
28 | - NatureArticleDO natureArticleDO = natureArticleService.getOne(new LambdaQueryWrapper<NatureArticleDO>().eq(NatureArticleDO::getArticleCode, articleDO.getArticleCode())); | ||
29 | - if (Objects.nonNull(natureArticleDO)) { | 30 | + List<ArticleDO> natureArticleDO = articleService.list(new LambdaQueryWrapper<ArticleDO>().eq(ArticleDO::getArticleCode, articleDO.getArticleCode())); |
31 | + if (CollectionUtils.isNotEmpty(natureArticleDO)) { | ||
30 | return; | 32 | return; |
31 | } | 33 | } |
32 | - natureArticleService.save(articleDO); | 34 | + articleService.save(articleDO); |
33 | } | 35 | } |
34 | } | 36 | } |
35 | } | 37 | } |
src/main/java/com/canrd/webmagic/service/NatureArticleService.java renamed to src/main/java/com/canrd/webmagic/service/ArticleService.java
@@ -2,7 +2,7 @@ package com.canrd.webmagic.service; | @@ -2,7 +2,7 @@ package com.canrd.webmagic.service; | ||
2 | 2 | ||
3 | import com.baomidou.mybatisplus.extension.service.IService; | 3 | import com.baomidou.mybatisplus.extension.service.IService; |
4 | import com.canrd.webmagic.common.constant.ServerResult; | 4 | import com.canrd.webmagic.common.constant.ServerResult; |
5 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | 5 | +import com.canrd.webmagic.domain.dto.ArticleDO; |
6 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
7 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 7 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
8 | 8 | ||
@@ -12,7 +12,7 @@ import com.canrd.webmagic.domain.vo.NatureArticleVO; | @@ -12,7 +12,7 @@ import com.canrd.webmagic.domain.vo.NatureArticleVO; | ||
12 | * @author makejava | 12 | * @author makejava |
13 | * @since 2024-04-07 18:39:48 | 13 | * @since 2024-04-07 18:39:48 |
14 | */ | 14 | */ |
15 | -public interface NatureArticleService extends IService<NatureArticleDO> { | 15 | +public interface ArticleService extends IService<ArticleDO> { |
16 | 16 | ||
17 | /** | 17 | /** |
18 | * 通过ID查询单条数据 | 18 | * 通过ID查询单条数据 |
src/main/java/com/canrd/webmagic/service/impl/NatureArticleServiceImpl.java renamed to src/main/java/com/canrd/webmagic/service/impl/ArticleServiceImpl.java
@@ -5,11 +5,11 @@ import cn.hutool.core.collection.CollUtil; | @@ -5,11 +5,11 @@ import cn.hutool.core.collection.CollUtil; | ||
5 | import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; | 5 | import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; |
6 | import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; | 6 | import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; |
7 | import com.canrd.webmagic.common.constant.ServerResult; | 7 | import com.canrd.webmagic.common.constant.ServerResult; |
8 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | 8 | +import com.canrd.webmagic.domain.dto.ArticleDO; |
9 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 9 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
10 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 10 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
11 | -import com.canrd.webmagic.mapper.NatureArticleMapper; | ||
12 | -import com.canrd.webmagic.service.NatureArticleService; | 11 | +import com.canrd.webmagic.mapper.ArticleMapper; |
12 | +import com.canrd.webmagic.service.ArticleService; | ||
13 | import lombok.extern.slf4j.Slf4j; | 13 | import lombok.extern.slf4j.Slf4j; |
14 | import org.springframework.stereotype.Service; | 14 | import org.springframework.stereotype.Service; |
15 | 15 | ||
@@ -24,7 +24,7 @@ import java.util.Objects; | @@ -24,7 +24,7 @@ import java.util.Objects; | ||
24 | */ | 24 | */ |
25 | @Slf4j | 25 | @Slf4j |
26 | @Service | 26 | @Service |
27 | -public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, NatureArticleDO> implements NatureArticleService { | 27 | +public class ArticleServiceImpl extends ServiceImpl<ArticleMapper, ArticleDO> implements ArticleService { |
28 | 28 | ||
29 | 29 | ||
30 | /** | 30 | /** |
@@ -39,11 +39,11 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | @@ -39,11 +39,11 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | ||
39 | if (Objects.isNull(natureArticleQueryVO.getId())) { | 39 | if (Objects.isNull(natureArticleQueryVO.getId())) { |
40 | return ServerResult.fail("id 不能为空"); | 40 | return ServerResult.fail("id 不能为空"); |
41 | } | 41 | } |
42 | - NatureArticleDO NatureArticleDo = getById(natureArticleQueryVO.getId()); | ||
43 | - if (Objects.isNull(NatureArticleDo)) { | 42 | + ArticleDO articleDo = getById(natureArticleQueryVO.getId()); |
43 | + if (Objects.isNull(articleDo)) { | ||
44 | return ServerResult.success(null); | 44 | return ServerResult.success(null); |
45 | } | 45 | } |
46 | - return ServerResult.success(BeanUtil.copyProperties(NatureArticleDo, NatureArticleVO.class)); | 46 | + return ServerResult.success(BeanUtil.copyProperties(articleDo, NatureArticleVO.class)); |
47 | } | 47 | } |
48 | 48 | ||
49 | /** | 49 | /** |
@@ -69,9 +69,9 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | @@ -69,9 +69,9 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | ||
69 | if (Objects.nonNull(natureArticleVO.getId())) { | 69 | if (Objects.nonNull(natureArticleVO.getId())) { |
70 | natureArticleVO.setId(null); | 70 | natureArticleVO.setId(null); |
71 | } | 71 | } |
72 | - NatureArticleDO natureArticleDo = BeanUtil.copyProperties(natureArticleVO, NatureArticleDO.class); | 72 | + ArticleDO articleDo = BeanUtil.copyProperties(natureArticleVO, ArticleDO.class); |
73 | 73 | ||
74 | - save(natureArticleDo); | 74 | + save(articleDo); |
75 | 75 | ||
76 | return ServerResult.success(); | 76 | return ServerResult.success(); |
77 | } | 77 | } |
@@ -88,9 +88,9 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | @@ -88,9 +88,9 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | ||
88 | if (Objects.isNull(natureArticleVO.getId())) { | 88 | if (Objects.isNull(natureArticleVO.getId())) { |
89 | return ServerResult.fail("id 不能为空"); | 89 | return ServerResult.fail("id 不能为空"); |
90 | } | 90 | } |
91 | - NatureArticleDO natureArticleDo = BeanUtil.copyProperties(natureArticleVO, NatureArticleDO.class); | 91 | + ArticleDO articleDo = BeanUtil.copyProperties(natureArticleVO, ArticleDO.class); |
92 | 92 | ||
93 | - updateById(natureArticleDo); | 93 | + updateById(articleDo); |
94 | 94 | ||
95 | return ServerResult.success(); | 95 | return ServerResult.success(); |
96 | } | 96 | } |
@@ -107,13 +107,13 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | @@ -107,13 +107,13 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N | ||
107 | if (CollUtil.isEmpty(ids)) { | 107 | if (CollUtil.isEmpty(ids)) { |
108 | return ServerResult.fail("ids 参数不能为空"); | 108 | return ServerResult.fail("ids 参数不能为空"); |
109 | } | 109 | } |
110 | - List<NatureArticleDO> natureArticleList = listByIds(ids); | 110 | + List<ArticleDO> natureArticleList = listByIds(ids); |
111 | if (CollUtil.isEmpty(natureArticleList)) { | 111 | if (CollUtil.isEmpty(natureArticleList)) { |
112 | return ServerResult.success(); | 112 | return ServerResult.success(); |
113 | } | 113 | } |
114 | //todo 校验是否可以逻辑删除 | 114 | //todo 校验是否可以逻辑删除 |
115 | - LambdaUpdateWrapper<NatureArticleDO> updateWrapper = new LambdaUpdateWrapper<NatureArticleDO>() | ||
116 | - .in(NatureArticleDO::getId, ids); | 115 | + LambdaUpdateWrapper<ArticleDO> updateWrapper = new LambdaUpdateWrapper<ArticleDO>() |
116 | + .in(ArticleDO::getId, ids); | ||
117 | update(updateWrapper); | 117 | update(updateWrapper); |
118 | return ServerResult.success(); | 118 | return ServerResult.success(); |
119 | } | 119 | } |
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
@@ -4,8 +4,8 @@ import com.alibaba.fastjson.JSON; | @@ -4,8 +4,8 @@ import com.alibaba.fastjson.JSON; | ||
4 | import com.alibaba.fastjson.JSONArray; | 4 | import com.alibaba.fastjson.JSONArray; |
5 | import com.alibaba.fastjson.JSONObject; | 5 | import com.alibaba.fastjson.JSONObject; |
6 | import com.canrd.webmagic.BaseTest; | 6 | import com.canrd.webmagic.BaseTest; |
7 | -import com.canrd.webmagic.domain.dto.NatureArticleDO; | ||
8 | -import com.canrd.webmagic.service.NatureArticleService; | 7 | +import com.canrd.webmagic.domain.dto.ArticleDO; |
8 | +import com.canrd.webmagic.service.ArticleService; | ||
9 | import org.junit.Test; | 9 | import org.junit.Test; |
10 | 10 | ||
11 | import javax.annotation.Resource; | 11 | import javax.annotation.Resource; |
@@ -20,13 +20,13 @@ import java.util.List; | @@ -20,13 +20,13 @@ import java.util.List; | ||
20 | public class DateTimeUtilTest extends BaseTest { | 20 | public class DateTimeUtilTest extends BaseTest { |
21 | 21 | ||
22 | @Resource | 22 | @Resource |
23 | - private NatureArticleService natureArticleService; | 23 | + private ArticleService articleService; |
24 | 24 | ||
25 | @Test | 25 | @Test |
26 | public void export() { | 26 | public void export() { |
27 | - List<NatureArticleDO> articleDOList = natureArticleService.list(); | 27 | + List<ArticleDO> articleDOList = articleService.list(); |
28 | JSONArray array = new JSONArray(); | 28 | JSONArray array = new JSONArray(); |
29 | - for (NatureArticleDO articleDO : articleDOList) { | 29 | + for (ArticleDO articleDO : articleDOList) { |
30 | JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); | 30 | JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); |
31 | array.addAll(jsonArray); | 31 | array.addAll(jsonArray); |
32 | } | 32 | } |