Commit 836066f9f49a40d28c54510bc0359dd4db232ed1
1 parent
ea294a6a
nature文章信息爬取,存储db
Showing
11 changed files
with
142 additions
and
50 deletions
sql/webmagic.sql
1 | 1 | DROP TABLE IF EXISTS `nature_article`; |
2 | 2 | CREATE TABLE `nature_article` ( |
3 | 3 | `id` bigint NOT NULL AUTO_INCREMENT, |
4 | + `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识', | |
4 | 5 | `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称', |
5 | 6 | `title` varchar(256) DEFAULT NULL COMMENT '文章标题', |
6 | 7 | `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间', |
7 | 8 | `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息', |
8 | 9 | PRIMARY KEY (`id`) |
9 | -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息'; | |
10 | 10 | \ No newline at end of file |
11 | +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息'; | |
12 | + | |
13 | + | |
14 | +alter table `nature_article` add column `article_code` varchar(256) DEFAULT NULL COMMENT '文章标识'; | |
11 | 15 | \ No newline at end of file | ... | ... |
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
... | ... | @@ -4,12 +4,12 @@ import com.canrd.webmagic.common.constant.ServerResult; |
4 | 4 | import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | +import com.canrd.webmagic.processor.NatureArticlePipeline; | |
8 | +import com.canrd.webmagic.processor.NatureSearchPageProcessor; | |
7 | 9 | import com.canrd.webmagic.service.NatureArticleService; |
8 | 10 | import org.springframework.validation.annotation.Validated; |
9 | -import org.springframework.web.bind.annotation.PostMapping; | |
10 | -import org.springframework.web.bind.annotation.RequestBody; | |
11 | -import org.springframework.web.bind.annotation.RequestMapping; | |
12 | -import org.springframework.web.bind.annotation.RestController; | |
11 | +import org.springframework.web.bind.annotation.*; | |
12 | +import us.codecraft.webmagic.Spider; | |
13 | 13 | |
14 | 14 | import javax.annotation.Resource; |
15 | 15 | |
... | ... | @@ -20,7 +20,7 @@ import javax.annotation.Resource; |
20 | 20 | * @since 2024-04-07 18:39:41 |
21 | 21 | */ |
22 | 22 | @RestController |
23 | -@RequestMapping("/gwms/xxx") | |
23 | +@RequestMapping("/nature/article") | |
24 | 24 | public class NatureArticleController { |
25 | 25 | /** |
26 | 26 | * 服务对象 |
... | ... | @@ -28,6 +28,29 @@ public class NatureArticleController { |
28 | 28 | @Resource |
29 | 29 | private NatureArticleService natureArticleService; |
30 | 30 | |
31 | + @Resource | |
32 | + private NatureSearchPageProcessor natureSearchPageProcessor; | |
33 | + | |
34 | + @Resource | |
35 | + private NatureArticlePipeline articlePipeline; | |
36 | + | |
37 | + /** | |
38 | + * @return | |
39 | + */ | |
40 | + @GetMapping("/start") | |
41 | + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | |
42 | + for (int i = 1; i <= indexSize; i++) { | |
43 | + Spider.create(natureSearchPageProcessor) | |
44 | + // 添加这个Spider要爬取的网页地址 | |
45 | + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) | |
46 | + .addPipeline(articlePipeline) | |
47 | + // 开启5个线程执行,并开始爬取 | |
48 | + .thread(5).run(); | |
49 | + } | |
50 | + | |
51 | + return ServerResult.success(); | |
52 | + } | |
53 | + | |
31 | 54 | /** |
32 | 55 | * 分页查询 |
33 | 56 | * | ... | ... |
src/main/java/com/canrd/webmagic/domain/dto/NatureArticleDO.java
1 | 1 | package com.canrd.webmagic.domain.dto; |
2 | 2 | |
3 | -.dto; | |
4 | - | |
5 | - | |
6 | -import java.io.Serializable; | |
7 | - | |
8 | 3 | import com.baomidou.mybatisplus.annotation.TableName; |
9 | -import com.gree.gaolan.common.dto.BaseDO; | |
10 | 4 | import lombok.*; |
11 | 5 | import lombok.experimental.SuperBuilder; |
12 | 6 | |
7 | +import java.io.Serializable; | |
8 | + | |
13 | 9 | /** |
14 | 10 | * nature-文章信息(NatureArticle)实体类 |
15 | 11 | * |
... | ... | @@ -23,7 +19,7 @@ import lombok.experimental.SuperBuilder; |
23 | 19 | @NoArgsConstructor |
24 | 20 | @EqualsAndHashCode(callSuper = false) |
25 | 21 | @SuperBuilder |
26 | -public class NatureArticleDO extends BaseDO implements Serializable { | |
22 | +public class NatureArticleDO implements Serializable { | |
27 | 23 | private static final long serialVersionUID = 890672868109538541L; |
28 | 24 | |
29 | 25 | private Long id; |
... | ... | @@ -31,14 +27,22 @@ public class NatureArticleDO extends BaseDO implements Serializable { |
31 | 27 | * 作者名称 |
32 | 28 | */ |
33 | 29 | private String authorName; |
30 | + | |
31 | + /** | |
32 | + * 文章标识 | |
33 | + */ | |
34 | + private String articleCode; | |
35 | + | |
34 | 36 | /** |
35 | 37 | * 文章标题 |
36 | 38 | */ |
37 | 39 | private String title; |
40 | + | |
38 | 41 | /** |
39 | 42 | * 发布时间 |
40 | 43 | */ |
41 | 44 | private String publishTime; |
45 | + | |
42 | 46 | /** |
43 | 47 | * 邮箱信息 |
44 | 48 | */ | ... | ... |
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleQueryVO.java
1 | 1 | package com.canrd.webmagic.domain.vo; |
2 | 2 | |
3 | -.vo; | |
4 | - | |
5 | -import java.io.Serializable; | |
6 | - | |
7 | 3 | import lombok.*; |
8 | 4 | import lombok.experimental.SuperBuilder; |
9 | -import com.gree.gaolan.common.vo.common.BasePageVO; | |
10 | 5 | |
11 | -import java.lang.Long; | |
6 | +import java.io.Serializable; | |
12 | 7 | import java.util.List; |
13 | 8 | |
14 | 9 | /** | ... | ... |
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleStartVO.java
0 → 100644
1 | +package com.canrd.webmagic.domain.vo; | |
2 | + | |
3 | +import lombok.*; | |
4 | +import lombok.experimental.SuperBuilder; | |
5 | + | |
6 | +import java.io.Serializable; | |
7 | +import java.util.List; | |
8 | + | |
9 | +/** | |
10 | + * nature-文章信息(NatureArticle)实体类 | |
11 | + * | |
12 | + * @author makejava | |
13 | + * @since 2024-04-07 18:39:41 | |
14 | + */ | |
15 | +@Data | |
16 | +@AllArgsConstructor | |
17 | +@ToString | |
18 | +@NoArgsConstructor | |
19 | +@EqualsAndHashCode(callSuper = false) | |
20 | +@SuperBuilder | |
21 | +public class NatureArticleStartVO implements Serializable { | |
22 | + | |
23 | + /** | |
24 | + * 爬取的总页数 | |
25 | + */ | |
26 | + private Integer indexSize; | |
27 | + | |
28 | + /** | |
29 | + * 爬取的关键字 | |
30 | + */ | |
31 | + private String keyword; | |
32 | +} | |
33 | + | ... | ... |
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleVO.java
src/main/java/com/canrd/webmagic/processor/NatureArticlePipeline.java
0 → 100644
1 | +package com.canrd.webmagic.processor; | |
2 | + | |
3 | +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; | |
4 | +import com.canrd.webmagic.domain.dto.NatureArticleDO; | |
5 | +import com.canrd.webmagic.service.NatureArticleService; | |
6 | +import org.springframework.beans.factory.annotation.Autowired; | |
7 | +import org.springframework.stereotype.Component; | |
8 | +import us.codecraft.webmagic.ResultItems; | |
9 | +import us.codecraft.webmagic.Task; | |
10 | +import us.codecraft.webmagic.pipeline.Pipeline; | |
11 | + | |
12 | +import java.util.Objects; | |
13 | + | |
14 | +@Component | |
15 | +public class NatureArticlePipeline implements Pipeline { | |
16 | + | |
17 | + private NatureArticleService natureArticleService; | |
18 | + | |
19 | + @Autowired | |
20 | + public void setNatureArticleService(NatureArticleService natureArticleService) { | |
21 | + this.natureArticleService = natureArticleService; | |
22 | + } | |
23 | + | |
24 | + @Override | |
25 | + public void process(ResultItems resultItems, Task task) { | |
26 | + NatureArticleDO articleDO = resultItems.get("article"); | |
27 | + if (Objects.nonNull(articleDO)) { | |
28 | + NatureArticleDO natureArticleDO = natureArticleService.getOne(new LambdaQueryWrapper<NatureArticleDO>().eq(NatureArticleDO::getArticleCode, articleDO.getArticleCode())); | |
29 | + if (Objects.nonNull(natureArticleDO)) { | |
30 | + return; | |
31 | + } | |
32 | + natureArticleService.save(articleDO); | |
33 | + } | |
34 | + } | |
35 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -3,6 +3,8 @@ package com.canrd.webmagic.processor; |
3 | 3 | import com.alibaba.fastjson.JSONArray; |
4 | 4 | import com.alibaba.fastjson.JSONObject; |
5 | 5 | import com.canrd.webmagic.common.utils.StringUtils; |
6 | +import com.canrd.webmagic.domain.dto.NatureArticleDO; | |
7 | +import org.springframework.stereotype.Component; | |
6 | 8 | import us.codecraft.webmagic.Page; |
7 | 9 | import us.codecraft.webmagic.Site; |
8 | 10 | import us.codecraft.webmagic.Spider; |
... | ... | @@ -20,6 +22,7 @@ import java.util.Objects; |
20 | 22 | * @date: 2024/4/1 14:19 |
21 | 23 | * @version: 1.0 |
22 | 24 | */ |
25 | +@Component | |
23 | 26 | public class NatureSearchPageProcessor implements PageProcessor { |
24 | 27 | |
25 | 28 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
... | ... | @@ -32,7 +35,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
32 | 35 | */ |
33 | 36 | @Override |
34 | 37 | public void process(Page page) { |
35 | - if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { | |
38 | + if (page.getUrl().get().contains("search")) { | |
36 | 39 | doArticleList(page); |
37 | 40 | } else { |
38 | 41 | doArticleContent(page); |
... | ... | @@ -44,7 +47,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
44 | 47 | //解析页面 |
45 | 48 | Html html = page.getHtml(); |
46 | 49 | String[] urlArr = page.getUrl().get().split("/"); |
47 | - String articleId = urlArr[urlArr.length - 1]; | |
50 | + String articleCode = urlArr[urlArr.length - 1]; | |
48 | 51 | Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); |
49 | 52 | List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); |
50 | 53 | |
... | ... | @@ -69,7 +72,14 @@ public class NatureSearchPageProcessor implements PageProcessor { |
69 | 72 | jsonObject.put("email", email); |
70 | 73 | array.add(jsonObject); |
71 | 74 | } |
72 | - System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); | |
75 | + System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); | |
76 | + | |
77 | + page.putField("article", NatureArticleDO.builder() | |
78 | + .articleCode(articleCode) | |
79 | + .authorName(authorName.toString()) | |
80 | + .title(title) | |
81 | + .publishTime(publishTime) | |
82 | + .emailInfo(array.toJSONString()).build()); | |
73 | 83 | } |
74 | 84 | |
75 | 85 | private void doArticleList(Page page) { |
... | ... | @@ -96,7 +106,6 @@ public class NatureSearchPageProcessor implements PageProcessor { |
96 | 106 | String title = node.$("a", "text").get(); |
97 | 107 | System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); |
98 | 108 | } |
99 | -// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex); | |
100 | 109 | } |
101 | 110 | |
102 | 111 | @Override |
... | ... | @@ -109,6 +118,7 @@ public class NatureSearchPageProcessor implements PageProcessor { |
109 | 118 | Spider.create(new NatureSearchPageProcessor()) |
110 | 119 | // 添加这个Spider要爬取的网页地址 |
111 | 120 | .addUrl("https://www.nature.com/search?q=battery&page=1") |
121 | + .addPipeline(new NatureArticlePipeline()) | |
112 | 122 | // 开启5个线程执行,并开始爬取 |
113 | 123 | .thread(5).run(); |
114 | 124 | } | ... | ... |
src/main/java/com/canrd/webmagic/service/NatureArticleService.java
1 | 1 | package com.canrd.webmagic.service; |
2 | 2 | |
3 | 3 | import com.baomidou.mybatisplus.extension.service.IService; |
4 | +import com.canrd.webmagic.common.constant.ServerResult; | |
4 | 5 | import com.canrd.webmagic.domain.dto.NatureArticleDO; |
5 | -import com.gree.gaolan.common.constant.ServerResult; | |
6 | +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | |
7 | +import com.canrd.webmagic.domain.vo.NatureArticleVO; | |
6 | 8 | |
7 | 9 | /** |
8 | 10 | * nature-文章信息(NatureArticle)表服务接口 | ... | ... |
src/main/java/com/canrd/webmagic/service/impl/NatureArticleServiceImpl.java
1 | 1 | package com.canrd.webmagic.service.impl; |
2 | 2 | |
3 | -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; | |
3 | +import cn.hutool.core.bean.BeanUtil; | |
4 | +import cn.hutool.core.collection.CollUtil; | |
4 | 5 | import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; |
5 | -import com.baomidou.mybatisplus.core.metadata.IPage; | |
6 | -import com.baomidou.mybatisplus.extension.plugins.pagination.Page; | |
7 | 6 | import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; |
7 | +import com.canrd.webmagic.common.constant.ServerResult; | |
8 | 8 | import com.canrd.webmagic.domain.dto.NatureArticleDO; |
9 | -import com.central.common.utils.CopyBeanUtils; | |
10 | -import com.gree.doraemon.base.core.bean.BeanUtil; | |
11 | -import com.gree.doraemon.base.core.collection.CollUtil; | |
12 | -import com.gree.gaolan.common.constant.Constant; | |
13 | -import com.gree.gaolan.common.constant.ServerResult; | |
14 | -import com.gree.gaolan.common.util.PageUtils; | |
9 | +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | |
10 | +import com.canrd.webmagic.domain.vo.NatureArticleVO; | |
11 | +import com.canrd.webmagic.mapper.NatureArticleMapper; | |
12 | +import com.canrd.webmagic.service.NatureArticleService; | |
15 | 13 | import lombok.extern.slf4j.Slf4j; |
16 | 14 | import org.springframework.stereotype.Service; |
17 | 15 | |
... | ... | @@ -56,15 +54,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N |
56 | 54 | */ |
57 | 55 | @Override |
58 | 56 | public ServerResult list(NatureArticleQueryVO natureArticleQueryVO) { |
59 | - | |
60 | - LambdaQueryWrapper<NatureArticleDO> queryWapper = new LambdaQueryWrapper<NatureArticleDO>() | |
61 | - .eq(NatureArticleDO::getEnableFlag, Constant.ENABLE_TEN) | |
62 | - .orderByDesc(NatureArticleDO::getId); | |
63 | - Page page = new Page<>(natureArticleQueryVO.getCurrent(), natureArticleQueryVO.getSize()); | |
64 | - IPage<NatureArticleDO> iPage = page(page, queryWapper); | |
65 | - natureArticleQueryVO.setTotal(Long.valueOf(iPage.getTotal()).intValue()); | |
66 | - List<NatureArticleVO> result = CopyBeanUtils.trans(iPage.getRecords(), NatureArticleVO.class); | |
67 | - return ServerResult.success(PageUtils.getPageReturn(result, natureArticleQueryVO)); | |
57 | + return ServerResult.success(ServerResult.success()); | |
68 | 58 | } |
69 | 59 | |
70 | 60 | /** |
... | ... | @@ -123,8 +113,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N |
123 | 113 | } |
124 | 114 | //todo 校验是否可以逻辑删除 |
125 | 115 | LambdaUpdateWrapper<NatureArticleDO> updateWrapper = new LambdaUpdateWrapper<NatureArticleDO>() |
126 | - .in(NatureArticleDO::getId, ids) | |
127 | - .set(NatureArticleDO::getEnableFlag, Constant.UNABLE_TWENTY); | |
116 | + .in(NatureArticleDO::getId, ids); | |
128 | 117 | update(updateWrapper); |
129 | 118 | return ServerResult.success(); |
130 | 119 | } | ... | ... |
src/main/resources/application-local.yml
... | ... | @@ -59,7 +59,7 @@ spring: |
59 | 59 | testOnReturn: true |
60 | 60 | password: canrd@2024 |
61 | 61 | time-between-eviction-runs-millis: 1000 |
62 | - url: jdbc:mysql://39.108.227.113:3307/order-erp1?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
62 | + url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true | |
63 | 63 | username: root |
64 | 64 | redis: |
65 | 65 | database: 0 | ... | ... |