Commit 836066f9f49a40d28c54510bc0359dd4db232ed1

Authored by qdlgxiemaosheng
1 parent ea294a6a

nature文章信息爬取,存储db

sql/webmagic.sql
1 DROP TABLE IF EXISTS `nature_article`; 1 DROP TABLE IF EXISTS `nature_article`;
2 CREATE TABLE `nature_article` ( 2 CREATE TABLE `nature_article` (
3 `id` bigint NOT NULL AUTO_INCREMENT, 3 `id` bigint NOT NULL AUTO_INCREMENT,
  4 + `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识',
4 `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称', 5 `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称',
5 `title` varchar(256) DEFAULT NULL COMMENT '文章标题', 6 `title` varchar(256) DEFAULT NULL COMMENT '文章标题',
6 `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间', 7 `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间',
7 `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息', 8 `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息',
8 PRIMARY KEY (`id`) 9 PRIMARY KEY (`id`)
9 -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息';  
10 \ No newline at end of file 10 \ No newline at end of file
  11 +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息';
  12 +
  13 +
  14 +alter table `nature_article` add column `article_code` varchar(256) DEFAULT NULL COMMENT '文章标识';
11 \ No newline at end of file 15 \ No newline at end of file
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
@@ -4,12 +4,12 @@ import com.canrd.webmagic.common.constant.ServerResult; @@ -4,12 +4,12 @@ import com.canrd.webmagic.common.constant.ServerResult;
4 import com.canrd.webmagic.common.jsr303.OperateGroup; 4 import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
  7 +import com.canrd.webmagic.processor.NatureArticlePipeline;
  8 +import com.canrd.webmagic.processor.NatureSearchPageProcessor;
7 import com.canrd.webmagic.service.NatureArticleService; 9 import com.canrd.webmagic.service.NatureArticleService;
8 import org.springframework.validation.annotation.Validated; 10 import org.springframework.validation.annotation.Validated;
9 -import org.springframework.web.bind.annotation.PostMapping;  
10 -import org.springframework.web.bind.annotation.RequestBody;  
11 -import org.springframework.web.bind.annotation.RequestMapping;  
12 -import org.springframework.web.bind.annotation.RestController; 11 +import org.springframework.web.bind.annotation.*;
  12 +import us.codecraft.webmagic.Spider;
13 13
14 import javax.annotation.Resource; 14 import javax.annotation.Resource;
15 15
@@ -20,7 +20,7 @@ import javax.annotation.Resource; @@ -20,7 +20,7 @@ import javax.annotation.Resource;
20 * @since 2024-04-07 18:39:41 20 * @since 2024-04-07 18:39:41
21 */ 21 */
22 @RestController 22 @RestController
23 -@RequestMapping("/gwms/xxx") 23 +@RequestMapping("/nature/article")
24 public class NatureArticleController { 24 public class NatureArticleController {
25 /** 25 /**
26 * 服务对象 26 * 服务对象
@@ -28,6 +28,29 @@ public class NatureArticleController { @@ -28,6 +28,29 @@ public class NatureArticleController {
28 @Resource 28 @Resource
29 private NatureArticleService natureArticleService; 29 private NatureArticleService natureArticleService;
30 30
  31 + @Resource
  32 + private NatureSearchPageProcessor natureSearchPageProcessor;
  33 +
  34 + @Resource
  35 + private NatureArticlePipeline articlePipeline;
  36 +
  37 + /**
  38 + * @return
  39 + */
  40 + @GetMapping("/start")
  41 + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
  42 + for (int i = 1; i <= indexSize; i++) {
  43 + Spider.create(natureSearchPageProcessor)
  44 + // 添加这个Spider要爬取的网页地址
  45 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
  46 + .addPipeline(articlePipeline)
  47 + // 开启5个线程执行,并开始爬取
  48 + .thread(5).run();
  49 + }
  50 +
  51 + return ServerResult.success();
  52 + }
  53 +
31 /** 54 /**
32 * 分页查询 55 * 分页查询
33 * 56 *
src/main/java/com/canrd/webmagic/domain/dto/NatureArticleDO.java
1 package com.canrd.webmagic.domain.dto; 1 package com.canrd.webmagic.domain.dto;
2 2
3 -.dto;  
4 -  
5 -  
6 -import java.io.Serializable;  
7 -  
8 import com.baomidou.mybatisplus.annotation.TableName; 3 import com.baomidou.mybatisplus.annotation.TableName;
9 -import com.gree.gaolan.common.dto.BaseDO;  
10 import lombok.*; 4 import lombok.*;
11 import lombok.experimental.SuperBuilder; 5 import lombok.experimental.SuperBuilder;
12 6
  7 +import java.io.Serializable;
  8 +
13 /** 9 /**
14 * nature-文章信息(NatureArticle)实体类 10 * nature-文章信息(NatureArticle)实体类
15 * 11 *
@@ -23,7 +19,7 @@ import lombok.experimental.SuperBuilder; @@ -23,7 +19,7 @@ import lombok.experimental.SuperBuilder;
23 @NoArgsConstructor 19 @NoArgsConstructor
24 @EqualsAndHashCode(callSuper = false) 20 @EqualsAndHashCode(callSuper = false)
25 @SuperBuilder 21 @SuperBuilder
26 -public class NatureArticleDO extends BaseDO implements Serializable { 22 +public class NatureArticleDO implements Serializable {
27 private static final long serialVersionUID = 890672868109538541L; 23 private static final long serialVersionUID = 890672868109538541L;
28 24
29 private Long id; 25 private Long id;
@@ -31,14 +27,22 @@ public class NatureArticleDO extends BaseDO implements Serializable { @@ -31,14 +27,22 @@ public class NatureArticleDO extends BaseDO implements Serializable {
31 * 作者名称 27 * 作者名称
32 */ 28 */
33 private String authorName; 29 private String authorName;
  30 +
  31 + /**
  32 + * 文章标识
  33 + */
  34 + private String articleCode;
  35 +
34 /** 36 /**
35 * 文章标题 37 * 文章标题
36 */ 38 */
37 private String title; 39 private String title;
  40 +
38 /** 41 /**
39 * 发布时间 42 * 发布时间
40 */ 43 */
41 private String publishTime; 44 private String publishTime;
  45 +
42 /** 46 /**
43 * 邮箱信息 47 * 邮箱信息
44 */ 48 */
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleQueryVO.java
1 package com.canrd.webmagic.domain.vo; 1 package com.canrd.webmagic.domain.vo;
2 2
3 -.vo;  
4 -  
5 -import java.io.Serializable;  
6 -  
7 import lombok.*; 3 import lombok.*;
8 import lombok.experimental.SuperBuilder; 4 import lombok.experimental.SuperBuilder;
9 -import com.gree.gaolan.common.vo.common.BasePageVO;  
10 5
11 -import java.lang.Long; 6 +import java.io.Serializable;
12 import java.util.List; 7 import java.util.List;
13 8
14 /** 9 /**
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleStartVO.java 0 → 100644
  1 +package com.canrd.webmagic.domain.vo;
  2 +
  3 +import lombok.*;
  4 +import lombok.experimental.SuperBuilder;
  5 +
  6 +import java.io.Serializable;
  7 +import java.util.List;
  8 +
  9 +/**
  10 + * nature-文章信息(NatureArticle)实体类
  11 + *
  12 + * @author makejava
  13 + * @since 2024-04-07 18:39:41
  14 + */
  15 +@Data
  16 +@AllArgsConstructor
  17 +@ToString
  18 +@NoArgsConstructor
  19 +@EqualsAndHashCode(callSuper = false)
  20 +@SuperBuilder
  21 +public class NatureArticleStartVO implements Serializable {
  22 +
  23 + /**
  24 + * 爬取的总页数
  25 + */
  26 + private Integer indexSize;
  27 +
  28 + /**
  29 + * 爬取的关键字
  30 + */
  31 + private String keyword;
  32 +}
  33 +
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleVO.java
1 package com.canrd.webmagic.domain.vo; 1 package com.canrd.webmagic.domain.vo;
2 2
3 -.vo;  
4 -  
5 -  
6 -import java.io.Serializable;  
7 -  
8 import lombok.*; 3 import lombok.*;
9 import lombok.experimental.SuperBuilder; 4 import lombok.experimental.SuperBuilder;
10 5
  6 +import java.io.Serializable;
  7 +
11 /** 8 /**
12 * nature-文章信息(NatureArticle)实体类 9 * nature-文章信息(NatureArticle)实体类
13 * 10 *
src/main/java/com/canrd/webmagic/processor/NatureArticlePipeline.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  4 +import com.canrd.webmagic.domain.dto.NatureArticleDO;
  5 +import com.canrd.webmagic.service.NatureArticleService;
  6 +import org.springframework.beans.factory.annotation.Autowired;
  7 +import org.springframework.stereotype.Component;
  8 +import us.codecraft.webmagic.ResultItems;
  9 +import us.codecraft.webmagic.Task;
  10 +import us.codecraft.webmagic.pipeline.Pipeline;
  11 +
  12 +import java.util.Objects;
  13 +
  14 +@Component
  15 +public class NatureArticlePipeline implements Pipeline {
  16 +
  17 + private NatureArticleService natureArticleService;
  18 +
  19 + @Autowired
  20 + public void setNatureArticleService(NatureArticleService natureArticleService) {
  21 + this.natureArticleService = natureArticleService;
  22 + }
  23 +
  24 + @Override
  25 + public void process(ResultItems resultItems, Task task) {
  26 + NatureArticleDO articleDO = resultItems.get("article");
  27 + if (Objects.nonNull(articleDO)) {
  28 + NatureArticleDO natureArticleDO = natureArticleService.getOne(new LambdaQueryWrapper<NatureArticleDO>().eq(NatureArticleDO::getArticleCode, articleDO.getArticleCode()));
  29 + if (Objects.nonNull(natureArticleDO)) {
  30 + return;
  31 + }
  32 + natureArticleService.save(articleDO);
  33 + }
  34 + }
  35 +}
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -3,6 +3,8 @@ package com.canrd.webmagic.processor; @@ -3,6 +3,8 @@ package com.canrd.webmagic.processor;
3 import com.alibaba.fastjson.JSONArray; 3 import com.alibaba.fastjson.JSONArray;
4 import com.alibaba.fastjson.JSONObject; 4 import com.alibaba.fastjson.JSONObject;
5 import com.canrd.webmagic.common.utils.StringUtils; 5 import com.canrd.webmagic.common.utils.StringUtils;
  6 +import com.canrd.webmagic.domain.dto.NatureArticleDO;
  7 +import org.springframework.stereotype.Component;
6 import us.codecraft.webmagic.Page; 8 import us.codecraft.webmagic.Page;
7 import us.codecraft.webmagic.Site; 9 import us.codecraft.webmagic.Site;
8 import us.codecraft.webmagic.Spider; 10 import us.codecraft.webmagic.Spider;
@@ -20,6 +22,7 @@ import java.util.Objects; @@ -20,6 +22,7 @@ import java.util.Objects;
20 * @date: 2024/4/1 14:19 22 * @date: 2024/4/1 14:19
21 * @version: 1.0 23 * @version: 1.0
22 */ 24 */
  25 +@Component
23 public class NatureSearchPageProcessor implements PageProcessor { 26 public class NatureSearchPageProcessor implements PageProcessor {
24 27
25 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 28 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
@@ -32,7 +35,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -32,7 +35,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
32 */ 35 */
33 @Override 36 @Override
34 public void process(Page page) { 37 public void process(Page page) {
35 - if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) { 38 + if (page.getUrl().get().contains("search")) {
36 doArticleList(page); 39 doArticleList(page);
37 } else { 40 } else {
38 doArticleContent(page); 41 doArticleContent(page);
@@ -44,7 +47,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -44,7 +47,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
44 //解析页面 47 //解析页面
45 Html html = page.getHtml(); 48 Html html = page.getHtml();
46 String[] urlArr = page.getUrl().get().split("/"); 49 String[] urlArr = page.getUrl().get().split("/");
47 - String articleId = urlArr[urlArr.length - 1]; 50 + String articleCode = urlArr[urlArr.length - 1];
48 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header"); 51 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
49 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes(); 52 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
50 53
@@ -69,7 +72,14 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -69,7 +72,14 @@ public class NatureSearchPageProcessor implements PageProcessor {
69 jsonObject.put("email", email); 72 jsonObject.put("email", email);
70 array.add(jsonObject); 73 array.add(jsonObject);
71 } 74 }
72 - System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString()); 75 + System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString());
  76 +
  77 + page.putField("article", NatureArticleDO.builder()
  78 + .articleCode(articleCode)
  79 + .authorName(authorName.toString())
  80 + .title(title)
  81 + .publishTime(publishTime)
  82 + .emailInfo(array.toJSONString()).build());
73 } 83 }
74 84
75 private void doArticleList(Page page) { 85 private void doArticleList(Page page) {
@@ -96,7 +106,6 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -96,7 +106,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
96 String title = node.$("a", "text").get(); 106 String title = node.$("a", "text").get();
97 System.out.printf("%d、%s,访问地址:%s%n", i, title, link1); 107 System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
98 } 108 }
99 -// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex);  
100 } 109 }
101 110
102 @Override 111 @Override
@@ -109,6 +118,7 @@ public class NatureSearchPageProcessor implements PageProcessor { @@ -109,6 +118,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
109 Spider.create(new NatureSearchPageProcessor()) 118 Spider.create(new NatureSearchPageProcessor())
110 // 添加这个Spider要爬取的网页地址 119 // 添加这个Spider要爬取的网页地址
111 .addUrl("https://www.nature.com/search?q=battery&page=1") 120 .addUrl("https://www.nature.com/search?q=battery&page=1")
  121 + .addPipeline(new NatureArticlePipeline())
112 // 开启5个线程执行,并开始爬取 122 // 开启5个线程执行,并开始爬取
113 .thread(5).run(); 123 .thread(5).run();
114 } 124 }
src/main/java/com/canrd/webmagic/service/NatureArticleService.java
1 package com.canrd.webmagic.service; 1 package com.canrd.webmagic.service;
2 2
3 import com.baomidou.mybatisplus.extension.service.IService; 3 import com.baomidou.mybatisplus.extension.service.IService;
  4 +import com.canrd.webmagic.common.constant.ServerResult;
4 import com.canrd.webmagic.domain.dto.NatureArticleDO; 5 import com.canrd.webmagic.domain.dto.NatureArticleDO;
5 -import com.gree.gaolan.common.constant.ServerResult; 6 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  7 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
6 8
7 /** 9 /**
8 * nature-文章信息(NatureArticle)表服务接口 10 * nature-文章信息(NatureArticle)表服务接口
src/main/java/com/canrd/webmagic/service/impl/NatureArticleServiceImpl.java
1 package com.canrd.webmagic.service.impl; 1 package com.canrd.webmagic.service.impl;
2 2
3 -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; 3 +import cn.hutool.core.bean.BeanUtil;
  4 +import cn.hutool.core.collection.CollUtil;
4 import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; 5 import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
5 -import com.baomidou.mybatisplus.core.metadata.IPage;  
6 -import com.baomidou.mybatisplus.extension.plugins.pagination.Page;  
7 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; 6 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
  7 +import com.canrd.webmagic.common.constant.ServerResult;
8 import com.canrd.webmagic.domain.dto.NatureArticleDO; 8 import com.canrd.webmagic.domain.dto.NatureArticleDO;
9 -import com.central.common.utils.CopyBeanUtils;  
10 -import com.gree.doraemon.base.core.bean.BeanUtil;  
11 -import com.gree.doraemon.base.core.collection.CollUtil;  
12 -import com.gree.gaolan.common.constant.Constant;  
13 -import com.gree.gaolan.common.constant.ServerResult;  
14 -import com.gree.gaolan.common.util.PageUtils; 9 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  10 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
  11 +import com.canrd.webmagic.mapper.NatureArticleMapper;
  12 +import com.canrd.webmagic.service.NatureArticleService;
15 import lombok.extern.slf4j.Slf4j; 13 import lombok.extern.slf4j.Slf4j;
16 import org.springframework.stereotype.Service; 14 import org.springframework.stereotype.Service;
17 15
@@ -56,15 +54,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N @@ -56,15 +54,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N
56 */ 54 */
57 @Override 55 @Override
58 public ServerResult list(NatureArticleQueryVO natureArticleQueryVO) { 56 public ServerResult list(NatureArticleQueryVO natureArticleQueryVO) {
59 -  
60 - LambdaQueryWrapper<NatureArticleDO> queryWapper = new LambdaQueryWrapper<NatureArticleDO>()  
61 - .eq(NatureArticleDO::getEnableFlag, Constant.ENABLE_TEN)  
62 - .orderByDesc(NatureArticleDO::getId);  
63 - Page page = new Page<>(natureArticleQueryVO.getCurrent(), natureArticleQueryVO.getSize());  
64 - IPage<NatureArticleDO> iPage = page(page, queryWapper);  
65 - natureArticleQueryVO.setTotal(Long.valueOf(iPage.getTotal()).intValue());  
66 - List<NatureArticleVO> result = CopyBeanUtils.trans(iPage.getRecords(), NatureArticleVO.class);  
67 - return ServerResult.success(PageUtils.getPageReturn(result, natureArticleQueryVO)); 57 + return ServerResult.success(ServerResult.success());
68 } 58 }
69 59
70 /** 60 /**
@@ -123,8 +113,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N @@ -123,8 +113,7 @@ public class NatureArticleServiceImpl extends ServiceImpl<NatureArticleMapper, N
123 } 113 }
124 //todo 校验是否可以逻辑删除 114 //todo 校验是否可以逻辑删除
125 LambdaUpdateWrapper<NatureArticleDO> updateWrapper = new LambdaUpdateWrapper<NatureArticleDO>() 115 LambdaUpdateWrapper<NatureArticleDO> updateWrapper = new LambdaUpdateWrapper<NatureArticleDO>()
126 - .in(NatureArticleDO::getId, ids)  
127 - .set(NatureArticleDO::getEnableFlag, Constant.UNABLE_TWENTY); 116 + .in(NatureArticleDO::getId, ids);
128 update(updateWrapper); 117 update(updateWrapper);
129 return ServerResult.success(); 118 return ServerResult.success();
130 } 119 }
src/main/resources/application-local.yml
@@ -59,7 +59,7 @@ spring: @@ -59,7 +59,7 @@ spring:
59 testOnReturn: true 59 testOnReturn: true
60 password: canrd@2024 60 password: canrd@2024
61 time-between-eviction-runs-millis: 1000 61 time-between-eviction-runs-millis: 1000
62 - url: jdbc:mysql://39.108.227.113:3307/order-erp1?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true 62 + url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 username: root 63 username: root
64 redis: 64 redis:
65 database: 0 65 database: 0