Commit 836066f9f49a40d28c54510bc0359dd4db232ed1

Authored by qdlgxiemaosheng
1 parent ea294a6a

nature文章信息爬取,存储db

sql/webmagic.sql
-- Schema for crawled Nature articles.
-- WARNING: this script rebuilds the table from scratch; existing rows are dropped.
DROP TABLE IF EXISTS `nature_article`;
CREATE TABLE `nature_article` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识',
  `author_name` varchar(256) DEFAULT NULL COMMENT '作者名称',
  `title` varchar(256) DEFAULT NULL COMMENT '文章标题',
  `publish_time` varchar(64) DEFAULT NULL COMMENT '发布时间',
  `email_info` varchar(512) DEFAULT NULL COMMENT '邮箱信息',
  PRIMARY KEY (`id`),
  -- The crawler pipeline looks rows up by article_code before every insert.
  KEY `idx_article_code` (`article_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='nature-文章信息';

-- Migration for databases that already contain `nature_article` WITHOUT `article_code`.
-- It must not be executed after the CREATE TABLE above: the column already exists there
-- and MySQL rejects the statement with a "Duplicate column name" error. The column
-- definition is kept identical to the one in CREATE TABLE (varchar(64)).
-- NOTE(review): the original migration declared varchar(256); confirm which length is intended.
-- ALTER TABLE `nature_article` ADD COLUMN `article_code` varchar(64) DEFAULT NULL COMMENT '文章标识';
... ...
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
... ... @@ -4,12 +4,12 @@ import com.canrd.webmagic.common.constant.ServerResult;
4 4 import com.canrd.webmagic.common.jsr303.OperateGroup;
5 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
  7 +import com.canrd.webmagic.processor.NatureArticlePipeline;
  8 +import com.canrd.webmagic.processor.NatureSearchPageProcessor;
7 9 import com.canrd.webmagic.service.NatureArticleService;
8 10 import org.springframework.validation.annotation.Validated;
9   -import org.springframework.web.bind.annotation.PostMapping;
10   -import org.springframework.web.bind.annotation.RequestBody;
11   -import org.springframework.web.bind.annotation.RequestMapping;
12   -import org.springframework.web.bind.annotation.RestController;
  11 +import org.springframework.web.bind.annotation.*;
  12 +import us.codecraft.webmagic.Spider;
13 13  
14 14 import javax.annotation.Resource;
15 15  
... ... @@ -20,7 +20,7 @@ import javax.annotation.Resource;
20 20 * @since 2024-04-07 18:39:41
21 21 */
22 22 @RestController
23   -@RequestMapping("/gwms/xxx")
  23 +@RequestMapping("/nature/article")
24 24 public class NatureArticleController {
25 25 /**
26 26 * 服务对象
... ... @@ -28,6 +28,29 @@ public class NatureArticleController {
28 28 @Resource
29 29 private NatureArticleService natureArticleService;
30 30  
  31 + @Resource
  32 + private NatureSearchPageProcessor natureSearchPageProcessor;
  33 +
  34 + @Resource
  35 + private NatureArticlePipeline articlePipeline;
  36 +
  37 + /**
  38 + * @return
  39 + */
  40 + @GetMapping("/start")
  41 + public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
  42 + for (int i = 1; i <= indexSize; i++) {
  43 + Spider.create(natureSearchPageProcessor)
  44 + // 添加这个Spider要爬取的网页地址
  45 + .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
  46 + .addPipeline(articlePipeline)
  47 + // 开启5个线程执行,并开始爬取
  48 + .thread(5).run();
  49 + }
  50 +
  51 + return ServerResult.success();
  52 + }
  53 +
31 54 /**
32 55 * 分页查询
33 56 *
... ...
src/main/java/com/canrd/webmagic/domain/dto/NatureArticleDO.java
1 1 package com.canrd.webmagic.domain.dto;
2 2  
3   -.dto;
4   -
5   -
6   -import java.io.Serializable;
7   -
8 3 import com.baomidou.mybatisplus.annotation.TableName;
9   -import com.gree.gaolan.common.dto.BaseDO;
10 4 import lombok.*;
11 5 import lombok.experimental.SuperBuilder;
12 6  
  7 +import java.io.Serializable;
  8 +
13 9 /**
14 10 * nature-文章信息(NatureArticle)实体类
15 11 *
... ... @@ -23,7 +19,7 @@ import lombok.experimental.SuperBuilder;
23 19 @NoArgsConstructor
24 20 @EqualsAndHashCode(callSuper = false)
25 21 @SuperBuilder
26   -public class NatureArticleDO extends BaseDO implements Serializable {
  22 +public class NatureArticleDO implements Serializable {
27 23 private static final long serialVersionUID = 890672868109538541L;
28 24  
29 25 private Long id;
... ... @@ -31,14 +27,22 @@ public class NatureArticleDO extends BaseDO implements Serializable {
31 27 * 作者名称
32 28 */
33 29 private String authorName;
  30 +
  31 + /**
  32 + * 文章标识
  33 + */
  34 + private String articleCode;
  35 +
34 36 /**
35 37 * 文章标题
36 38 */
37 39 private String title;
  40 +
38 41 /**
39 42 * 发布时间
40 43 */
41 44 private String publishTime;
  45 +
42 46 /**
43 47 * 邮箱信息
44 48 */
... ...
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleQueryVO.java
1 1 package com.canrd.webmagic.domain.vo;
2 2  
3   -.vo;
4   -
5   -import java.io.Serializable;
6   -
7 3 import lombok.*;
8 4 import lombok.experimental.SuperBuilder;
9   -import com.gree.gaolan.common.vo.common.BasePageVO;
10 5  
11   -import java.lang.Long;
  6 +import java.io.Serializable;
12 7 import java.util.List;
13 8  
14 9 /**
... ...
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleStartVO.java 0 → 100644
  1 +package com.canrd.webmagic.domain.vo;
  2 +
  3 +import lombok.*;
  4 +import lombok.experimental.SuperBuilder;
  5 +
  6 +import java.io.Serializable;
  7 +import java.util.List;
  8 +
  9 +/**
  10 + * nature-文章信息(NatureArticle)实体类
  11 + *
  12 + * @author makejava
  13 + * @since 2024-04-07 18:39:41
  14 + */
  15 +@Data
  16 +@AllArgsConstructor
  17 +@ToString
  18 +@NoArgsConstructor
  19 +@EqualsAndHashCode(callSuper = false)
  20 +@SuperBuilder
  21 +public class NatureArticleStartVO implements Serializable {
  22 +
  23 + /**
  24 + * 爬取的总页数
  25 + */
  26 + private Integer indexSize;
  27 +
  28 + /**
  29 + * 爬取的关键字
  30 + */
  31 + private String keyword;
  32 +}
  33 +
... ...
src/main/java/com/canrd/webmagic/domain/vo/NatureArticleVO.java
1 1 package com.canrd.webmagic.domain.vo;
2 2  
3   -.vo;
4   -
5   -
6   -import java.io.Serializable;
7   -
8 3 import lombok.*;
9 4 import lombok.experimental.SuperBuilder;
10 5  
  6 +import java.io.Serializable;
  7 +
11 8 /**
12 9 * nature-文章信息(NatureArticle)实体类
13 10 *
... ...
src/main/java/com/canrd/webmagic/processor/NatureArticlePipeline.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  4 +import com.canrd.webmagic.domain.dto.NatureArticleDO;
  5 +import com.canrd.webmagic.service.NatureArticleService;
  6 +import org.springframework.beans.factory.annotation.Autowired;
  7 +import org.springframework.stereotype.Component;
  8 +import us.codecraft.webmagic.ResultItems;
  9 +import us.codecraft.webmagic.Task;
  10 +import us.codecraft.webmagic.pipeline.Pipeline;
  11 +
  12 +import java.util.Objects;
  13 +
  14 +@Component
  15 +public class NatureArticlePipeline implements Pipeline {
  16 +
  17 + private NatureArticleService natureArticleService;
  18 +
  19 + @Autowired
  20 + public void setNatureArticleService(NatureArticleService natureArticleService) {
  21 + this.natureArticleService = natureArticleService;
  22 + }
  23 +
  24 + @Override
  25 + public void process(ResultItems resultItems, Task task) {
  26 + NatureArticleDO articleDO = resultItems.get("article");
  27 + if (Objects.nonNull(articleDO)) {
  28 + NatureArticleDO natureArticleDO = natureArticleService.getOne(new LambdaQueryWrapper<NatureArticleDO>().eq(NatureArticleDO::getArticleCode, articleDO.getArticleCode()));
  29 + if (Objects.nonNull(natureArticleDO)) {
  30 + return;
  31 + }
  32 + natureArticleService.save(articleDO);
  33 + }
  34 + }
  35 +}
... ...
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... ... @@ -3,6 +3,8 @@ package com.canrd.webmagic.processor;
3 3 import com.alibaba.fastjson.JSONArray;
4 4 import com.alibaba.fastjson.JSONObject;
5 5 import com.canrd.webmagic.common.utils.StringUtils;
  6 +import com.canrd.webmagic.domain.dto.NatureArticleDO;
  7 +import org.springframework.stereotype.Component;
6 8 import us.codecraft.webmagic.Page;
7 9 import us.codecraft.webmagic.Site;
8 10 import us.codecraft.webmagic.Spider;
... ... @@ -20,6 +22,7 @@ import java.util.Objects;
20 22 * @date: 2024/4/1 14:19
21 23 * @version: 1.0
22 24 */
  25 +@Component
23 26 public class NatureSearchPageProcessor implements PageProcessor {
24 27  
25 28 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
... ... @@ -32,7 +35,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
32 35 */
33 36 @Override
34 37 public void process(Page page) {
35   - if ("https://www.nature.com/search?q=battery&page=1".equals(page.getUrl().get())) {
  38 + if (page.getUrl().get().contains("search")) {
36 39 doArticleList(page);
37 40 } else {
38 41 doArticleContent(page);
... ... @@ -44,7 +47,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
44 47 //解析页面
45 48 Html html = page.getHtml();
46 49 String[] urlArr = page.getUrl().get().split("/");
47   - String articleId = urlArr[urlArr.length - 1];
  50 + String articleCode = urlArr[urlArr.length - 1];
48 51 Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
49 52 List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();
50 53  
... ... @@ -69,7 +72,14 @@ public class NatureSearchPageProcessor implements PageProcessor {
69 72 jsonObject.put("email", email);
70 73 array.add(jsonObject);
71 74 }
72   - System.out.println("id:" + articleId + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString());
  75 + System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title + ",作者:" + authorName + ",邮箱信息:" + array.toJSONString());
  76 +
  77 + page.putField("article", NatureArticleDO.builder()
  78 + .articleCode(articleCode)
  79 + .authorName(authorName.toString())
  80 + .title(title)
  81 + .publishTime(publishTime)
  82 + .emailInfo(array.toJSONString()).build());
73 83 }
74 84  
75 85 private void doArticleList(Page page) {
... ... @@ -96,7 +106,6 @@ public class NatureSearchPageProcessor implements PageProcessor {
96 106 String title = node.$("a", "text").get();
97 107 System.out.printf("%d、%s,访问地址:%s%n", i, title, link1);
98 108 }
99   -// page.addTargetRequest("https://www.nature.com/search?q=battery&page=" + pageIndex);
100 109 }
101 110  
102 111 @Override
... ... @@ -109,6 +118,7 @@ public class NatureSearchPageProcessor implements PageProcessor {
109 118 Spider.create(new NatureSearchPageProcessor())
110 119 // 添加这个Spider要爬取的网页地址
111 120 .addUrl("https://www.nature.com/search?q=battery&page=1")
  121 + .addPipeline(new NatureArticlePipeline())
112 122 // 开启5个线程执行,并开始爬取
113 123 .thread(5).run();
114 124 }
... ...
src/main/java/com/canrd/webmagic/service/NatureArticleService.java
1 1 package com.canrd.webmagic.service;
2 2  
3 3 import com.baomidou.mybatisplus.extension.service.IService;
  4 +import com.canrd.webmagic.common.constant.ServerResult;
4 5 import com.canrd.webmagic.domain.dto.NatureArticleDO;
5   -import com.gree.gaolan.common.constant.ServerResult;
  6 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  7 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
6 8  
7 9 /**
8 10 * nature-文章信息(NatureArticle)表服务接口
... ...
src/main/java/com/canrd/webmagic/service/impl/NatureArticleServiceImpl.java
1 1 package com.canrd.webmagic.service.impl;
2 2  
3   -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  3 +import cn.hutool.core.bean.BeanUtil;
  4 +import cn.hutool.core.collection.CollUtil;
4 5 import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
5   -import com.baomidou.mybatisplus.core.metadata.IPage;
6   -import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
7 6 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
  7 +import com.canrd.webmagic.common.constant.ServerResult;
8 8 import com.canrd.webmagic.domain.dto.NatureArticleDO;
9   -import com.central.common.utils.CopyBeanUtils;
10   -import com.gree.doraemon.base.core.bean.BeanUtil;
11   -import com.gree.doraemon.base.core.collection.CollUtil;
12   -import com.gree.gaolan.common.constant.Constant;
13   -import com.gree.gaolan.common.constant.ServerResult;
14   -import com.gree.gaolan.common.util.PageUtils;
  9 +import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
  10 +import com.canrd.webmagic.domain.vo.NatureArticleVO;
  11 +import com.canrd.webmagic.mapper.NatureArticleMapper;
  12 +import com.canrd.webmagic.service.NatureArticleService;
15 13 import lombok.extern.slf4j.Slf4j;
16 14 import org.springframework.stereotype.Service;
17 15  
... ... @@ -56,15 +54,7 @@ public class NatureArticleServiceImpl extends ServiceImpl&lt;NatureArticleMapper, N
56 54 */
57 55 @Override
58 56 public ServerResult list(NatureArticleQueryVO natureArticleQueryVO) {
59   -
60   - LambdaQueryWrapper<NatureArticleDO> queryWapper = new LambdaQueryWrapper<NatureArticleDO>()
61   - .eq(NatureArticleDO::getEnableFlag, Constant.ENABLE_TEN)
62   - .orderByDesc(NatureArticleDO::getId);
63   - Page page = new Page<>(natureArticleQueryVO.getCurrent(), natureArticleQueryVO.getSize());
64   - IPage<NatureArticleDO> iPage = page(page, queryWapper);
65   - natureArticleQueryVO.setTotal(Long.valueOf(iPage.getTotal()).intValue());
66   - List<NatureArticleVO> result = CopyBeanUtils.trans(iPage.getRecords(), NatureArticleVO.class);
67   - return ServerResult.success(PageUtils.getPageReturn(result, natureArticleQueryVO));
  57 + return ServerResult.success(ServerResult.success());
68 58 }
69 59  
70 60 /**
... ... @@ -123,8 +113,7 @@ public class NatureArticleServiceImpl extends ServiceImpl&lt;NatureArticleMapper, N
123 113 }
124 114 //todo 校验是否可以逻辑删除
125 115 LambdaUpdateWrapper<NatureArticleDO> updateWrapper = new LambdaUpdateWrapper<NatureArticleDO>()
126   - .in(NatureArticleDO::getId, ids)
127   - .set(NatureArticleDO::getEnableFlag, Constant.UNABLE_TWENTY);
  116 + .in(NatureArticleDO::getId, ids);
128 117 update(updateWrapper);
129 118 return ServerResult.success();
130 119 }
... ...
src/main/resources/application-local.yml
... ... @@ -59,7 +59,7 @@ spring:
59 59 testOnReturn: true
60 60 password: canrd@2024
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://39.108.227.113:3307/order-erp1?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
... ...