Commit b1b31dc60cb0fdfde1fea1c80b6fb2493a00ab32

Authored by 谢茂盛
1 parent 836066f9

feat: 配置userAgent和IP代理池

src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
@@ -4,7 +4,7 @@ import com.canrd.webmagic.common.constant.ServerResult; @@ -4,7 +4,7 @@ import com.canrd.webmagic.common.constant.ServerResult;
4 import com.canrd.webmagic.common.jsr303.OperateGroup; 4 import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 -import com.canrd.webmagic.processor.NatureArticlePipeline; 7 +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
8 import com.canrd.webmagic.processor.NatureSearchPageProcessor; 8 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
9 import com.canrd.webmagic.service.NatureArticleService; 9 import com.canrd.webmagic.service.NatureArticleService;
10 import org.springframework.validation.annotation.Validated; 10 import org.springframework.validation.annotation.Validated;
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
@@ -4,6 +4,8 @@ import com.alibaba.fastjson.JSONArray; @@ -4,6 +4,8 @@ import com.alibaba.fastjson.JSONArray;
4 import com.alibaba.fastjson.JSONObject; 4 import com.alibaba.fastjson.JSONObject;
5 import com.canrd.webmagic.common.utils.StringUtils; 5 import com.canrd.webmagic.common.utils.StringUtils;
6 import com.canrd.webmagic.domain.dto.NatureArticleDO; 6 import com.canrd.webmagic.domain.dto.NatureArticleDO;
  7 +import com.canrd.webmagic.processor.config.Agent;
  8 +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
7 import org.springframework.stereotype.Component; 9 import org.springframework.stereotype.Component;
8 import us.codecraft.webmagic.Page; 10 import us.codecraft.webmagic.Page;
9 import us.codecraft.webmagic.Site; 11 import us.codecraft.webmagic.Site;
@@ -24,9 +26,10 @@ import java.util.Objects; @@ -24,9 +26,10 @@ import java.util.Objects;
24 */ 26 */
25 @Component 27 @Component
26 public class NatureSearchPageProcessor implements PageProcessor { 28 public class NatureSearchPageProcessor implements PageProcessor {
  29 + private String agent = Agent.getRandom();
27 30
28 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 31 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
29 - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); 32 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(agent);
30 33
31 /** 34 /**
32 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 35 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
src/main/java/com/canrd/webmagic/processor/config/Agent.java 0 → 100644
  1 +package com.canrd.webmagic.processor.config;
  2 +
  3 +import cn.hutool.core.io.resource.ClassPathResource;
  4 +import lombok.extern.slf4j.Slf4j;
  5 +
  6 +import java.io.BufferedReader;
  7 +import java.io.IOException;
  8 +import java.io.InputStream;
  9 +import java.io.InputStreamReader;
  10 +import java.util.ArrayList;
  11 +import java.util.List;
  12 +import java.util.Random;
  13 +import java.util.concurrent.locks.ReentrantReadWriteLock;
  14 +
  15 +/**
  16 + * @author: xms
  17 + * @description: TODO
  18 + * @date: 2024/4/9 10:28
  19 + * @version: 1.0
  20 + */
  21 +@Slf4j
  22 +public class Agent {
  23 + private static final String AGENT_FILE_PATH = "user-agent/User-Agents.txt";
  24 + private static ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
  25 + private static List<String> agents;
  26 +
  27 + /**
  28 + * @return
  29 + */
  30 + public static String getRandom() {
  31 + String random = getRandom(null);
  32 + log.info("Agent======================>" + random);
  33 + return random;
  34 + }
  35 +
  36 + /**
  37 + * @param agent
  38 + * @return
  39 + */
  40 + private static String getRandom(String agent) {
  41 + try {
  42 + lock.readLock().lock();
  43 + int size = agents.size();
  44 + if (size == 0) {
  45 + return "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
  46 + }
  47 + Random random = new Random();
  48 + if (null != agent) {
  49 + return agent;
  50 + } else {
  51 + return agents.get(random.nextInt(size));
  52 + }
  53 + } catch (Exception e) {
  54 + e.printStackTrace();
  55 + return "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
  56 + } finally {
  57 + lock.readLock().unlock();
  58 + }
  59 + }
  60 +
  61 + static {
  62 + agents = new ArrayList<>();
  63 + InputStream resourceAsStream = null;
  64 + InputStreamReader inputStreamReader = null;
  65 + BufferedReader bufferedReader = null;
  66 + try {
  67 + resourceAsStream = new ClassPathResource(AGENT_FILE_PATH).getStream();
  68 + inputStreamReader = new InputStreamReader(resourceAsStream);
  69 + bufferedReader = new BufferedReader(inputStreamReader);
  70 + String len;
  71 + while ((len = bufferedReader.readLine()) != null) {
  72 + if (!len.matches("^#.*")) {
  73 + agents.add(len.trim());
  74 + }
  75 + }
  76 + } catch (Exception e) {
  77 + e.printStackTrace();
  78 + } finally {
  79 + if (null != bufferedReader) {
  80 + try {
  81 + bufferedReader.close();
  82 + } catch (IOException e) {
  83 + e.printStackTrace();
  84 + }
  85 + }
  86 + if (null != inputStreamReader) {
  87 + try {
  88 + inputStreamReader.close();
  89 + } catch (IOException e) {
  90 + e.printStackTrace();
  91 + }
  92 + }
  93 + if (null != resourceAsStream) {
  94 + try {
  95 + resourceAsStream.close();
  96 + } catch (IOException e) {
  97 + e.printStackTrace();
  98 + }
  99 + }
  100 + }
  101 + }
  102 +}
src/main/java/com/canrd/webmagic/processor/config/Downloader.java 0 → 100644
  1 +package com.canrd.webmagic.processor.config;
  2 +
  3 +import lombok.extern.slf4j.Slf4j;
  4 +import org.springframework.beans.factory.annotation.Autowired;
  5 +import org.springframework.data.redis.core.RedisTemplate;
  6 +import org.springframework.stereotype.Component;
  7 +import us.codecraft.webmagic.Request;
  8 +import us.codecraft.webmagic.downloader.HttpClientDownloader;
  9 +import us.codecraft.webmagic.proxy.Proxy;
  10 +import us.codecraft.webmagic.proxy.SimpleProxyProvider;
  11 +
  12 +import java.util.Random;
  13 +
  14 +/**
  15 + * @author: xms
  16 + * @description: TODO
  17 + * @date: 2024/4/9 10:37
  18 + * @version: 1.0
  19 + */
  20 +@Slf4j
  21 +@Component
  22 +public class Downloader {
  23 + private static RedisTemplate redisTemplate;
  24 +
  25 + @Autowired
  26 + Downloader(RedisTemplate redisTemplate) {
  27 + Downloader.redisTemplate = redisTemplate;
  28 + }
  29 +
  30 + /**
  31 + *
  32 + * @return
  33 + */
  34 + public static HttpClientDownloader newIpDownloader() {
  35 + HttpClientDownloader downloader = new HttpClientDownloader() {
  36 + @Override
  37 + protected void onError(Request request) {
  38 + String[] ips = newIp();
  39 + setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
  40 + }
  41 + };
  42 + return downloader;
  43 + }
  44 +
  45 + static String[] newIp() {
  46 + Long size = redisTemplate.opsForList().size("ip");
  47 + String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString();
  48 + log.info("获取ip===========>" + ip);
  49 + String[] ips = ip.split(":");
  50 + return ips;
  51 + }
  52 +}
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java 0 → 100644
  1 +package com.canrd.webmagic.processor.config;
  2 +
  3 +import com.baomidou.mybatisplus.core.toolkit.StringUtils;
  4 +import org.apache.commons.io.IOUtils;
  5 +import org.jsoup.Jsoup;
  6 +import org.jsoup.nodes.Document;
  7 +import org.jsoup.nodes.Element;
  8 +import org.jsoup.select.Elements;
  9 +import org.springframework.beans.factory.annotation.Autowired;
  10 +import org.springframework.data.redis.core.RedisTemplate;
  11 +import org.springframework.scheduling.annotation.Scheduled;
  12 +import org.springframework.stereotype.Component;
  13 +
  14 +import java.io.IOException;
  15 +import java.io.InputStream;
  16 +import java.net.InetSocketAddress;
  17 +import java.net.Proxy;
  18 +import java.net.URL;
  19 +import java.net.URLConnection;
  20 +import java.util.List;
  21 +
  22 +/**
  23 + * @author: xms
  24 + * @description: TODO
  25 + * @date: 2024/4/9 10:35
  26 + * @version: 1.0
  27 + */
  28 +@Component
  29 +public class UpdateIp {
  30 +
  31 + @Autowired
  32 + private RedisTemplate redisTemplate;
  33 +
  34 + @Scheduled(cron = "*/20 * * * * ?")
  35 + void update() {
  36 + List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
  37 + for (String ip : range) {
  38 + if (ifUseless(ip)) {
  39 + System.err.println(ip + " 从redis移除");
  40 + redisTemplate.opsForList().remove("ip", 0, ip);
  41 + }
  42 + }
  43 + }
  44 +
  45 + @Scheduled(cron = "*/15 * * * * ?")
  46 + void ips() {
  47 + String string = null;
  48 + try {
  49 + Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get();
  50 + Elements tags = document.select("#ip_list > tbody > tr");
  51 + for (Element element : tags) {
  52 + //取得ip地址节点
  53 + Elements tdChilds = element.select("tr > td:nth-child(2)");
  54 + //取得端口号节点
  55 + Elements tcpd = element.select("tr > td:nth-child(3)");
  56 + if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) {
  57 + string = tdChilds.text() + ":" + tcpd.text();
  58 + if (!ifUseless(string)) {
  59 + List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
  60 + if (!range.contains(string)) {
  61 + System.err.println(string + " 存进redis");
  62 + if (redisTemplate.opsForList().size("ip") > 100) {
  63 + redisTemplate.opsForList().rightPopAndLeftPush("ip", string);
  64 + } else {
  65 + redisTemplate.opsForList().leftPush("ip", string);
  66 + }
  67 + }
  68 + }
  69 + }
  70 + }
  71 + } catch (IOException e) {
  72 + e.printStackTrace();
  73 + }
  74 + }
  75 +
  76 + /**
  77 + * 无效的ip 返回true 有效的ip返回false
  78 + *
  79 + * @param ip
  80 + * @return
  81 + */
  82 + boolean ifUseless(String ip) {
  83 + String[] split = ip.split(":");
  84 + URL url = null;
  85 + try {
  86 + url = new URL("http://www.baidu.com");
  87 + InetSocketAddress addr = new InetSocketAddress(split[0], Integer.parseInt(split[1]));
  88 + Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
  89 + InputStream in = null;
  90 + try {
  91 + URLConnection conn = url.openConnection(proxy);
  92 + conn.setConnectTimeout(2000);
  93 + in = conn.getInputStream();
  94 + } catch (Exception e) {
  95 + return true;
  96 + }
  97 + String s = IOUtils.toString(in);
  98 + if (s.indexOf("baidu") > 0) {
  99 + return false;
  100 + }
  101 + return true;
  102 + } catch (Exception e) {
  103 + return true;
  104 + }
  105 + }
  106 +}
src/main/java/com/canrd/webmagic/processor/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java
1 -package com.canrd.webmagic.processor; 1 +package com.canrd.webmagic.processor.pipeline;
2 2
3 import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; 3 import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
4 import com.canrd.webmagic.domain.dto.NatureArticleDO; 4 import com.canrd.webmagic.domain.dto.NatureArticleDO;
src/main/resources/log4j2-dev.xml
@@ -66,7 +66,7 @@ @@ -66,7 +66,7 @@
66 <ThresholdFilter level="debug"/> 66 <ThresholdFilter level="debug"/>
67 <appender-ref ref="RollingFileInfo" /> 67 <appender-ref ref="RollingFileInfo" />
68 </logger> 68 </logger>
69 - <logger name="com.canrd.shop" level="DEBUG" > 69 + <logger name="com.canrd.webmagic" level="DEBUG" >
70 <ThresholdFilter level="debug"/> 70 <ThresholdFilter level="debug"/>
71 <appender-ref ref="RollingFileInfo" /> 71 <appender-ref ref="RollingFileInfo" />
72 </logger> 72 </logger>
src/main/resources/log4j2-prod.xml
@@ -30,7 +30,7 @@ @@ -30,7 +30,7 @@
30 <!--</console>--> 30 <!--</console>-->
31 31
32 <!-- 这个会打印出所有的info及以下级别的信息,每次大小超过size,则这size大小的日志会自动存入按年份-月份建立的文件夹下面并进行压缩,作为存档 --> 32 <!-- 这个会打印出所有的info及以下级别的信息,每次大小超过size,则这size大小的日志会自动存入按年份-月份建立的文件夹下面并进行压缩,作为存档 -->
33 - <RollingFile name="RollingFileInfo" fileName="${sys:logging.path}/logs/overtime.log" filePattern="${sys:logging.path}/logs/$${date:yyyy-MM-dd}/info-%d{yyyy-MM-dd}-%i.log"> 33 + <RollingFile name="RollingFileInfo" fileName="${sys:logging.path}/logs/webmagic.log" filePattern="${sys:logging.path}/logs/$${date:yyyy-MM-dd}/info-%d{yyyy-MM-dd}-%i.log">
34 <!--控制台只输出level及以上级别的信息(onMatch),其他的直接拒绝(onMismatch) --> 34 <!--控制台只输出level及以上级别的信息(onMatch),其他的直接拒绝(onMismatch) -->
35 <ThresholdFilter level="info" onMatch="ACCEPT" onMismatch="DENY" /> 35 <ThresholdFilter level="info" onMatch="ACCEPT" onMismatch="DENY" />
36 <!--<Filters>--> 36 <!--<Filters>-->
@@ -65,7 +65,7 @@ @@ -65,7 +65,7 @@
65 <ThresholdFilter level="info"/> 65 <ThresholdFilter level="info"/>
66 <appender-ref ref="RollingFileInfo" /> 66 <appender-ref ref="RollingFileInfo" />
67 </logger> 67 </logger>
68 - <logger name="com.canrd.shop" level="info" > 68 + <logger name="com.canrd.webmagic" level="info" >
69 <ThresholdFilter level="info"/> 69 <ThresholdFilter level="info"/>
70 <appender-ref ref="RollingFileInfo" /> 70 <appender-ref ref="RollingFileInfo" />
71 </logger> 71 </logger>
src/main/resources/user-agent/User-Agents.txt 0 → 100644
  1 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
  2 +Opera/8.0 (Windows NT 5.1; U; en)
  3 +Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
  4 +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
  5 +Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
  6 +Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
  7 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
  8 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
  9 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
  10 +Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
  11 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
  12 +Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
  13 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
  14 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
  15 +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
  16 +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
  17 +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
  18 +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
  19 +Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
  20 +Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
  21 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
  22 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
0 \ No newline at end of file 23 \ No newline at end of file
src/test/java/com/canrd/webmagic/BaseTest.java 0 → 100644
  1 +package com.canrd.webmagic;
  2 +
  3 +import org.junit.runner.RunWith;
  4 +import org.springframework.boot.test.context.SpringBootTest;
  5 +import org.springframework.test.context.junit4.SpringRunner;
  6 +import org.springframework.transaction.annotation.Transactional;
  7 +
  8 +
  9 +@RunWith(SpringRunner.class)
  10 +@SpringBootTest(classes = Application.class)
  11 +//主动回滚测试产生的数据
  12 +@Transactional
  13 +public class BaseTest {
  14 +
  15 +
  16 +}
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java 0 → 100644
  1 +package com.canrd.webmagic.utils;
  2 +
  3 +import com.alibaba.fastjson.JSON;
  4 +import com.alibaba.fastjson.JSONArray;
  5 +import com.alibaba.fastjson.JSONObject;
  6 +import com.canrd.webmagic.BaseTest;
  7 +import com.canrd.webmagic.domain.dto.NatureArticleDO;
  8 +import com.canrd.webmagic.service.NatureArticleService;
  9 +import org.junit.Test;
  10 +
  11 +import javax.annotation.Resource;
  12 +import java.util.List;
  13 +
  14 +/**
  15 + * @author: xms
  16 + * @description: TODO
  17 + * @date: 2023/2/10 14:09
  18 + * @version: 1.0
  19 + */
  20 +public class DateTimeUtilTest extends BaseTest {
  21 +
  22 + @Resource
  23 + private NatureArticleService natureArticleService;
  24 +
  25 + @Test
  26 + public void export() {
  27 + List<NatureArticleDO> articleDOList = natureArticleService.list();
  28 + JSONArray array = new JSONArray();
  29 + for (NatureArticleDO articleDO : articleDOList) {
  30 + JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo());
  31 + array.addAll(jsonArray);
  32 + }
  33 + System.out.println("联系作者,邮箱");
  34 + for (Object o : array) {
  35 + JSONObject jsonObject = JSONObject.parseObject(JSON.toJSONString(o));
  36 + String authorEmailName = jsonObject.getString("authorEmailName");
  37 + String email = jsonObject.getString("email");
  38 + System.out.println(authorEmailName + "," + email);
  39 + }
  40 + }
  41 +}