Commit b1b31dc60cb0fdfde1fea1c80b6fb2493a00ab32
1 parent
836066f9
feat: 配置userAgent和IP代理池
Showing
11 changed files
with
348 additions
and
6 deletions
src/main/java/com/canrd/webmagic/controller/NatureArticleController.java
... | ... | @@ -4,7 +4,7 @@ import com.canrd.webmagic.common.constant.ServerResult; |
4 | 4 | import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | -import com.canrd.webmagic.processor.NatureArticlePipeline; | |
7 | +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | |
8 | 8 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
9 | 9 | import com.canrd.webmagic.service.NatureArticleService; |
10 | 10 | import org.springframework.validation.annotation.Validated; | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureSearchPageProcessor.java
... | ... | @@ -4,6 +4,8 @@ import com.alibaba.fastjson.JSONArray; |
4 | 4 | import com.alibaba.fastjson.JSONObject; |
5 | 5 | import com.canrd.webmagic.common.utils.StringUtils; |
6 | 6 | import com.canrd.webmagic.domain.dto.NatureArticleDO; |
7 | +import com.canrd.webmagic.processor.config.Agent; | |
8 | +import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | |
7 | 9 | import org.springframework.stereotype.Component; |
8 | 10 | import us.codecraft.webmagic.Page; |
9 | 11 | import us.codecraft.webmagic.Site; |
... | ... | @@ -24,9 +26,10 @@ import java.util.Objects; |
24 | 26 | */ |
25 | 27 | @Component |
26 | 28 | public class NatureSearchPageProcessor implements PageProcessor { |
29 | + private String agent = Agent.getRandom(); | |
27 | 30 | |
28 | 31 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
29 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); | |
32 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(agent); | |
30 | 33 | |
31 | 34 | /** |
32 | 35 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/Agent.java
0 → 100644
1 | +package com.canrd.webmagic.processor.config; | |
2 | + | |
3 | +import cn.hutool.core.io.resource.ClassPathResource; | |
4 | +import lombok.extern.slf4j.Slf4j; | |
5 | + | |
6 | +import java.io.BufferedReader; | |
7 | +import java.io.IOException; | |
8 | +import java.io.InputStream; | |
9 | +import java.io.InputStreamReader; | |
10 | +import java.util.ArrayList; | |
11 | +import java.util.List; | |
12 | +import java.util.Random; | |
13 | +import java.util.concurrent.locks.ReentrantReadWriteLock; | |
14 | + | |
15 | +/** | |
16 | + * @author: xms | |
17 | + * @description: TODO | |
18 | + * @date: 2024/4/9 10:28 | |
19 | + * @version: 1.0 | |
20 | + */ | |
21 | +@Slf4j | |
22 | +public class Agent { | |
23 | + private static final String AGENT_FILE_PATH = "user-agent/User-Agents.txt"; | |
24 | + private static ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); | |
25 | + private static List<String> agents; | |
26 | + | |
27 | + /** | |
28 | + * @return | |
29 | + */ | |
30 | + public static String getRandom() { | |
31 | + String random = getRandom(null); | |
32 | + log.info("Agent======================>" + random); | |
33 | + return random; | |
34 | + } | |
35 | + | |
36 | + /** | |
37 | + * @param agent | |
38 | + * @return | |
39 | + */ | |
40 | + private static String getRandom(String agent) { | |
41 | + try { | |
42 | + lock.readLock().lock(); | |
43 | + int size = agents.size(); | |
44 | + if (size == 0) { | |
45 | + return "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; | |
46 | + } | |
47 | + Random random = new Random(); | |
48 | + if (null != agent) { | |
49 | + return agent; | |
50 | + } else { | |
51 | + return agents.get(random.nextInt(size)); | |
52 | + } | |
53 | + } catch (Exception e) { | |
54 | + e.printStackTrace(); | |
55 | + return "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; | |
56 | + } finally { | |
57 | + lock.readLock().unlock(); | |
58 | + } | |
59 | + } | |
60 | + | |
61 | + static { | |
62 | + agents = new ArrayList<>(); | |
63 | + InputStream resourceAsStream = null; | |
64 | + InputStreamReader inputStreamReader = null; | |
65 | + BufferedReader bufferedReader = null; | |
66 | + try { | |
67 | + resourceAsStream = new ClassPathResource(AGENT_FILE_PATH).getStream(); | |
68 | + inputStreamReader = new InputStreamReader(resourceAsStream); | |
69 | + bufferedReader = new BufferedReader(inputStreamReader); | |
70 | + String len; | |
71 | + while ((len = bufferedReader.readLine()) != null) { | |
72 | + if (!len.matches("^#.*")) { | |
73 | + agents.add(len.trim()); | |
74 | + } | |
75 | + } | |
76 | + } catch (Exception e) { | |
77 | + e.printStackTrace(); | |
78 | + } finally { | |
79 | + if (null != bufferedReader) { | |
80 | + try { | |
81 | + bufferedReader.close(); | |
82 | + } catch (IOException e) { | |
83 | + e.printStackTrace(); | |
84 | + } | |
85 | + } | |
86 | + if (null != inputStreamReader) { | |
87 | + try { | |
88 | + inputStreamReader.close(); | |
89 | + } catch (IOException e) { | |
90 | + e.printStackTrace(); | |
91 | + } | |
92 | + } | |
93 | + if (null != resourceAsStream) { | |
94 | + try { | |
95 | + resourceAsStream.close(); | |
96 | + } catch (IOException e) { | |
97 | + e.printStackTrace(); | |
98 | + } | |
99 | + } | |
100 | + } | |
101 | + } | |
102 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/Downloader.java
0 → 100644
1 | +package com.canrd.webmagic.processor.config; | |
2 | + | |
3 | +import lombok.extern.slf4j.Slf4j; | |
4 | +import org.springframework.beans.factory.annotation.Autowired; | |
5 | +import org.springframework.data.redis.core.RedisTemplate; | |
6 | +import org.springframework.stereotype.Component; | |
7 | +import us.codecraft.webmagic.Request; | |
8 | +import us.codecraft.webmagic.downloader.HttpClientDownloader; | |
9 | +import us.codecraft.webmagic.proxy.Proxy; | |
10 | +import us.codecraft.webmagic.proxy.SimpleProxyProvider; | |
11 | + | |
12 | +import java.util.Random; | |
13 | + | |
14 | +/** | |
15 | + * @author: xms | |
16 | + * @description: TODO | |
17 | + * @date: 2024/4/9 10:37 | |
18 | + * @version: 1.0 | |
19 | + */ | |
20 | +@Slf4j | |
21 | +@Component | |
22 | +public class Downloader { | |
23 | + private static RedisTemplate redisTemplate; | |
24 | + | |
25 | + @Autowired | |
26 | + Downloader(RedisTemplate redisTemplate) { | |
27 | + Downloader.redisTemplate = redisTemplate; | |
28 | + } | |
29 | + | |
30 | + /** | |
31 | + * | |
32 | + * @return | |
33 | + */ | |
34 | + public static HttpClientDownloader newIpDownloader() { | |
35 | + HttpClientDownloader downloader = new HttpClientDownloader() { | |
36 | + @Override | |
37 | + protected void onError(Request request) { | |
38 | + String[] ips = newIp(); | |
39 | + setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); | |
40 | + } | |
41 | + }; | |
42 | + return downloader; | |
43 | + } | |
44 | + | |
45 | + static String[] newIp() { | |
46 | + Long size = redisTemplate.opsForList().size("ip"); | |
47 | + String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString(); | |
48 | + log.info("获取ip===========>" + ip); | |
49 | + String[] ips = ip.split(":"); | |
50 | + return ips; | |
51 | + } | |
52 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
0 → 100644
1 | +package com.canrd.webmagic.processor.config; | |
2 | + | |
3 | +import com.baomidou.mybatisplus.core.toolkit.StringUtils; | |
4 | +import org.apache.commons.io.IOUtils; | |
5 | +import org.jsoup.Jsoup; | |
6 | +import org.jsoup.nodes.Document; | |
7 | +import org.jsoup.nodes.Element; | |
8 | +import org.jsoup.select.Elements; | |
9 | +import org.springframework.beans.factory.annotation.Autowired; | |
10 | +import org.springframework.data.redis.core.RedisTemplate; | |
11 | +import org.springframework.scheduling.annotation.Scheduled; | |
12 | +import org.springframework.stereotype.Component; | |
13 | + | |
14 | +import java.io.IOException; | |
15 | +import java.io.InputStream; | |
16 | +import java.net.InetSocketAddress; | |
17 | +import java.net.Proxy; | |
18 | +import java.net.URL; | |
19 | +import java.net.URLConnection; | |
20 | +import java.util.List; | |
21 | + | |
22 | +/** | |
23 | + * @author: xms | |
24 | + * @description: TODO | |
25 | + * @date: 2024/4/9 10:35 | |
26 | + * @version: 1.0 | |
27 | + */ | |
28 | +@Component | |
29 | +public class UpdateIp { | |
30 | + | |
31 | + @Autowired | |
32 | + private RedisTemplate redisTemplate; | |
33 | + | |
34 | + @Scheduled(cron = "*/20 * * * * ?") | |
35 | + void update() { | |
36 | + List<String> range = redisTemplate.opsForList().range("ip", 0, -1); | |
37 | + for (String ip : range) { | |
38 | + if (ifUseless(ip)) { | |
39 | + System.err.println(ip + " 从redis移除"); | |
40 | + redisTemplate.opsForList().remove("ip", 0, ip); | |
41 | + } | |
42 | + } | |
43 | + } | |
44 | + | |
45 | + @Scheduled(cron = "*/15 * * * * ?") | |
46 | + void ips() { | |
47 | + String string = null; | |
48 | + try { | |
49 | + Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get(); | |
50 | + Elements tags = document.select("#ip_list > tbody > tr"); | |
51 | + for (Element element : tags) { | |
52 | + //取得ip地址节点 | |
53 | + Elements tdChilds = element.select("tr > td:nth-child(2)"); | |
54 | + //取得端口号节点 | |
55 | + Elements tcpd = element.select("tr > td:nth-child(3)"); | |
56 | + if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) { | |
57 | + string = tdChilds.text() + ":" + tcpd.text(); | |
58 | + if (!ifUseless(string)) { | |
59 | + List<String> range = redisTemplate.opsForList().range("ip", 0, -1); | |
60 | + if (!range.contains(string)) { | |
61 | + System.err.println(string + " 存进redis"); | |
62 | + if (redisTemplate.opsForList().size("ip") > 100) { | |
63 | + redisTemplate.opsForList().rightPopAndLeftPush("ip", string); | |
64 | + } else { | |
65 | + redisTemplate.opsForList().leftPush("ip", string); | |
66 | + } | |
67 | + } | |
68 | + } | |
69 | + } | |
70 | + } | |
71 | + } catch (IOException e) { | |
72 | + e.printStackTrace(); | |
73 | + } | |
74 | + } | |
75 | + | |
76 | + /** | |
77 | + * 无效的ip 返回true 有效的ip返回false | |
78 | + * | |
79 | + * @param ip | |
80 | + * @return | |
81 | + */ | |
82 | + boolean ifUseless(String ip) { | |
83 | + String[] split = ip.split(":"); | |
84 | + URL url = null; | |
85 | + try { | |
86 | + url = new URL("http://www.baidu.com"); | |
87 | + InetSocketAddress addr = new InetSocketAddress(split[0], Integer.parseInt(split[1])); | |
88 | + Proxy proxy = new Proxy(Proxy.Type.HTTP, addr); | |
89 | + InputStream in = null; | |
90 | + try { | |
91 | + URLConnection conn = url.openConnection(proxy); | |
92 | + conn.setConnectTimeout(2000); | |
93 | + in = conn.getInputStream(); | |
94 | + } catch (Exception e) { | |
95 | + return true; | |
96 | + } | |
97 | + String s = IOUtils.toString(in); | |
98 | + if (s.indexOf("baidu") > 0) { | |
99 | + return false; | |
100 | + } | |
101 | + return true; | |
102 | + } catch (Exception e) { | |
103 | + return true; | |
104 | + } | |
105 | + } | |
106 | +} | ... | ... |
src/main/java/com/canrd/webmagic/processor/NatureArticlePipeline.java renamed to src/main/java/com/canrd/webmagic/processor/pipeline/NatureArticlePipeline.java
src/main/resources/log4j2-dev.xml
... | ... | @@ -66,7 +66,7 @@ |
66 | 66 | <ThresholdFilter level="debug"/> |
67 | 67 | <appender-ref ref="RollingFileInfo" /> |
68 | 68 | </logger> |
69 | - <logger name="com.canrd.shop" level="DEBUG" > | |
69 | + <logger name="com.canrd.webmagic" level="DEBUG" > | |
70 | 70 | <ThresholdFilter level="debug"/> |
71 | 71 | <appender-ref ref="RollingFileInfo" /> |
72 | 72 | </logger> | ... | ... |
src/main/resources/log4j2-prod.xml
... | ... | @@ -30,7 +30,7 @@ |
30 | 30 | <!--</console>--> |
31 | 31 | |
32 | 32 | <!-- 这个会打印出所有的info及以下级别的信息,每次大小超过size,则这size大小的日志会自动存入按年份-月份建立的文件夹下面并进行压缩,作为存档 --> |
33 | - <RollingFile name="RollingFileInfo" fileName="${sys:logging.path}/logs/overtime.log" filePattern="${sys:logging.path}/logs/$${date:yyyy-MM-dd}/info-%d{yyyy-MM-dd}-%i.log"> | |
33 | + <RollingFile name="RollingFileInfo" fileName="${sys:logging.path}/logs/webmagic.log" filePattern="${sys:logging.path}/logs/$${date:yyyy-MM-dd}/info-%d{yyyy-MM-dd}-%i.log"> | |
34 | 34 | <!--控制台只输出level及以上级别的信息(onMatch),其他的直接拒绝(onMismatch) --> |
35 | 35 | <ThresholdFilter level="info" onMatch="ACCEPT" onMismatch="DENY" /> |
36 | 36 | <!--<Filters>--> |
... | ... | @@ -65,7 +65,7 @@ |
65 | 65 | <ThresholdFilter level="info"/> |
66 | 66 | <appender-ref ref="RollingFileInfo" /> |
67 | 67 | </logger> |
68 | - <logger name="com.canrd.shop" level="info" > | |
68 | + <logger name="com.canrd.webmagic" level="info" > | |
69 | 69 | <ThresholdFilter level="info"/> |
70 | 70 | <appender-ref ref="RollingFileInfo" /> |
71 | 71 | </logger> | ... | ... |
src/main/resources/user-agent/User-Agents.txt
0 → 100644
1 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60 | |
2 | +Opera/8.0 (Windows NT 5.1; U; en) | |
3 | +Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50 | |
4 | +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 | |
5 | +Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 | |
6 | +Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10 | |
7 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 | |
8 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 | |
9 | +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 | |
10 | +Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16 | |
11 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 | |
12 | +Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | |
13 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11 | |
14 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER | |
15 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) | |
16 | +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER) | |
17 | +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) | |
18 | +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) | |
19 | +Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 | |
20 | +Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) | |
21 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 | |
22 | +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | |
0 | 23 | \ No newline at end of file | ... | ... |
src/test/java/com/canrd/webmagic/BaseTest.java
0 → 100644
1 | +package com.canrd.webmagic; | |
2 | + | |
3 | +import org.junit.runner.RunWith; | |
4 | +import org.springframework.boot.test.context.SpringBootTest; | |
5 | +import org.springframework.test.context.junit4.SpringRunner; | |
6 | +import org.springframework.transaction.annotation.Transactional; | |
7 | + | |
8 | + | |
9 | +@RunWith(SpringRunner.class) | |
10 | +@SpringBootTest(classes = Application.class) | |
11 | +// Roll back the data produced by each test automatically. | |
12 | +@Transactional | |
13 | +public class BaseTest { | |
14 | + | |
15 | + | |
16 | +} | ... | ... |
src/test/java/com/canrd/webmagic/utils/DateTimeUtilTest.java
0 → 100644
1 | +package com.canrd.webmagic.utils; | |
2 | + | |
3 | +import com.alibaba.fastjson.JSON; | |
4 | +import com.alibaba.fastjson.JSONArray; | |
5 | +import com.alibaba.fastjson.JSONObject; | |
6 | +import com.canrd.webmagic.BaseTest; | |
7 | +import com.canrd.webmagic.domain.dto.NatureArticleDO; | |
8 | +import com.canrd.webmagic.service.NatureArticleService; | |
9 | +import org.junit.Test; | |
10 | + | |
11 | +import javax.annotation.Resource; | |
12 | +import java.util.List; | |
13 | + | |
14 | +/** | |
15 | + * @author: xms | |
16 | + * @description: TODO | |
17 | + * @date: 2023/2/10 14:09 | |
18 | + * @version: 1.0 | |
19 | + */ | |
20 | +public class DateTimeUtilTest extends BaseTest { | |
21 | + | |
22 | + @Resource | |
23 | + private NatureArticleService natureArticleService; | |
24 | + | |
25 | + @Test | |
26 | + public void export() { | |
27 | + List<NatureArticleDO> articleDOList = natureArticleService.list(); | |
28 | + JSONArray array = new JSONArray(); | |
29 | + for (NatureArticleDO articleDO : articleDOList) { | |
30 | + JSONArray jsonArray = JSONArray.parseArray(articleDO.getEmailInfo()); | |
31 | + array.addAll(jsonArray); | |
32 | + } | |
33 | + System.out.println("联系作者,邮箱"); | |
34 | + for (Object o : array) { | |
35 | + JSONObject jsonObject = JSONObject.parseObject(JSON.toJSONString(o)); | |
36 | + String authorEmailName = jsonObject.getString("authorEmailName"); | |
37 | + String email = jsonObject.getString("email"); | |
38 | + System.out.println(authorEmailName + "," + email); | |
39 | + } | |
40 | + } | |
41 | +} | ... | ... |