Commit 0c4251f275d9fc307a18ba4114fa122c296069e0

Authored by 谢茂盛
1 parent 89ec05fb

feat: nature article爬取

src/main/java/com/canrd/webmagic/controller/ArticleController.java
@@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; @@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 import com.canrd.webmagic.processor.NatureSearchPageProcessor; 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.config.Downloader;
8 import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; 9 import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
9 import com.canrd.webmagic.service.ArticleService; 10 import com.canrd.webmagic.service.ArticleService;
10 import org.springframework.validation.annotation.Validated; 11 import org.springframework.validation.annotation.Validated;
@@ -34,6 +35,9 @@ public class ArticleController { @@ -34,6 +35,9 @@ public class ArticleController {
34 @Resource 35 @Resource
35 private NatureArticlePipeline articlePipeline; 36 private NatureArticlePipeline articlePipeline;
36 37
  38 + @Resource
  39 + private Downloader downloader;
  40 +
37 /** 41 /**
38 * @return 42 * @return
39 */ 43 */
@@ -45,6 +49,7 @@ public class ArticleController { @@ -45,6 +49,7 @@ public class ArticleController {
45 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) 49 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
46 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) 50 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
47 .addPipeline(articlePipeline) 51 .addPipeline(articlePipeline)
  52 + .setDownloader(downloader.newIpDownloader())
48 // 开启5个线程执行,并开始爬取 53 // 开启5个线程执行,并开始爬取
49 .thread(5).run(); 54 .thread(5).run();
50 } 55 }
src/main/java/com/canrd/webmagic/processor/config/Downloader.java
@@ -28,10 +28,9 @@ public class Downloader { @@ -28,10 +28,9 @@ public class Downloader {
28 } 28 }
29 29
30 /** 30 /**
31 - *  
32 * @return 31 * @return
33 */ 32 */
34 - public static HttpClientDownloader newIpDownloader() { 33 + public HttpClientDownloader newIpDownloader() {
35 HttpClientDownloader downloader = new HttpClientDownloader() { 34 HttpClientDownloader downloader = new HttpClientDownloader() {
36 @Override 35 @Override
37 protected void onError(Request request) { 36 protected void onError(Request request) {
@@ -39,10 +38,12 @@ public class Downloader { @@ -39,10 +38,12 @@ public class Downloader {
39 setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); 38 setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
40 } 39 }
41 }; 40 };
  41 + String[] ips = newIp();
  42 + downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
42 return downloader; 43 return downloader;
43 } 44 }
44 45
45 - static String[] newIp() { 46 + private String[] newIp() {
46 Long size = redisTemplate.opsForList().size("ip"); 47 Long size = redisTemplate.opsForList().size("ip");
47 String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString(); 48 String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString();
48 log.info("获取ip===========>" + ip); 49 log.info("获取ip===========>" + ip);
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
1 package com.canrd.webmagic.processor.config; 1 package com.canrd.webmagic.processor.config;
2 2
3 -import com.baomidou.mybatisplus.core.toolkit.StringUtils;  
4 import org.apache.commons.io.IOUtils; 3 import org.apache.commons.io.IOUtils;
5 import org.jsoup.Jsoup; 4 import org.jsoup.Jsoup;
6 import org.jsoup.nodes.Document; 5 import org.jsoup.nodes.Document;
@@ -31,7 +30,7 @@ public class UpdateIp { @@ -31,7 +30,7 @@ public class UpdateIp {
31 @Autowired 30 @Autowired
32 private RedisTemplate redisTemplate; 31 private RedisTemplate redisTemplate;
33 32
34 -// @Scheduled(cron = "*/20 * * * * ?") 33 + @Scheduled(cron = "*/20 * * * * ?")
35 void update() { 34 void update() {
36 List<String> range = redisTemplate.opsForList().range("ip", 0, -1); 35 List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
37 for (String ip : range) { 36 for (String ip : range) {
@@ -44,25 +43,22 @@ public class UpdateIp { @@ -44,25 +43,22 @@ public class UpdateIp {
44 43
45 // @Scheduled(cron = "*/15 * * * * ?") 44 // @Scheduled(cron = "*/15 * * * * ?")
46 void ips() { 45 void ips() {
47 - String string = null;  
48 try { 46 try {
49 - Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get();  
50 - Elements tags = document.select("#ip_list > tbody > tr");  
51 - for (Element element : tags) {  
52 - //取得ip地址节点  
53 - Elements tdChilds = element.select("tr > td:nth-child(2)");  
54 - //取得端口号节点  
55 - Elements tcpd = element.select("tr > td:nth-child(3)");  
56 - if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) {  
57 - string = tdChilds.text() + ":" + tcpd.text();  
58 - if (!ifUseless(string)) { 47 + for (int i = 1; i < 10; i++) {
  48 + Document document = Jsoup.connect("https://www.zdaye.com/free/" + i + "/?sAdr=taiwan").timeout(3000).get();
  49 + Elements tags = document.selectXpath("//table[@id='ipc']/tbody/tr");
  50 + for (Element element : tags) {
  51 + String ip = element.getElementsByTag("td").get(0).text();
  52 + String port = document.selectXpath("//table[@id='ipc']/tbody/tr").get(0).getElementsByTag("td").get(1).text();
  53 + String uri = ip + ":" + port;
  54 + if (!ifUseless(uri)) {
59 List<String> range = redisTemplate.opsForList().range("ip", 0, -1); 55 List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
60 - if (!range.contains(string)) {  
61 - System.err.println(string + " 存进redis"); 56 + if (!range.contains(uri)) {
  57 + System.err.println(uri + " 存进redis");
62 if (redisTemplate.opsForList().size("ip") > 100) { 58 if (redisTemplate.opsForList().size("ip") > 100) {
63 - redisTemplate.opsForList().rightPopAndLeftPush("ip", string); 59 + redisTemplate.opsForList().rightPopAndLeftPush("ip", uri);
64 } else { 60 } else {
65 - redisTemplate.opsForList().leftPush("ip", string); 61 + redisTemplate.opsForList().leftPush("ip", uri);
66 } 62 }
67 } 63 }
68 } 64 }