Commit 0c4251f275d9fc307a18ba4114fa122c296069e0

Authored by 谢茂盛
1 parent 89ec05fb

feat: nature article爬取

src/main/java/com/canrd/webmagic/controller/ArticleController.java
... ... @@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 7 import com.canrd.webmagic.processor.NatureSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.config.Downloader;
8 9 import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline;
9 10 import com.canrd.webmagic.service.ArticleService;
10 11 import org.springframework.validation.annotation.Validated;
... ... @@ -34,6 +35,9 @@ public class ArticleController {
34 35 @Resource
35 36 private NatureArticlePipeline articlePipeline;
36 37  
  38 + @Resource
  39 + private Downloader downloader;
  40 +
37 41 /**
38 42 * @return
39 43 */
... ... @@ -45,6 +49,7 @@ public class ArticleController {
45 49 .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i)
46 50 .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i)
47 51 .addPipeline(articlePipeline)
  52 + .setDownloader(downloader.newIpDownloader())
48 53 // 开启5个线程执行,并开始爬取
49 54 .thread(5).run();
50 55 }
... ...
src/main/java/com/canrd/webmagic/processor/config/Downloader.java
... ... @@ -28,10 +28,9 @@ public class Downloader {
28 28 }
29 29  
30 30 /**
31   - *
32 31 * @return
33 32 */
34   - public static HttpClientDownloader newIpDownloader() {
  33 + public HttpClientDownloader newIpDownloader() {
35 34 HttpClientDownloader downloader = new HttpClientDownloader() {
36 35 @Override
37 36 protected void onError(Request request) {
... ... @@ -39,10 +38,12 @@ public class Downloader {
39 38 setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
40 39 }
41 40 };
  41 + String[] ips = newIp();
  42 + downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
42 43 return downloader;
43 44 }
44 45  
45   - static String[] newIp() {
  46 + private String[] newIp() {
46 47 Long size = redisTemplate.opsForList().size("ip");
47 48 String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString();
48 49 log.info("获取ip===========>" + ip);
... ...
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
1 1 package com.canrd.webmagic.processor.config;
2 2  
3   -import com.baomidou.mybatisplus.core.toolkit.StringUtils;
4 3 import org.apache.commons.io.IOUtils;
5 4 import org.jsoup.Jsoup;
6 5 import org.jsoup.nodes.Document;
... ... @@ -31,7 +30,7 @@ public class UpdateIp {
31 30 @Autowired
32 31 private RedisTemplate redisTemplate;
33 32  
34   -// @Scheduled(cron = "*/20 * * * * ?")
  33 + @Scheduled(cron = "*/20 * * * * ?")
35 34 void update() {
36 35 List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
37 36 for (String ip : range) {
... ... @@ -44,25 +43,22 @@ public class UpdateIp {
44 43  
45 44 // @Scheduled(cron = "*/15 * * * * ?")
46 45 void ips() {
47   - String string = null;
48 46 try {
49   - Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get();
50   - Elements tags = document.select("#ip_list > tbody > tr");
51   - for (Element element : tags) {
52   - //取得ip地址节点
53   - Elements tdChilds = element.select("tr > td:nth-child(2)");
54   - //取得端口号节点
55   - Elements tcpd = element.select("tr > td:nth-child(3)");
56   - if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) {
57   - string = tdChilds.text() + ":" + tcpd.text();
58   - if (!ifUseless(string)) {
  47 + for (int i = 1; i < 10; i++) {
  48 + Document document = Jsoup.connect("https://www.zdaye.com/free/" + i + "/?sAdr=taiwan").timeout(3000).get();
  49 + Elements tags = document.selectXpath("//table[@id='ipc']/tbody/tr");
  50 + for (Element element : tags) {
  51 + String ip = element.getElementsByTag("td").get(0).text();
  52 + String port = document.selectXpath("//table[@id='ipc']/tbody/tr").get(0).getElementsByTag("td").get(1).text();
  53 + String uri = ip + ":" + port;
  54 + if (!ifUseless(uri)) {
59 55 List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
60   - if (!range.contains(string)) {
61   - System.err.println(string + " 存进redis");
  56 + if (!range.contains(uri)) {
  57 + System.err.println(uri + " 存进redis");
62 58 if (redisTemplate.opsForList().size("ip") > 100) {
63   - redisTemplate.opsForList().rightPopAndLeftPush("ip", string);
  59 + redisTemplate.opsForList().rightPopAndLeftPush("ip", uri);
64 60 } else {
65   - redisTemplate.opsForList().leftPush("ip", string);
  61 + redisTemplate.opsForList().leftPush("ip", uri);
66 62 }
67 63 }
68 64 }
... ...