Commit 0c4251f275d9fc307a18ba4114fa122c296069e0
1 parent: 89ec05fb
feat: nature article爬取
Showing 3 changed files with 22 additions and 20 deletions
src/main/java/com/canrd/webmagic/controller/ArticleController.java
@@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | @@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | ||
5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | +import com.canrd.webmagic.processor.config.Downloader; | ||
8 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; | 9 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; |
9 | import com.canrd.webmagic.service.ArticleService; | 10 | import com.canrd.webmagic.service.ArticleService; |
10 | import org.springframework.validation.annotation.Validated; | 11 | import org.springframework.validation.annotation.Validated; |
@@ -34,6 +35,9 @@ public class ArticleController { | @@ -34,6 +35,9 @@ public class ArticleController { | ||
34 | @Resource | 35 | @Resource |
35 | private NatureArticlePipeline articlePipeline; | 36 | private NatureArticlePipeline articlePipeline; |
36 | 37 | ||
38 | + @Resource | ||
39 | + private Downloader downloader; | ||
40 | + | ||
37 | /** | 41 | /** |
38 | * @return | 42 | * @return |
39 | */ | 43 | */ |
@@ -45,6 +49,7 @@ public class ArticleController { | @@ -45,6 +49,7 @@ public class ArticleController { | ||
45 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) | 49 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) |
46 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) | 50 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
47 | .addPipeline(articlePipeline) | 51 | .addPipeline(articlePipeline) |
52 | + .setDownloader(downloader.newIpDownloader()) | ||
48 | // 开启5个线程执行,并开始爬取 | 53 | // 开启5个线程执行,并开始爬取 |
49 | .thread(5).run(); | 54 | .thread(5).run(); |
50 | } | 55 | } |
src/main/java/com/canrd/webmagic/processor/config/Downloader.java
@@ -28,10 +28,9 @@ public class Downloader { | @@ -28,10 +28,9 @@ public class Downloader { | ||
28 | } | 28 | } |
29 | 29 | ||
30 | /** | 30 | /** |
31 | - * | ||
32 | * @return | 31 | * @return |
33 | */ | 32 | */ |
34 | - public static HttpClientDownloader newIpDownloader() { | 33 | + public HttpClientDownloader newIpDownloader() { |
35 | HttpClientDownloader downloader = new HttpClientDownloader() { | 34 | HttpClientDownloader downloader = new HttpClientDownloader() { |
36 | @Override | 35 | @Override |
37 | protected void onError(Request request) { | 36 | protected void onError(Request request) { |
@@ -39,10 +38,12 @@ public class Downloader { | @@ -39,10 +38,12 @@ public class Downloader { | ||
39 | setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); | 38 | setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); |
40 | } | 39 | } |
41 | }; | 40 | }; |
41 | + String[] ips = newIp(); | ||
42 | + downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); | ||
42 | return downloader; | 43 | return downloader; |
43 | } | 44 | } |
44 | 45 | ||
45 | - static String[] newIp() { | 46 | + private String[] newIp() { |
46 | Long size = redisTemplate.opsForList().size("ip"); | 47 | Long size = redisTemplate.opsForList().size("ip"); |
47 | String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString(); | 48 | String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString(); |
48 | log.info("获取ip===========>" + ip); | 49 | log.info("获取ip===========>" + ip); |
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
1 | package com.canrd.webmagic.processor.config; | 1 | package com.canrd.webmagic.processor.config; |
2 | 2 | ||
3 | -import com.baomidou.mybatisplus.core.toolkit.StringUtils; | ||
4 | import org.apache.commons.io.IOUtils; | 3 | import org.apache.commons.io.IOUtils; |
5 | import org.jsoup.Jsoup; | 4 | import org.jsoup.Jsoup; |
6 | import org.jsoup.nodes.Document; | 5 | import org.jsoup.nodes.Document; |
@@ -31,7 +30,7 @@ public class UpdateIp { | @@ -31,7 +30,7 @@ public class UpdateIp { | ||
31 | @Autowired | 30 | @Autowired |
32 | private RedisTemplate redisTemplate; | 31 | private RedisTemplate redisTemplate; |
33 | 32 | ||
34 | -// @Scheduled(cron = "*/20 * * * * ?") | 33 | + @Scheduled(cron = "*/20 * * * * ?") |
35 | void update() { | 34 | void update() { |
36 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); | 35 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); |
37 | for (String ip : range) { | 36 | for (String ip : range) { |
@@ -44,25 +43,22 @@ public class UpdateIp { | @@ -44,25 +43,22 @@ public class UpdateIp { | ||
44 | 43 | ||
45 | // @Scheduled(cron = "*/15 * * * * ?") | 44 | // @Scheduled(cron = "*/15 * * * * ?") |
46 | void ips() { | 45 | void ips() { |
47 | - String string = null; | ||
48 | try { | 46 | try { |
49 | - Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get(); | ||
50 | - Elements tags = document.select("#ip_list > tbody > tr"); | ||
51 | - for (Element element : tags) { | ||
52 | - //取得ip地址节点 | ||
53 | - Elements tdChilds = element.select("tr > td:nth-child(2)"); | ||
54 | - //取得端口号节点 | ||
55 | - Elements tcpd = element.select("tr > td:nth-child(3)"); | ||
56 | - if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) { | ||
57 | - string = tdChilds.text() + ":" + tcpd.text(); | ||
58 | - if (!ifUseless(string)) { | 47 | + for (int i = 1; i < 10; i++) { |
48 | + Document document = Jsoup.connect("https://www.zdaye.com/free/" + i + "/?sAdr=taiwan").timeout(3000).get(); | ||
49 | + Elements tags = document.selectXpath("//table[@id='ipc']/tbody/tr"); | ||
50 | + for (Element element : tags) { | ||
51 | + String ip = element.getElementsByTag("td").get(0).text(); | ||
52 | + String port = document.selectXpath("//table[@id='ipc']/tbody/tr").get(0).getElementsByTag("td").get(1).text(); | ||
53 | + String uri = ip + ":" + port; | ||
54 | + if (!ifUseless(uri)) { | ||
59 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); | 55 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); |
60 | - if (!range.contains(string)) { | ||
61 | - System.err.println(string + " 存进redis"); | 56 | + if (!range.contains(uri)) { |
57 | + System.err.println(uri + " 存进redis"); | ||
62 | if (redisTemplate.opsForList().size("ip") > 100) { | 58 | if (redisTemplate.opsForList().size("ip") > 100) { |
63 | - redisTemplate.opsForList().rightPopAndLeftPush("ip", string); | 59 | + redisTemplate.opsForList().rightPopAndLeftPush("ip", uri); |
64 | } else { | 60 | } else { |
65 | - redisTemplate.opsForList().leftPush("ip", string); | 61 | + redisTemplate.opsForList().leftPush("ip", uri); |
66 | } | 62 | } |
67 | } | 63 | } |
68 | } | 64 | } |