Commit 0c4251f275d9fc307a18ba4114fa122c296069e0
1 parent
89ec05fb
feat: nature article爬取
Showing
3 changed files
with
22 additions
and
20 deletions
src/main/java/com/canrd/webmagic/controller/ArticleController.java
... | ... | @@ -5,6 +5,7 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | 7 | import com.canrd.webmagic.processor.NatureSearchPageProcessor; |
8 | +import com.canrd.webmagic.processor.config.Downloader; | |
8 | 9 | import com.canrd.webmagic.processor.pipeline.NatureArticlePipeline; |
9 | 10 | import com.canrd.webmagic.service.ArticleService; |
10 | 11 | import org.springframework.validation.annotation.Validated; |
... | ... | @@ -34,6 +35,9 @@ public class ArticleController { |
34 | 35 | @Resource |
35 | 36 | private NatureArticlePipeline articlePipeline; |
36 | 37 | |
38 | + @Resource | |
39 | + private Downloader downloader; | |
40 | + | |
37 | 41 | /** |
38 | 42 | * @return |
39 | 43 | */ |
... | ... | @@ -45,6 +49,7 @@ public class ArticleController { |
45 | 49 | .addUrl("https://www.nature.com/search?q=" + keyword + "&page=" + i) |
46 | 50 | .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) |
47 | 51 | .addPipeline(articlePipeline) |
52 | + .setDownloader(downloader.newIpDownloader()) | |
48 | 53 | // 开启5个线程执行,并开始爬取 |
49 | 54 | .thread(5).run(); |
50 | 55 | } | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/Downloader.java
... | ... | @@ -28,10 +28,9 @@ public class Downloader { |
28 | 28 | } |
29 | 29 | |
30 | 30 | /** |
31 | - * | |
32 | 31 | * @return |
33 | 32 | */ |
34 | - public static HttpClientDownloader newIpDownloader() { | |
33 | + public HttpClientDownloader newIpDownloader() { | |
35 | 34 | HttpClientDownloader downloader = new HttpClientDownloader() { |
36 | 35 | @Override |
37 | 36 | protected void onError(Request request) { |
... | ... | @@ -39,10 +38,12 @@ public class Downloader { |
39 | 38 | setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); |
40 | 39 | } |
41 | 40 | }; |
41 | + String[] ips = newIp(); | |
42 | + downloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1])))); | |
42 | 43 | return downloader; |
43 | 44 | } |
44 | 45 | |
45 | - static String[] newIp() { | |
46 | + private String[] newIp() { | |
46 | 47 | Long size = redisTemplate.opsForList().size("ip"); |
47 | 48 | String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString(); |
48 | 49 | log.info("获取ip===========>" + ip); | ... | ... |
src/main/java/com/canrd/webmagic/processor/config/UpdateIp.java
1 | 1 | package com.canrd.webmagic.processor.config; |
2 | 2 | |
3 | -import com.baomidou.mybatisplus.core.toolkit.StringUtils; | |
4 | 3 | import org.apache.commons.io.IOUtils; |
5 | 4 | import org.jsoup.Jsoup; |
6 | 5 | import org.jsoup.nodes.Document; |
... | ... | @@ -31,7 +30,7 @@ public class UpdateIp { |
31 | 30 | @Autowired |
32 | 31 | private RedisTemplate redisTemplate; |
33 | 32 | |
34 | -// @Scheduled(cron = "*/20 * * * * ?") | |
33 | + @Scheduled(cron = "*/20 * * * * ?") | |
35 | 34 | void update() { |
36 | 35 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); |
37 | 36 | for (String ip : range) { |
... | ... | @@ -44,25 +43,22 @@ public class UpdateIp { |
44 | 43 | |
45 | 44 | // @Scheduled(cron = "*/15 * * * * ?") |
46 | 45 | void ips() { |
47 | - String string = null; | |
48 | 46 | try { |
49 | - Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get(); | |
50 | - Elements tags = document.select("#ip_list > tbody > tr"); | |
51 | - for (Element element : tags) { | |
52 | - //取得ip地址节点 | |
53 | - Elements tdChilds = element.select("tr > td:nth-child(2)"); | |
54 | - //取得端口号节点 | |
55 | - Elements tcpd = element.select("tr > td:nth-child(3)"); | |
56 | - if (StringUtils.isNotBlank(tdChilds.text()) && StringUtils.isNotBlank(tcpd.text())) { | |
57 | - string = tdChilds.text() + ":" + tcpd.text(); | |
58 | - if (!ifUseless(string)) { | |
47 | + for (int i = 1; i < 10; i++) { | |
48 | + Document document = Jsoup.connect("https://www.zdaye.com/free/" + i + "/?sAdr=taiwan").timeout(3000).get(); | |
49 | + Elements tags = document.selectXpath("//table[@id='ipc']/tbody/tr"); | |
50 | + for (Element element : tags) { | |
51 | + String ip = element.getElementsByTag("td").get(0).text(); | |
52 | + String port = document.selectXpath("//table[@id='ipc']/tbody/tr").get(0).getElementsByTag("td").get(1).text(); | |
53 | + String uri = ip + ":" + port; | |
54 | + if (!ifUseless(uri)) { | |
59 | 55 | List<String> range = redisTemplate.opsForList().range("ip", 0, -1); |
60 | - if (!range.contains(string)) { | |
61 | - System.err.println(string + " 存进redis"); | |
56 | + if (!range.contains(uri)) { | |
57 | + System.err.println(uri + " 存进redis"); | |
62 | 58 | if (redisTemplate.opsForList().size("ip") > 100) { |
63 | - redisTemplate.opsForList().rightPopAndLeftPush("ip", string); | |
59 | + redisTemplate.opsForList().rightPopAndLeftPush("ip", uri); | |
64 | 60 | } else { |
65 | - redisTemplate.opsForList().leftPush("ip", string); | |
61 | + redisTemplate.opsForList().leftPush("ip", uri); | |
66 | 62 | } |
67 | 63 | } |
68 | 64 | } | ... | ... |