Commit 174a7c30cdf0b758bbd294c8a49d1cdcd0b3b171

Authored by 谢茂盛
1 parent b7789621

feat:

1、selenium 整合
2、science 网站
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 package com.canrd.webmagic.config; 1 package com.canrd.webmagic.config;
2 2
3 -import com.canrd.webmagic.processor.config.Agent;  
4 import org.openqa.selenium.WebDriver; 3 import org.openqa.selenium.WebDriver;
5 import org.openqa.selenium.chrome.ChromeDriver; 4 import org.openqa.selenium.chrome.ChromeDriver;
6 import org.openqa.selenium.chrome.ChromeOptions; 5 import org.openqa.selenium.chrome.ChromeOptions;
@@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration; @@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration;
16 @Configuration 15 @Configuration
17 public class SeleniumConfig { 16 public class SeleniumConfig {
18 17
19 - @Bean  
20 - public WebDriver webDriver() { 18 +// @Bean
  19 + public WebDriver webDriver() throws InterruptedException {
21 System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); 20 System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
22 - return new ChromeDriver(); 21 + // 初始化ChromeOptions
  22 + ChromeOptions options = new ChromeOptions();
  23 + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
  24 +// options.addArguments("--proxy-server=http://proxy-server:port");
  25 +
  26 + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
  27 + options.addArguments("--disable-javascript");
  28 +
  29 + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
  30 + options.addArguments("--disable-extensions");
  31 +
  32 + // 禁用本地缓存,确保每次访问都从服务器获取
  33 + options.addArguments("--disable-application-cache");
  34 +
  35 + options.setBinary("D:\\chrome\\chrome-win64\\chrome-win64\\chrome.exe");
  36 +
  37 + return new ChromeDriver(options);
23 } 38 }
24 } 39 }
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
@@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; @@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 import com.canrd.webmagic.domain.vo.NatureArticleVO; 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; 7 import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
8 import com.canrd.webmagic.service.ArticleService; 9 import com.canrd.webmagic.service.ArticleService;
  10 +import org.apache.logging.log4j.core.util.UuidUtil;
9 import org.springframework.validation.annotation.Validated; 11 import org.springframework.validation.annotation.Validated;
10 import org.springframework.web.bind.annotation.*; 12 import org.springframework.web.bind.annotation.*;
11 import us.codecraft.webmagic.Spider; 13 import us.codecraft.webmagic.Spider;
@@ -30,6 +32,9 @@ public class Science4JournalController { @@ -30,6 +32,9 @@ public class Science4JournalController {
30 @Resource 32 @Resource
31 private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; 33 private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor;
32 34
  35 + @Resource
  36 + private SeleniumDownloader seleniumDownloader;
  37 +
33 /** 38 /**
34 * @return 39 * @return
35 */ 40 */
@@ -37,8 +42,10 @@ public class Science4JournalController { @@ -37,8 +42,10 @@ public class Science4JournalController {
37 public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { 42 public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
38 for (int i = 0; i <= indexSize; i++) { 43 for (int i = 0; i <= indexSize; i++) {
39 Spider.create(science4JournalSearchPageProcessor) 44 Spider.create(science4JournalSearchPageProcessor)
40 - .addUrl("http://www.science.org/journal/science/insights?startPage=" + i) 45 + .addUrl("https://www.science.org/journal/science/insights?startPage=" + i)
41 // 开启5个线程执行,并开始爬取 46 // 开启5个线程执行,并开始爬取
  47 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  48 + .setDownloader(seleniumDownloader)
42 .thread(5).run(); 49 .thread(5).run();
43 } 50 }
44 51
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
@@ -32,10 +32,9 @@ import java.util.Objects; @@ -32,10 +32,9 @@ import java.util.Objects;
32 @Slf4j 32 @Slf4j
33 @Component 33 @Component
34 public class Science4JournalArticlePageProcessor implements PageProcessor { 34 public class Science4JournalArticlePageProcessor implements PageProcessor {
35 - private String agent = Agent.getRandom();  
36 35
37 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 36 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
38 - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); 37 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
39 38
40 /** 39 /**
41 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 40 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
@@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { @@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
108 return site; 107 return site;
109 } 108 }
110 109
  110 + public void setSite(Site site) {
  111 + this.site = site;
  112 + }
  113 +
111 public static void main(String[] args) { 114 public static void main(String[] args) {
112 // 创建一个Spider,并把我们的处理器放进去 115 // 创建一个Spider,并把我们的处理器放进去
113 Spider.create(new Science4JournalArticlePageProcessor()) 116 Spider.create(new Science4JournalArticlePageProcessor())
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
@@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent; @@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent;
6 import com.canrd.webmagic.processor.download.SeleniumDownloader; 6 import com.canrd.webmagic.processor.download.SeleniumDownloader;
7 import com.canrd.webmagic.processor.pipeline.ArticlePipeline; 7 import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
8 import lombok.extern.slf4j.Slf4j; 8 import lombok.extern.slf4j.Slf4j;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
9 import org.springframework.stereotype.Component; 10 import org.springframework.stereotype.Component;
10 import us.codecraft.webmagic.Page; 11 import us.codecraft.webmagic.Page;
11 import us.codecraft.webmagic.Site; 12 import us.codecraft.webmagic.Site;
@@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { @@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
43 /** 44 /**
44 * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 45 * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
45 */ 46 */
46 - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); 47 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
47 48
48 /** 49 /**
49 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 50 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
@@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { @@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
84 .addUrl(link) 85 .addUrl(link)
85 .addPipeline(articlePipeline) 86 .addPipeline(articlePipeline)
86 .setDownloader(seleniumDownloader) 87 .setDownloader(seleniumDownloader)
  88 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
87 // 开启5个线程执行,并开始爬取 89 // 开启5个线程执行,并开始爬取
88 .thread(1).run(); 90 .thread(1).run();
89 log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); 91 log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
1 package com.canrd.webmagic.processor.download; 1 package com.canrd.webmagic.processor.download;
2 2
  3 +import com.canrd.webmagic.config.SeleniumConfig;
  4 +import com.canrd.webmagic.processor.config.Agent;
3 import lombok.extern.slf4j.Slf4j; 5 import lombok.extern.slf4j.Slf4j;
4 import org.openqa.selenium.By; 6 import org.openqa.selenium.By;
5 import org.openqa.selenium.Cookie; 7 import org.openqa.selenium.Cookie;
@@ -26,10 +28,10 @@ import java.util.Map; @@ -26,10 +28,10 @@ import java.util.Map;
26 @Slf4j 28 @Slf4j
27 @Component 29 @Component
28 public class SeleniumDownloader extends AbstractDownloader { 30 public class SeleniumDownloader extends AbstractDownloader {
29 - private int sleepTime = 0; 31 + private int sleepTime = 30;
30 32
31 @Resource 33 @Resource
32 - private WebDriver webDriver; 34 + private SeleniumConfig config;
33 35
34 /** 36 /**
35 * set sleep time to wait until load success 37 * set sleep time to wait until load success
@@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader {
45 @Override 47 @Override
46 public Page download(Request request, Task task) { 48 public Page download(Request request, Task task) {
47 Page page = Page.fail(); 49 Page page = Page.fail();
  50 + WebDriver webDriver = null;
48 try { 51 try {
49 -  
50 -  
51 - log.info("downloading page " + request.getUrl());  
52 - webDriver.get(request.getUrl());  
53 - try {  
54 - if (sleepTime > 0) {  
55 - Thread.sleep(sleepTime);  
56 - }  
57 - } catch (InterruptedException e) {  
58 - e.printStackTrace();  
59 - } 52 + webDriver = config.webDriver();
60 WebDriver.Options manage = webDriver.manage(); 53 WebDriver.Options manage = webDriver.manage();
61 Site site = task.getSite(); 54 Site site = task.getSite();
  55 + site.setUserAgent(Agent.getRandom());
62 if (site.getCookies() != null) { 56 if (site.getCookies() != null) {
63 for (Map.Entry<String, String> cookieEntry : site.getCookies() 57 for (Map.Entry<String, String> cookieEntry : site.getCookies()
64 .entrySet()) { 58 .entrySet()) {
@@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader {
68 } 62 }
69 } 63 }
70 64
  65 + log.info("downloading page " + request.getUrl());
  66 +
  67 + webDriver.get(request.getUrl());
  68 + try {
  69 + if (sleepTime > 0) {
  70 + Thread.sleep(sleepTime);
  71 + }
  72 + } catch (InterruptedException e) {
  73 + e.printStackTrace();
  74 + }
  75 +
  76 +
71 /* 77 /*
72 * TODO You can add mouse event or other processes 78 * TODO You can add mouse event or other processes
73 * 79 *
@@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader {
75 */ 81 */
76 try { 82 try {
77 //休眠3秒就是为了动态的数据渲染完成后在进行获取 83 //休眠3秒就是为了动态的数据渲染完成后在进行获取
78 - Thread.sleep(30000); 84 + Thread.sleep(3000);
79 } catch (InterruptedException e) { 85 } catch (InterruptedException e) {
80 throw new RuntimeException(e); 86 throw new RuntimeException(e);
81 } 87 }
@@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader { @@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader {
91 log.warn("download page {} error", request.getUrl(), e); 97 log.warn("download page {} error", request.getUrl(), e);
92 onError(request, task, e); 98 onError(request, task, e);
93 } finally { 99 } finally {
94 - 100 + if (webDriver != null) {
  101 + webDriver.close();
  102 + webDriver.quit();
  103 + webDriver = null;
  104 + }
95 } 105 }
96 return page; 106 return page;
97 } 107 }
src/main/resources/user-agent/User-Agents.txt
@@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera @@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera
3 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 3 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
4 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 4 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
5 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 5 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
6 -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36  
7 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 6 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
8 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16 7 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
9 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 8 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
10 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko 9 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
11 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11 10 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
12 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER 11 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
13 -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)  
14 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)" 12 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
15 -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)  
16 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) 13 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
17 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 14 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
18 -Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)  
19 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 15 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
20 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 16 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
21 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 17 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36