Commit 174a7c30cdf0b758bbd294c8a49d1cdcd0b3b171
1 parent
b7789621
feat:
1、selenium 整合 2、science 网站
Showing
6 changed files
with
60 additions
and
27 deletions
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 | package com.canrd.webmagic.config; | 1 | package com.canrd.webmagic.config; |
2 | 2 | ||
3 | -import com.canrd.webmagic.processor.config.Agent; | ||
4 | import org.openqa.selenium.WebDriver; | 3 | import org.openqa.selenium.WebDriver; |
5 | import org.openqa.selenium.chrome.ChromeDriver; | 4 | import org.openqa.selenium.chrome.ChromeDriver; |
6 | import org.openqa.selenium.chrome.ChromeOptions; | 5 | import org.openqa.selenium.chrome.ChromeOptions; |
@@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration; | @@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration; | ||
16 | @Configuration | 15 | @Configuration |
17 | public class SeleniumConfig { | 16 | public class SeleniumConfig { |
18 | 17 | ||
19 | - @Bean | ||
20 | - public WebDriver webDriver() { | 18 | +// @Bean |
19 | + public WebDriver webDriver() throws InterruptedException { | ||
21 | System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); | 20 | System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); |
22 | - return new ChromeDriver(); | 21 | + // 初始化ChromeOptions |
22 | + ChromeOptions options = new ChromeOptions(); | ||
23 | + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口 | ||
24 | +// options.addArguments("--proxy-server=http://proxy-server:port"); | ||
25 | + | ||
26 | + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查 | ||
27 | + options.addArguments("--disable-javascript"); | ||
28 | + | ||
29 | + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它 | ||
30 | + options.addArguments("--disable-extensions"); | ||
31 | + | ||
32 | + // 禁用本地缓存,确保每次访问都从服务器获取 | ||
33 | + options.addArguments("--disable-application-cache"); | ||
34 | + | ||
35 | + options.setBinary("D:\\chrome\\chrome-win64\\chrome-win64\\chrome.exe"); | ||
36 | + | ||
37 | + return new ChromeDriver(options); | ||
23 | } | 38 | } |
24 | } | 39 | } |
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
@@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | @@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; | ||
5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; | 7 | import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; |
8 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | ||
8 | import com.canrd.webmagic.service.ArticleService; | 9 | import com.canrd.webmagic.service.ArticleService; |
10 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
9 | import org.springframework.validation.annotation.Validated; | 11 | import org.springframework.validation.annotation.Validated; |
10 | import org.springframework.web.bind.annotation.*; | 12 | import org.springframework.web.bind.annotation.*; |
11 | import us.codecraft.webmagic.Spider; | 13 | import us.codecraft.webmagic.Spider; |
@@ -30,6 +32,9 @@ public class Science4JournalController { | @@ -30,6 +32,9 @@ public class Science4JournalController { | ||
30 | @Resource | 32 | @Resource |
31 | private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; | 33 | private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; |
32 | 34 | ||
35 | + @Resource | ||
36 | + private SeleniumDownloader seleniumDownloader; | ||
37 | + | ||
33 | /** | 38 | /** |
34 | * @return | 39 | * @return |
35 | */ | 40 | */ |
@@ -37,8 +42,10 @@ public class Science4JournalController { | @@ -37,8 +42,10 @@ public class Science4JournalController { | ||
37 | public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { | 42 | public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { |
38 | for (int i = 0; i <= indexSize; i++) { | 43 | for (int i = 0; i <= indexSize; i++) { |
39 | Spider.create(science4JournalSearchPageProcessor) | 44 | Spider.create(science4JournalSearchPageProcessor) |
40 | - .addUrl("http://www.science.org/journal/science/insights?startPage=" + i) | 45 | + .addUrl("https://www.science.org/journal/science/insights?startPage=" + i) |
41 | // 开启5个线程执行,并开始爬取 | 46 | // 开启5个线程执行,并开始爬取 |
47 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
48 | + .setDownloader(seleniumDownloader) | ||
42 | .thread(5).run(); | 49 | .thread(5).run(); |
43 | } | 50 | } |
44 | 51 |
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
@@ -32,10 +32,9 @@ import java.util.Objects; | @@ -32,10 +32,9 @@ import java.util.Objects; | ||
32 | @Slf4j | 32 | @Slf4j |
33 | @Component | 33 | @Component |
34 | public class Science4JournalArticlePageProcessor implements PageProcessor { | 34 | public class Science4JournalArticlePageProcessor implements PageProcessor { |
35 | - private String agent = Agent.getRandom(); | ||
36 | 35 | ||
37 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | 36 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
38 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | 37 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); |
39 | 38 | ||
40 | /** | 39 | /** |
41 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | 40 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 |
@@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | @@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { | ||
108 | return site; | 107 | return site; |
109 | } | 108 | } |
110 | 109 | ||
110 | + public void setSite(Site site) { | ||
111 | + this.site = site; | ||
112 | + } | ||
113 | + | ||
111 | public static void main(String[] args) { | 114 | public static void main(String[] args) { |
112 | // 创建一个Spider,并把我们的处理器放进去 | 115 | // 创建一个Spider,并把我们的处理器放进去 |
113 | Spider.create(new Science4JournalArticlePageProcessor()) | 116 | Spider.create(new Science4JournalArticlePageProcessor()) |
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
@@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent; | @@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent; | ||
6 | import com.canrd.webmagic.processor.download.SeleniumDownloader; | 6 | import com.canrd.webmagic.processor.download.SeleniumDownloader; |
7 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; | 7 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
8 | import lombok.extern.slf4j.Slf4j; | 8 | import lombok.extern.slf4j.Slf4j; |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | ||
9 | import org.springframework.stereotype.Component; | 10 | import org.springframework.stereotype.Component; |
10 | import us.codecraft.webmagic.Page; | 11 | import us.codecraft.webmagic.Page; |
11 | import us.codecraft.webmagic.Site; | 12 | import us.codecraft.webmagic.Site; |
@@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { | @@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { | ||
43 | /** | 44 | /** |
44 | * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | 45 | * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
45 | */ | 46 | */ |
46 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | 47 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); |
47 | 48 | ||
48 | /** | 49 | /** |
49 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | 50 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 |
@@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { | @@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { | ||
84 | .addUrl(link) | 85 | .addUrl(link) |
85 | .addPipeline(articlePipeline) | 86 | .addPipeline(articlePipeline) |
86 | .setDownloader(seleniumDownloader) | 87 | .setDownloader(seleniumDownloader) |
88 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | ||
87 | // 开启1个线程执行,并开始爬取 | 89 | // 开启1个线程执行,并开始爬取 |
88 | .thread(1).run(); | 90 | .thread(1).run(); |
89 | log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | 91 | log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
1 | package com.canrd.webmagic.processor.download; | 1 | package com.canrd.webmagic.processor.download; |
2 | 2 | ||
3 | +import com.canrd.webmagic.config.SeleniumConfig; | ||
4 | +import com.canrd.webmagic.processor.config.Agent; | ||
3 | import lombok.extern.slf4j.Slf4j; | 5 | import lombok.extern.slf4j.Slf4j; |
4 | import org.openqa.selenium.By; | 6 | import org.openqa.selenium.By; |
5 | import org.openqa.selenium.Cookie; | 7 | import org.openqa.selenium.Cookie; |
@@ -26,10 +28,10 @@ import java.util.Map; | @@ -26,10 +28,10 @@ import java.util.Map; | ||
26 | @Slf4j | 28 | @Slf4j |
27 | @Component | 29 | @Component |
28 | public class SeleniumDownloader extends AbstractDownloader { | 30 | public class SeleniumDownloader extends AbstractDownloader { |
29 | - private int sleepTime = 0; | 31 | + private int sleepTime = 30; |
30 | 32 | ||
31 | @Resource | 33 | @Resource |
32 | - private WebDriver webDriver; | 34 | + private SeleniumConfig config; |
33 | 35 | ||
34 | /** | 36 | /** |
35 | * set sleep time to wait until load success | 37 | * set sleep time to wait until load success |
@@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
45 | @Override | 47 | @Override |
46 | public Page download(Request request, Task task) { | 48 | public Page download(Request request, Task task) { |
47 | Page page = Page.fail(); | 49 | Page page = Page.fail(); |
50 | + WebDriver webDriver = null; | ||
48 | try { | 51 | try { |
49 | - | ||
50 | - | ||
51 | - log.info("downloading page " + request.getUrl()); | ||
52 | - webDriver.get(request.getUrl()); | ||
53 | - try { | ||
54 | - if (sleepTime > 0) { | ||
55 | - Thread.sleep(sleepTime); | ||
56 | - } | ||
57 | - } catch (InterruptedException e) { | ||
58 | - e.printStackTrace(); | ||
59 | - } | 52 | + webDriver = config.webDriver(); |
60 | WebDriver.Options manage = webDriver.manage(); | 53 | WebDriver.Options manage = webDriver.manage(); |
61 | Site site = task.getSite(); | 54 | Site site = task.getSite(); |
55 | + site.setUserAgent(Agent.getRandom()); | ||
62 | if (site.getCookies() != null) { | 56 | if (site.getCookies() != null) { |
63 | for (Map.Entry<String, String> cookieEntry : site.getCookies() | 57 | for (Map.Entry<String, String> cookieEntry : site.getCookies() |
64 | .entrySet()) { | 58 | .entrySet()) { |
@@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
68 | } | 62 | } |
69 | } | 63 | } |
70 | 64 | ||
65 | + log.info("downloading page " + request.getUrl()); | ||
66 | + | ||
67 | + webDriver.get(request.getUrl()); | ||
68 | + try { | ||
69 | + if (sleepTime > 0) { | ||
70 | + Thread.sleep(sleepTime); | ||
71 | + } | ||
72 | + } catch (InterruptedException e) { | ||
73 | + e.printStackTrace(); | ||
74 | + } | ||
75 | + | ||
76 | + | ||
71 | /* | 77 | /* |
72 | * TODO You can add mouse event or other processes | 78 | * TODO You can add mouse event or other processes |
73 | * | 79 | * |
@@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
75 | */ | 81 | */ |
76 | try { | 82 | try { |
77 | //休眠3秒就是为了动态的数据渲染完成后再进行获取 | 83 | //休眠3秒就是为了动态的数据渲染完成后再进行获取 |
78 | - Thread.sleep(30000); | 84 | + Thread.sleep(3000); |
79 | } catch (InterruptedException e) { | 85 | } catch (InterruptedException e) { |
80 | throw new RuntimeException(e); | 86 | throw new RuntimeException(e); |
81 | } | 87 | } |
@@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader { | @@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader { | ||
91 | log.warn("download page {} error", request.getUrl(), e); | 97 | log.warn("download page {} error", request.getUrl(), e); |
92 | onError(request, task, e); | 98 | onError(request, task, e); |
93 | } finally { | 99 | } finally { |
94 | - | 100 | + if (webDriver != null) { |
101 | + webDriver.close(); | ||
102 | + webDriver.quit(); | ||
103 | + webDriver = null; | ||
104 | + } | ||
95 | } | 105 | } |
96 | return page; | 106 | return page; |
97 | } | 107 | } |
src/main/resources/user-agent/User-Agents.txt
@@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera | @@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera | ||
3 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 | 3 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 |
4 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 | 4 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 |
5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 | 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 |
6 | -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 | ||
7 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 | 6 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 |
8 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16 | 7 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16 |
9 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 | 8 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 |
10 | Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko | 9 | Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko |
11 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11 | 10 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11 |
12 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER | 11 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER |
13 | -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) | ||
14 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)" | 12 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)" |
15 | -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) | ||
16 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) | 13 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) |
17 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 | 14 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 |
18 | -Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) | ||
19 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 | 15 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 |
20 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 | 16 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 |
21 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 | 17 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 |