Commit 174a7c30cdf0b758bbd294c8a49d1cdcd0b3b171
1 parent
b7789621
feat:
1、selenium 整合 2、science 网站
Showing
6 changed files
with
60 additions
and
27 deletions
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 | 1 | package com.canrd.webmagic.config; |
2 | 2 | |
3 | -import com.canrd.webmagic.processor.config.Agent; | |
4 | 3 | import org.openqa.selenium.WebDriver; |
5 | 4 | import org.openqa.selenium.chrome.ChromeDriver; |
6 | 5 | import org.openqa.selenium.chrome.ChromeOptions; |
... | ... | @@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration; |
16 | 15 | @Configuration |
17 | 16 | public class SeleniumConfig { |
18 | 17 | |
19 | - @Bean | |
20 | - public WebDriver webDriver() { | |
18 | +// @Bean | |
19 | + public WebDriver webDriver() throws InterruptedException { | |
21 | 20 | System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"); |
22 | - return new ChromeDriver(); | |
21 | + // 初始化ChromeOptions | |
22 | + ChromeOptions options = new ChromeOptions(); | |
23 | + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口 | |
24 | +// options.addArguments("--proxy-server=http://proxy-server:port"); | |
25 | + | |
26 | + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查 | |
27 | + options.addArguments("--disable-javascript"); | |
28 | + | |
29 | + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它 | |
30 | + options.addArguments("--disable-extensions"); | |
31 | + | |
32 | + // 禁用本地缓存,确保每次访问都从服务器获取 | |
33 | + options.addArguments("--disable-application-cache"); | |
34 | + | |
35 | + options.setBinary("D:\\chrome\\chrome-win64\\chrome-win64\\chrome.exe"); | |
36 | + | |
37 | + return new ChromeDriver(options); | |
23 | 38 | } |
24 | 39 | } | ... | ... |
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
... | ... | @@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup; |
5 | 5 | import com.canrd.webmagic.domain.vo.NatureArticleQueryVO; |
6 | 6 | import com.canrd.webmagic.domain.vo.NatureArticleVO; |
7 | 7 | import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor; |
8 | +import com.canrd.webmagic.processor.download.SeleniumDownloader; | |
8 | 9 | import com.canrd.webmagic.service.ArticleService; |
10 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
9 | 11 | import org.springframework.validation.annotation.Validated; |
10 | 12 | import org.springframework.web.bind.annotation.*; |
11 | 13 | import us.codecraft.webmagic.Spider; |
... | ... | @@ -30,6 +32,9 @@ public class Science4JournalController { |
30 | 32 | @Resource |
31 | 33 | private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor; |
32 | 34 | |
35 | + @Resource | |
36 | + private SeleniumDownloader seleniumDownloader; | |
37 | + | |
33 | 38 | /** |
34 | 39 | * @return |
35 | 40 | */ |
... | ... | @@ -37,8 +42,10 @@ public class Science4JournalController { |
37 | 42 | public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { |
38 | 43 | for (int i = 0; i <= indexSize; i++) { |
39 | 44 | Spider.create(science4JournalSearchPageProcessor) |
40 | - .addUrl("http://www.science.org/journal/science/insights?startPage=" + i) | |
45 | + .addUrl("https://www.science.org/journal/science/insights?startPage=" + i) | |
41 | 46 | // 开启5个线程执行,并开始爬取 |
47 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
48 | + .setDownloader(seleniumDownloader) | |
42 | 49 | .thread(5).run(); |
43 | 50 | } |
44 | 51 | ... | ... |
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
... | ... | @@ -32,10 +32,9 @@ import java.util.Objects; |
32 | 32 | @Slf4j |
33 | 33 | @Component |
34 | 34 | public class Science4JournalArticlePageProcessor implements PageProcessor { |
35 | - private String agent = Agent.getRandom(); | |
36 | 35 | |
37 | 36 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
38 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | |
37 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); | |
39 | 38 | |
40 | 39 | /** |
41 | 40 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 |
... | ... | @@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor { |
108 | 107 | return site; |
109 | 108 | } |
110 | 109 | |
110 | + public void setSite(Site site) { | |
111 | + this.site = site; | |
112 | + } | |
113 | + | |
111 | 114 | public static void main(String[] args) { |
112 | 115 | // 创建一个Spider,并把我们的处理器放进去 |
113 | 116 | Spider.create(new Science4JournalArticlePageProcessor()) | ... | ... |
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
... | ... | @@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent; |
6 | 6 | import com.canrd.webmagic.processor.download.SeleniumDownloader; |
7 | 7 | import com.canrd.webmagic.processor.pipeline.ArticlePipeline; |
8 | 8 | import lombok.extern.slf4j.Slf4j; |
9 | +import org.apache.logging.log4j.core.util.UuidUtil; | |
9 | 10 | import org.springframework.stereotype.Component; |
10 | 11 | import us.codecraft.webmagic.Page; |
11 | 12 | import us.codecraft.webmagic.Site; |
... | ... | @@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { |
43 | 44 | /** |
44 | 45 | * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 |
45 | 46 | */ |
46 | - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom()); | |
47 | + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom()); | |
47 | 48 | |
48 | 49 | /** |
49 | 50 | * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑 |
... | ... | @@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor { |
84 | 85 | .addUrl(link) |
85 | 86 | .addPipeline(articlePipeline) |
86 | 87 | .setDownloader(seleniumDownloader) |
88 | + .setUUID(UuidUtil.getTimeBasedUuid().toString()) | |
87 | 89 | // 开启1个线程执行,并开始爬取 |
88 | 90 | .thread(1).run(); |
89 | 91 | log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link); | ... | ... |
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
1 | 1 | package com.canrd.webmagic.processor.download; |
2 | 2 | |
3 | +import com.canrd.webmagic.config.SeleniumConfig; | |
4 | +import com.canrd.webmagic.processor.config.Agent; | |
3 | 5 | import lombok.extern.slf4j.Slf4j; |
4 | 6 | import org.openqa.selenium.By; |
5 | 7 | import org.openqa.selenium.Cookie; |
... | ... | @@ -26,10 +28,10 @@ import java.util.Map; |
26 | 28 | @Slf4j |
27 | 29 | @Component |
28 | 30 | public class SeleniumDownloader extends AbstractDownloader { |
29 | - private int sleepTime = 0; | |
31 | + private int sleepTime = 30; | |
30 | 32 | |
31 | 33 | @Resource |
32 | - private WebDriver webDriver; | |
34 | + private SeleniumConfig config; | |
33 | 35 | |
34 | 36 | /** |
35 | 37 | * set sleep time to wait until load success |
... | ... | @@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader { |
45 | 47 | @Override |
46 | 48 | public Page download(Request request, Task task) { |
47 | 49 | Page page = Page.fail(); |
50 | + WebDriver webDriver = null; | |
48 | 51 | try { |
49 | - | |
50 | - | |
51 | - log.info("downloading page " + request.getUrl()); | |
52 | - webDriver.get(request.getUrl()); | |
53 | - try { | |
54 | - if (sleepTime > 0) { | |
55 | - Thread.sleep(sleepTime); | |
56 | - } | |
57 | - } catch (InterruptedException e) { | |
58 | - e.printStackTrace(); | |
59 | - } | |
52 | + webDriver = config.webDriver(); | |
60 | 53 | WebDriver.Options manage = webDriver.manage(); |
61 | 54 | Site site = task.getSite(); |
55 | + site.setUserAgent(Agent.getRandom()); | |
62 | 56 | if (site.getCookies() != null) { |
63 | 57 | for (Map.Entry<String, String> cookieEntry : site.getCookies() |
64 | 58 | .entrySet()) { |
... | ... | @@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader { |
68 | 62 | } |
69 | 63 | } |
70 | 64 | |
65 | + log.info("downloading page " + request.getUrl()); | |
66 | + | |
67 | + webDriver.get(request.getUrl()); | |
68 | + try { | |
69 | + if (sleepTime > 0) { | |
70 | + Thread.sleep(sleepTime); | |
71 | + } | |
72 | + } catch (InterruptedException e) { | |
73 | + e.printStackTrace(); | |
74 | + } | |
75 | + | |
76 | + | |
71 | 77 | /* |
72 | 78 | * TODO You can add mouse event or other processes |
73 | 79 | * |
... | ... | @@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader { |
75 | 81 | */ |
76 | 82 | try { |
77 | 83 | //休眠3秒就是为了动态的数据渲染完成后再进行获取 |
78 | - Thread.sleep(30000); | |
84 | + Thread.sleep(3000); | |
79 | 85 | } catch (InterruptedException e) { |
80 | 86 | throw new RuntimeException(e); |
81 | 87 | } |
... | ... | @@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader { |
91 | 97 | log.warn("download page {} error", request.getUrl(), e); |
92 | 98 | onError(request, task, e); |
93 | 99 | } finally { |
94 | - | |
100 | + if (webDriver != null) { | |
101 | + webDriver.close(); | |
102 | + webDriver.quit(); | |
103 | + webDriver = null; | |
104 | + } | |
95 | 105 | } |
96 | 106 | return page; |
97 | 107 | } | ... | ... |
src/main/resources/user-agent/User-Agents.txt
... | ... | @@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera |
3 | 3 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50 |
4 | 4 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 |
5 | 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 |
6 | -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 | |
7 | 6 | Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 |
8 | 7 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16 |
9 | 8 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 |
10 | 9 | Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko |
11 | 10 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11 |
12 | 11 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER |
13 | -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) | |
14 | 12 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)" |
15 | -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) | |
16 | 13 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) |
17 | 14 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0 |
18 | -Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) | |
19 | 15 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36 |
20 | 16 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36 |
21 | 17 | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 | ... | ... |