Commit 174a7c30cdf0b758bbd294c8a49d1cdcd0b3b171

Authored by 谢茂盛
1 parent b7789621

feat:

1、selenium 整合
2、science 网站
src/main/java/com/canrd/webmagic/config/SeleniumConfig.java
1 1 package com.canrd.webmagic.config;
2 2  
3   -import com.canrd.webmagic.processor.config.Agent;
4 3 import org.openqa.selenium.WebDriver;
5 4 import org.openqa.selenium.chrome.ChromeDriver;
6 5 import org.openqa.selenium.chrome.ChromeOptions;
... ... @@ -16,9 +15,25 @@ import org.springframework.context.annotation.Configuration;
16 15 @Configuration
17 16 public class SeleniumConfig {
18 17  
19   - @Bean
20   - public WebDriver webDriver() {
  18 +// @Bean
  19 + public WebDriver webDriver() throws InterruptedException {
21 20 System.setProperty("webdriver.chrome.driver", "D:\\chrome\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
22   - return new ChromeDriver();
  21 + // 初始化ChromeOptions
  22 + ChromeOptions options = new ChromeOptions();
  23 + // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
  24 +// options.addArguments("--proxy-server=http://proxy-server:port");
  25 +
  26 + // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
  27 + options.addArguments("--disable-javascript");
  28 +
  29 + // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
  30 + options.addArguments("--disable-extensions");
  31 +
  32 + // 禁用本地缓存,确保每次访问都从服务器获取
  33 + options.addArguments("--disable-application-cache");
  34 +
  35 + options.setBinary("D:\\chrome\\chrome-win64\\chrome-win64\\chrome.exe");
  36 +
  37 + return new ChromeDriver(options);
23 38 }
24 39 }
... ...
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
... ... @@ -5,7 +5,9 @@ import com.canrd.webmagic.common.jsr303.OperateGroup;
5 5 import com.canrd.webmagic.domain.vo.NatureArticleQueryVO;
6 6 import com.canrd.webmagic.domain.vo.NatureArticleVO;
7 7 import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
  8 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
8 9 import com.canrd.webmagic.service.ArticleService;
  10 +import org.apache.logging.log4j.core.util.UuidUtil;
9 11 import org.springframework.validation.annotation.Validated;
10 12 import org.springframework.web.bind.annotation.*;
11 13 import us.codecraft.webmagic.Spider;
... ... @@ -30,6 +32,9 @@ public class Science4JournalController {
30 32 @Resource
31 33 private Science4JournalSearchPageProcessor science4JournalSearchPageProcessor;
32 34  
  35 + @Resource
  36 + private SeleniumDownloader seleniumDownloader;
  37 +
33 38 /**
34 39 * @return
35 40 */
... ... @@ -37,8 +42,10 @@ public class Science4JournalController {
37 42 public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
38 43 for (int i = 0; i <= indexSize; i++) {
39 44 Spider.create(science4JournalSearchPageProcessor)
40   - .addUrl("http://www.science.org/journal/science/insights?startPage=" + i)
  45 + .addUrl("https://www.science.org/journal/science/insights?startPage=" + i)
41 46 // 开启5个线程执行,并开始爬取
  47 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  48 + .setDownloader(seleniumDownloader)
42 49 .thread(5).run();
43 50 }
44 51  
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
... ... @@ -32,10 +32,9 @@ import java.util.Objects;
32 32 @Slf4j
33 33 @Component
34 34 public class Science4JournalArticlePageProcessor implements PageProcessor {
35   - private String agent = Agent.getRandom();
36 35  
37 36 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
38   - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());
  37 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
39 38  
40 39 /**
41 40 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
... ... @@ -108,6 +107,10 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
108 107 return site;
109 108 }
110 109  
  110 + public void setSite(Site site) {
  111 + this.site = site;
  112 + }
  113 +
111 114 public static void main(String[] args) {
112 115 // 创建一个Spider,并把我们的处理器放进去
113 116 Spider.create(new Science4JournalArticlePageProcessor())
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
... ... @@ -6,6 +6,7 @@ import com.canrd.webmagic.processor.config.Agent;
6 6 import com.canrd.webmagic.processor.download.SeleniumDownloader;
7 7 import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
8 8 import lombok.extern.slf4j.Slf4j;
  9 +import org.apache.logging.log4j.core.util.UuidUtil;
9 10 import org.springframework.stereotype.Component;
10 11 import us.codecraft.webmagic.Page;
11 12 import us.codecraft.webmagic.Site;
... ... @@ -43,7 +44,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
43 44 /**
44 45 * 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
45 46 */
46   - private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setUserAgent(Agent.getRandom());
  47 + private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());
47 48  
48 49 /**
49 50 * 定制爬虫逻辑的核心接口,在这里编写抽取逻辑
... ... @@ -84,6 +85,7 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
84 85 .addUrl(link)
85 86 .addPipeline(articlePipeline)
86 87 .setDownloader(seleniumDownloader)
  88 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
87 89 // 开启1个线程执行,并开始爬取
88 90 .thread(1).run();
89 91 log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
... ...
src/main/java/com/canrd/webmagic/processor/download/SeleniumDownloader.java
1 1 package com.canrd.webmagic.processor.download;
2 2  
  3 +import com.canrd.webmagic.config.SeleniumConfig;
  4 +import com.canrd.webmagic.processor.config.Agent;
3 5 import lombok.extern.slf4j.Slf4j;
4 6 import org.openqa.selenium.By;
5 7 import org.openqa.selenium.Cookie;
... ... @@ -26,10 +28,10 @@ import java.util.Map;
26 28 @Slf4j
27 29 @Component
28 30 public class SeleniumDownloader extends AbstractDownloader {
29   - private int sleepTime = 0;
  31 + private int sleepTime = 30;
30 32  
31 33 @Resource
32   - private WebDriver webDriver;
  34 + private SeleniumConfig config;
33 35  
34 36 /**
35 37 * set sleep time to wait until load success
... ... @@ -45,20 +47,12 @@ public class SeleniumDownloader extends AbstractDownloader {
45 47 @Override
46 48 public Page download(Request request, Task task) {
47 49 Page page = Page.fail();
  50 + WebDriver webDriver = null;
48 51 try {
49   -
50   -
51   - log.info("downloading page " + request.getUrl());
52   - webDriver.get(request.getUrl());
53   - try {
54   - if (sleepTime > 0) {
55   - Thread.sleep(sleepTime);
56   - }
57   - } catch (InterruptedException e) {
58   - e.printStackTrace();
59   - }
  52 + webDriver = config.webDriver();
60 53 WebDriver.Options manage = webDriver.manage();
61 54 Site site = task.getSite();
  55 + site.setUserAgent(Agent.getRandom());
62 56 if (site.getCookies() != null) {
63 57 for (Map.Entry<String, String> cookieEntry : site.getCookies()
64 58 .entrySet()) {
... ... @@ -68,6 +62,18 @@ public class SeleniumDownloader extends AbstractDownloader {
68 62 }
69 63 }
70 64  
  65 + log.info("downloading page " + request.getUrl());
  66 +
  67 + webDriver.get(request.getUrl());
  68 + try {
  69 + if (sleepTime > 0) {
  70 + Thread.sleep(sleepTime);
  71 + }
  72 + } catch (InterruptedException e) {
  73 + e.printStackTrace();
  74 + }
  75 +
  76 +
71 77 /*
72 78 * TODO You can add mouse event or other processes
73 79 *
... ... @@ -75,7 +81,7 @@ public class SeleniumDownloader extends AbstractDownloader {
75 81 */
76 82 try {
77 83 //休眠3秒就是为了动态的数据渲染完成后再进行获取
78   - Thread.sleep(30000);
  84 + Thread.sleep(3000);
79 85 } catch (InterruptedException e) {
80 86 throw new RuntimeException(e);
81 87 }
... ... @@ -91,7 +97,11 @@ public class SeleniumDownloader extends AbstractDownloader {
91 97 log.warn("download page {} error", request.getUrl(), e);
92 98 onError(request, task, e);
93 99 } finally {
94   -
  100 + if (webDriver != null) {
  101 + webDriver.close();
  102 + webDriver.quit();
  103 + webDriver = null;
  104 + }
95 105 }
96 106 return page;
97 107 }
... ...
src/main/resources/user-agent/User-Agents.txt
... ... @@ -3,19 +3,15 @@ Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera
3 3 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
4 4 Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
5 5 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
6   -Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
7 6 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
8 7 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
9 8 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
10 9 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
11 10 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
12 11 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
13   -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
14 12 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
15   -Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
16 13 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
17 14 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
18   -Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
19 15 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
20 16 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
21 17 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36
... ...