Commit 1680e1818f9683472a638fab23d8514617cc7fd8

Authored by 凌世锦
1 parent b17c8d02

matter官网

Too many changes to show.

To preserve performance only 15 of 16 files are displayed.

src/main/java/com/canrd/webmagic/controller/MatterController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.MatterPragePcoessor;
  5 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  6 +import io.swagger.annotations.Api;
  7 +import io.swagger.annotations.ApiOperation;
  8 +import org.apache.logging.log4j.core.util.UuidUtil;
  9 +import org.springframework.web.bind.annotation.GetMapping;
  10 +import org.springframework.web.bind.annotation.RequestMapping;
  11 +import org.springframework.web.bind.annotation.RestController;
  12 +import us.codecraft.webmagic.Spider;
  13 +
  14 +import javax.annotation.Resource;
  15 +
  16 +@RestController
  17 +@RequestMapping("/matter/article")
  18 +@Api("Matter")
  19 +public class MatterController {
  20 + @Resource
  21 + private MatterPragePcoessor matterPragePcoessor;
  22 +
  23 + @Resource
  24 + private SeleniumDownloader seleniumDownloader;
  25 +
  26 + @GetMapping("/start")
  27 + @ApiOperation("start")
  28 + public ServerResult start() {
  29 + Spider.create(new MatterPragePcoessor())
  30 + // 添加这个Spider要爬取的网页地址
  31 + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  32 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  33 + .setDownloader(seleniumDownloader.setSleepTime(30000))
  34 + // 开启5个线程执行,并开始爬取
  35 + .thread(5).run();
  36 + return ServerResult.success();
  37 + }
  38 +}
... ...
src/main/java/com/canrd/webmagic/driver/ChromeBuildDriver.java deleted 100644 → 0
1   -package com.canrd.webmagic.driver;
2   -
3   -import org.openqa.selenium.chrome.ChromeOptions;
4   -
5   -import java.util.Arrays;
6   -
7   -public class ChromeBuildDriver {
8   - public ChromeOptions build(String DriverPath){
9   - ChromeOptions options = new ChromeOptions();
10   - // 添加代理,这里使用的代理是示例,需要替换为实际的代理服务器地址和端口
11   -// options.addArguments("--proxy-server=http://proxy-server:port");
12   -
13   - // 禁用JavaScript,有时这能帮助绕过Cloudflare的检查
14   - options.addArguments("--disable-javascript");
15   -
16   - // 禁用浏览器扩展,如果知道Cloudflare使用了特定的扩展,可以禁用它
17   - options.addArguments("--disable-extensions");
18   -
19   - // 禁用本地缓存,确保每次访问都从服务器获取
20   - options.addArguments("--disable-application-cache");
21   -
22   - // 禁止策略化
23   - options.addArguments("--disable-infobars");
24   - // 解决DevToolsActivePort文件不存在的报错
25   - options.addArguments("--no-sandbox");
26   - // 指定浏览器分辨
27   - options.addArguments("window-size=1920x3000");
28   - // 谷歌文档提到需要加上这个属性来规避bug
29   - options.addArguments("--disable-gpu");
30   - // 隐身模式(无痕模式)
31   - options.addArguments("--incognito");
32   - // 最大化运行(全屏窗口),不设置,取元素会报错
33   - options.addArguments("--start-maximized");
34   - // 禁用浏览器正在被自动化程序控制的提示
35   - options.addArguments("--disable-infobars");
36   - // 隐藏滚动条, 应对一些特殊页面
37   - options.addArguments("--hide-scrollbars");
38   - // 不加载图片, 提升速度
39   - options.addArguments("blink-settings=imagesEnabled=false");
40   - // 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
41   - options.addArguments("--headless");
42   - //禁用 blink 特征
43   - options.addArguments("disable-blink-features=AutomationControlled");
44   - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
45   - options.setExperimentalOption("useAutomationExtension", false);
46   - options.addArguments("--remote-allow-origins=*");
47   - options.setBinary(DriverPath);
48   - return options;
49   - }
50   -}
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 1 package com.canrd.webmagic.processor;
2 2  
  3 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
3 4 import us.codecraft.webmagic.Page;
  5 +import us.codecraft.webmagic.Site;
  6 +import us.codecraft.webmagic.Spider;
4 7 import us.codecraft.webmagic.processor.PageProcessor;
  8 +import us.codecraft.webmagic.selector.Html;
5 9  
6 10 public class ChemicalPagePcoessor implements PageProcessor {
7   -
  11 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
8 12 @Override
9 13 public void process(Page page) {
10 14  
  15 + };
  16 +
  17 + public void doArticleContent(Page page){
  18 + Html html = page.getHtml();
  19 +
  20 + }
  21 +
  22 + public static void main(String[] args) {
  23 + // 创建一个Spider,并把我们的处理器放进去
  24 + Spider.create(new ChemicalPagePcoessor())
  25 + // 添加这个Spider要爬取的网页地址
  26 + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1")
  27 + .addPipeline(new ArticlePipeline())
  28 + // 开启5个线程执行,并开始爬取
  29 + .thread(5).run();
11 30 }
12 31 }
... ...
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  4 +import org.springframework.stereotype.Component;
  5 +import us.codecraft.webmagic.Page;
  6 +import us.codecraft.webmagic.Site;
  7 +import us.codecraft.webmagic.Spider;
  8 +import us.codecraft.webmagic.processor.PageProcessor;
  9 +import us.codecraft.webmagic.selector.Html;
  10 +@Component
  11 +public class MatterPragePcoessor implements PageProcessor {
  12 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
  13 + @Override
  14 + public void process(Page page) {
  15 +
  16 + }
  17 +
  18 + @Override
  19 + public Site getSite() {
  20 + return PageProcessor.super.getSite();
  21 + }
  22 +
  23 + public void doArticleContent(Page page){
  24 + Html html = page.getHtml();
  25 + String articleCode = page.getUrl().get();
  26 +// html.xpath()
  27 + }
  28 +
  29 + public static void main(String[] args) {
  30 + // 创建一个Spider,并把我们的处理器放进去
  31 + Spider.create(new MatterPragePcoessor())
  32 + // 添加这个Spider要爬取的网页地址
  33 + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  34 + .addPipeline(new ArticlePipeline())
  35 + // 开启5个线程执行,并开始爬取
  36 + .thread(5).run();
  37 + }
  38 +}
... ...
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
... ... @@ -16,7 +16,9 @@ import java.util.Arrays;
16 16 */
17 17 @Component
18 18 public class MyChromeDriver implements BrowserDriver{
19   - private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  19 +// private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  20 +
  21 + private static final String WIN_DRIVER_PATH = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe";
20 22 private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
21 23 private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe";
22 24 private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome";
... ...
target/classes/application-local.yml
... ... @@ -57,13 +57,13 @@ spring:
57 57 testWhileIdle: true
58 58 testOnBorrow: true
59 59 testOnReturn: true
60   - password: canrd@2024
  60 + password: 123456
61 61 time-between-eviction-runs-millis: 1000
62   - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
  62 + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 63 username: root
64 64 redis:
65 65 database: 0
66   - host: 39.108.227.113
  66 + host: localhost
67 67 lettuce:
68 68 pool:
69 69 max-active: 2000
... ...
target/classes/com/canrd/webmagic/config/SeleniumConfig.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/BrowserDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyEdgeDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyFirefoxDriver.class 0 → 100644
No preview for this file type