Commit 1680e1818f9683472a638fab23d8514617cc7fd8

Authored by 凌世锦
1 parent b17c8d02

matter官网

Too many changes to show.

To preserve performance only 15 of 16 files are displayed.

src/main/java/com/canrd/webmagic/controller/MatterController.java 0 → 100644
  1 +package com.canrd.webmagic.controller;
  2 +
  3 +import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.processor.MatterPragePcoessor;
  5 +import com.canrd.webmagic.processor.download.SeleniumDownloader;
  6 +import io.swagger.annotations.Api;
  7 +import io.swagger.annotations.ApiOperation;
  8 +import org.apache.logging.log4j.core.util.UuidUtil;
  9 +import org.springframework.web.bind.annotation.GetMapping;
  10 +import org.springframework.web.bind.annotation.RequestMapping;
  11 +import org.springframework.web.bind.annotation.RestController;
  12 +import us.codecraft.webmagic.Spider;
  13 +
  14 +import javax.annotation.Resource;
  15 +
  16 +@RestController
  17 +@RequestMapping("/matter/article")
  18 +@Api("Matter")
  19 +public class MatterController {
  20 + @Resource
  21 + private MatterPragePcoessor matterPragePcoessor;
  22 +
  23 + @Resource
  24 + private SeleniumDownloader seleniumDownloader;
  25 +
  26 + @GetMapping("/start")
  27 + @ApiOperation("start")
  28 + public ServerResult start() {
  29 + Spider.create(new MatterPragePcoessor())
  30 + // 添加这个Spider要爬取的网页地址
  31 + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  32 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  33 + .setDownloader(seleniumDownloader.setSleepTime(30000))
  34 + // 开启5个线程执行,并开始爬取
  35 + .thread(5).run();
  36 + return ServerResult.success();
  37 + }
  38 +}
src/main/java/com/canrd/webmagic/driver/ChromeBuildDriver.java deleted 100644 → 0
1 -package com.canrd.webmagic.driver;  
2 -  
3 -import org.openqa.selenium.chrome.ChromeOptions;  
4 -  
5 -import java.util.Arrays;  
6 -  
7 -public class ChromeBuildDriver {  
8 - public ChromeOptions build(String DriverPath){  
9 - ChromeOptions options = new ChromeOptions();  
10 - // Add a proxy; the proxy used here is an example and must be replaced with the real proxy server address and port  
11 -// options.addArguments("--proxy-server=http://proxy-server:port");  
12 -  
13 - // Disable JavaScript; this sometimes helps bypass Cloudflare's checks  
14 - options.addArguments("--disable-javascript");  
15 -  
16 - // Disable browser extensions; if Cloudflare is known to rely on a specific extension, it can be disabled  
17 - options.addArguments("--disable-extensions");  
18 -  
19 - // Disable the local cache so that every visit fetches from the server  
20 - options.addArguments("--disable-application-cache");  
21 -  
22 - // Disable info bars  
23 - options.addArguments("--disable-infobars");  
24 - // Works around the "DevToolsActivePort file doesn't exist" error  
25 - options.addArguments("--no-sandbox");  
26 - // Specify the browser resolution  
27 - options.addArguments("window-size=1920x3000");  
28 - // Google's documentation mentions this flag is needed to work around a bug  
29 - options.addArguments("--disable-gpu");  
30 - // Incognito (private browsing) mode  
31 - options.addArguments("--incognito");  
32 - // Run maximized (full-screen window); without this, locating elements raises errors  
33 - options.addArguments("--start-maximized");  
34 - // Disable the "browser is being controlled by automated software" notice  
35 - options.addArguments("--disable-infobars");  
36 - // Hide scrollbars, to cope with some special pages  
37 - options.addArguments("--hide-scrollbars");  
38 - // Do not load images, to improve speed  
39 - options.addArguments("blink-settings=imagesEnabled=false");  
40 - // Run the browser without a visible UI. On Linux, startup fails without this if the system has no display support  
41 - options.addArguments("--headless");  
42 - // Disable blink features  
43 - options.addArguments("disable-blink-features=AutomationControlled");  
44 - options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));  
45 - options.setExperimentalOption("useAutomationExtension", false);  
46 - options.addArguments("--remote-allow-origins=*");  
47 - options.setBinary(DriverPath);  
48 - return options;  
49 - }  
50 -}  
src/main/java/com/canrd/webmagic/processor/ChemicalPagePcoessor.java
1 package com.canrd.webmagic.processor; 1 package com.canrd.webmagic.processor;
2 2
  3 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
3 import us.codecraft.webmagic.Page; 4 import us.codecraft.webmagic.Page;
  5 +import us.codecraft.webmagic.Site;
  6 +import us.codecraft.webmagic.Spider;
4 import us.codecraft.webmagic.processor.PageProcessor; 7 import us.codecraft.webmagic.processor.PageProcessor;
  8 +import us.codecraft.webmagic.selector.Html;
5 9
6 public class ChemicalPagePcoessor implements PageProcessor { 10 public class ChemicalPagePcoessor implements PageProcessor {
7 - 11 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
8 @Override 12 @Override
9 public void process(Page page) { 13 public void process(Page page) {
10 14
  15 + };
  16 +
  17 + public void doArticleContent(Page page){
  18 + Html html = page.getHtml();
  19 +
  20 + }
  21 +
  22 + public static void main(String[] args) {
  23 + // Create a Spider and hand it our processor
  24 + Spider.create(new ChemicalPagePcoessor())
  25 + // Add the URL this Spider should crawl
  26 + .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=1")
  27 + .addPipeline(new ArticlePipeline())
  28 + // Run with 5 threads and start crawling
  29 + .thread(5).run();
11 } 30 }
12 } 31 }
src/main/java/com/canrd/webmagic/processor/MatterPragePcoessor.java 0 → 100644
  1 +package com.canrd.webmagic.processor;
  2 +
  3 +import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
  4 +import org.springframework.stereotype.Component;
  5 +import us.codecraft.webmagic.Page;
  6 +import us.codecraft.webmagic.Site;
  7 +import us.codecraft.webmagic.Spider;
  8 +import us.codecraft.webmagic.processor.PageProcessor;
  9 +import us.codecraft.webmagic.selector.Html;
  10 +@Component
  11 +public class MatterPragePcoessor implements PageProcessor {
  12 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
  13 + @Override
  14 + public void process(Page page) {
  15 +
  16 + }
  17 +
  18 + @Override
  19 + public Site getSite() {
  20 + return PageProcessor.super.getSite();
  21 + }
  22 +
  23 + public void doArticleContent(Page page){
  24 + Html html = page.getHtml();
  25 + String articleCode = page.getUrl().get();
  26 +// html.xpath()
  27 + }
  28 +
  29 + public static void main(String[] args) {
  30 + // 创建一个Spider,并把我们的处理器放进去
  31 + Spider.create(new MatterPragePcoessor())
  32 + // 添加这个Spider要爬取的网页地址
  33 + .addUrl("https://www.cell.com/matter/fulltext/S2590-2385(21)00631-7#%20")
  34 + .addPipeline(new ArticlePipeline())
  35 + // 开启5个线程执行,并开始爬取
  36 + .thread(5).run();
  37 + }
  38 +}
src/main/java/com/canrd/webmagic/processor/driver/MyChromeDriver.java
@@ -16,7 +16,9 @@ import java.util.Arrays; @@ -16,7 +16,9 @@ import java.util.Arrays;
16 */ 16 */
17 @Component 17 @Component
18 public class MyChromeDriver implements BrowserDriver{ 18 public class MyChromeDriver implements BrowserDriver{
19 - private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe"; 19 +// private static final String WIN_DRIVER_PATH = "D:\\driver\\chrome\\chromedriver-win64\\chromedriver.exe";
  20 +
  21 + private static final String WIN_DRIVER_PATH = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe";
20 private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver"; 22 private static final String LINUX_DRIVER_PATH = "/home/canrd/webmagic/chrome/chromedriver-linux64/chromedriver";
21 private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe"; 23 private static final String WIN_BINARY_PATH = "D:\\driver\\chrome\\chrome-headless-shell-win64\\chrome-headless-shell.exe";
22 private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome"; 24 private static final String LINUX_BINARY_PATH = "/home/canrd/webmagic/chrome/chrome-linux64/chrome";
target/classes/application-local.yml
@@ -57,13 +57,13 @@ spring: @@ -57,13 +57,13 @@ spring:
57 testWhileIdle: true 57 testWhileIdle: true
58 testOnBorrow: true 58 testOnBorrow: true
59 testOnReturn: true 59 testOnReturn: true
60 - password: canrd@2024 60 + password: 123456
61 time-between-eviction-runs-millis: 1000 61 time-between-eviction-runs-millis: 1000
62 - url: jdbc:mysql://39.108.227.113:3307/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true 62 + url: jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=UTF-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=Asia/Shanghai&useSSL=false&autoReconnect=true&failOverReadOnly=false&maxReconnects=10&allowMultiQueries=true&useAffectedRows=true&autoReconnectForPools=true
63 username: root 63 username: root
64 redis: 64 redis:
65 database: 0 65 database: 0
66 - host: 39.108.227.113 66 + host: localhost
67 lettuce: 67 lettuce:
68 pool: 68 pool:
69 max-active: 2000 69 max-active: 2000
target/classes/com/canrd/webmagic/config/SeleniumConfig.class
No preview for this file type
target/classes/com/canrd/webmagic/controller/MatterController.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/ChemicalPagePcoessor.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/MatterPragePcoessor.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/download/SeleniumDownloader.class
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/BrowserDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyChromeDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyEdgeDriver.class 0 → 100644
No preview for this file type
target/classes/com/canrd/webmagic/processor/driver/MyFirefoxDriver.class 0 → 100644
No preview for this file type