Commit fc48df03d3045b0ebb63bee84448f6f72538f8e8

Authored by 谢茂盛
1 parent fde047e7

feat:

1. Add crawling ("爬取") of Science journal keyword-search result pages
src/main/java/com/canrd/webmagic/controller/Science4JournalController.java
1 1 package com.canrd.webmagic.controller;
2 2  
3 3 import com.canrd.webmagic.common.constant.ServerResult;
  4 +import com.canrd.webmagic.common.utils.KeywordUtil;
4 5 import com.canrd.webmagic.processor.Science4JournalSearchPageProcessor;
5 6 import com.canrd.webmagic.processor.download.SeleniumDownloader;
6 7 import org.apache.logging.log4j.core.util.UuidUtil;
... ... @@ -32,7 +33,7 @@ public class Science4JournalController {
32 33 * @return
33 34 */
34 35 @GetMapping("/start")
35   - public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) {
  36 + public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
36 37 for (int i = 0; i <= indexSize; i++) {
37 38 Spider.create(science4JournalSearchPageProcessor)
38 39 .addUrl("https://www.science.org/journal/science/insights?startPage=" + i)
... ... @@ -46,5 +47,24 @@ public class Science4JournalController {
46 47 }
47 48  
48 49  
  50 + /**
  51 + * @return
  52 + */
  53 + @GetMapping("/search")
  54 + public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize) {
  55 + for (int i = 0; i <= indexSize; i++) {
  56 + for (String keyword : KeywordUtil.getKeyWordList()) {
  57 + Spider.create(science4JournalSearchPageProcessor)
  58 + .addUrl("https://www.science.org/action/doSearch?AllField=" + keyword + "&pageSize=20&startPage=" + i)
  59 + // 开启5个线程执行,并开始爬取
  60 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  61 + .setDownloader(seleniumDownloader)
  62 + .thread(5).run();
  63 + }
  64 + }
  65 +
  66 + return ServerResult.success();
  67 + }
  68 +
49 69 }
50 70  
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalArticlePageProcessor.java
... ... @@ -3,6 +3,7 @@ package com.canrd.webmagic.processor;
3 3 import com.alibaba.fastjson.JSONArray;
4 4 import com.alibaba.fastjson.JSONObject;
5 5 import com.canrd.webmagic.common.utils.DateUtil;
  6 +import com.canrd.webmagic.common.utils.StringUtils;
6 7 import com.canrd.webmagic.domain.ArticleTypeEnum;
7 8 import com.canrd.webmagic.domain.dto.ArticleDO;
8 9 import com.canrd.webmagic.processor.config.Agent;
... ... @@ -57,8 +58,14 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
57 58 Selectable headSelectable = html.xpath("//div[@class='article-container']/article/header");
58 59  
59 60 String title = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-lede']/div/text()").get();
  61 + if (StringUtils.isBlank(title)) {
  62 + title = html.xpath("//div[@class='article-container']/article/header").xpath("//h1[@property='name']/text()").get();
  63 + }
60 64  
61 65 String articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//section[@id='bodymatter']/div/div/text()").get();
  66 + if (StringUtils.isBlank(articleDesc)) {
  67 + articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get();
  68 + }
62 69  
63 70 String publishTime = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='core-self-citation']").xpath("//div[@class='core-date-published']/span/text()").get();
64 71 Date publishTimeDateTime = null;
... ... @@ -72,7 +79,11 @@ public class Science4JournalArticlePageProcessor implements PageProcessor {
72 79 List<Selectable> authorNodes = html.xpath("//div[@class='article-container']/article/header/div").xpath("//div[@class='contributors']/span/span/span").nodes();
73 80 StringBuffer authorName = new StringBuffer();
74 81 for (Selectable node : authorNodes) {
75   - authorName.append(node.xpath("//a/span/text()").get()).append(" ");
  82 + String name = node.xpath("//a/span/text()").get();
  83 + if (StringUtils.isBlank(name)) {
  84 + continue;
  85 + }
  86 + authorName.append(name).append(" ");
76 87 }
77 88  
78 89  
... ...
src/main/java/com/canrd/webmagic/processor/Science4JournalSearchPageProcessor.java
... ... @@ -53,7 +53,51 @@ public class Science4JournalSearchPageProcessor implements PageProcessor {
53 53 */
54 54 @Override
55 55 public void process(Page page) {
56   - doArticleList(page);
  56 + if (page.getUrl().get().contains("doSearch")) {
  57 + doSearch(page);
  58 + } else {
  59 + doArticleList(page);
  60 + }
  61 + }
  62 +
  63 + /**
  64 + * @param page
  65 + */
  66 + private void doSearch(Page page) {
  67 + String url = page.getUrl().get();
  68 + /**
  69 + * 通过page.getHtml()可以获取到main函数中Spider.create(new BaiduHotSearchPageProcessor()).addUrl中的地址的网页内容
  70 + * 1、通过$或css()方法获取到该page html下某元素dom
  71 + */
  72 + Selectable selectable = page.getHtml().xpath("//div[@class=' search-result__body titles-results ']").select(new XpathSelector("div[@class='card pb-3 mb-4 border-bottom']"));
  73 + List<Selectable> nodes = selectable.nodes();
  74 +
  75 + /**
  76 + * 获取到指定的dom后,从这些dom中提取元素内容。
  77 + */
  78 + for (int i = 0; i <= nodes.size() - 1; i++) {
  79 + String title = nodes.get(i).xpath("//div[@class='card pb-3 mb-4 border-bottom']/div").xpath("//div[@class='d-flex justify-content-between align-items-end']/div/span/h2/a/text()").get();
  80 + String time = nodes.get(i).xpath("//div[@class='card-meta align-middle mb-2 text-uppercase text-darker-gray']/span").nodes().get(2).xpath("//time/text()").get();
  81 + String link = nodes.get(i).links().get();
  82 + SimpleDateFormat formatter = new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH);
  83 + try {
  84 + Date publishTimeDateTime = formatter.parse(time);
  85 + if (!publishTimeDateTime.before(DateUtil.localDate2Date(DateUtil.parseDate("2000-01-01", DateUtil.DATE)))) {
  86 +// page.addTargetRequest(link);
  87 + Spider.create(science4JournalArticlePageProcessor)
  88 + .addUrl(link)
  89 + .addPipeline(articlePipeline)
  90 + .setDownloader(seleniumDownloader)
  91 + .setUUID(UuidUtil.getTimeBasedUuid().toString())
  92 + // 开启5个线程执行,并开始爬取
  93 + .thread(1).run();
  94 + log.info("关键字文章列表链接:{},标题:{},文章链接:{}", url, title, link);
  95 + }
  96 + } catch (ParseException e) {
  97 + e.printStackTrace();
  98 + }
  99 + }
  100 +
57 101 }
58 102  
59 103 /**
... ...