Science4JournalArticlePageProcessor.java
5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
/**
 * WebMagic {@link PageProcessor} that extracts the fields of one Science journal
 * article page (title, description, publish date, author names and author e-mail
 * entries) and publishes them on the page under the {@code "article"} result field
 * as an {@link ArticleDO}.
 *
 * <p>Created 2024/4/1 14:19.
 *
 * @author xms
 * @version 1.0
 */
@Slf4j
@Component
public class Science4JournalArticlePageProcessor implements PageProcessor {

    /** XPath of the {@code <article>} element inside the article container. */
    private static final String ARTICLE_XPATH = "//div[@class='article-container']/article";

    /** XPath of the header {@code <div>} that holds title, date and contributors. */
    private static final String HEADER_DIV_XPATH = "//div[@class='article-container']/article/header/div";

    /** Pattern the publish date text is parsed with (English month names). */
    private static final String PUBLISH_DATE_PATTERN = "dd MMMM yyyy";

    // Crawl configuration: retry count, sleep time between requests and a random user agent.
    // NOTE(review): setSleepTime(5) is kept as in the original; confirm the unit/value is intended.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Entry point called by the crawler for every downloaded page; delegates to the
     * article extraction logic.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Extracts the article fields from the page and stores the resulting
     * {@link ArticleDO} in the page's {@code "article"} field.
     *
     * <p>The page URL is used as the article code. When the publish date cannot be
     * parsed, the raw text extracted from the page (possibly {@code null}) is stored
     * instead of a formatted date.
     *
     * @param page the downloaded page to extract from
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        Selectable header = html.xpath(HEADER_DIV_XPATH);

        String title = header.xpath("//div[@class='core-lede']/div/text()").get();
        String articleDesc = html.xpath(ARTICLE_XPATH).xpath("//section[@id='bodymatter']/div/div/text()").get();
        String publishTime = header.xpath("//div[@class='core-self-citation']")
                .xpath("//div[@class='core-date-published']/span/text()").get();
        Date publishTimeDateTime = parsePublishTime(publishTime, articleCode);

        String authorName = extractAuthorNames(header);
        JSONArray authorEmail = extractAuthorEmails(header);

        // Five placeholders, five arguments: the author names were previously missing,
        // which shifted the e-mail JSON into the author slot.
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}",
                articleCode, publishTime, title, authorName, authorEmail.toJSONString());

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(authorEmail.toJSONString())
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null).build());
    }

    /**
     * Parses the publish date text with {@link #PUBLISH_DATE_PATTERN}.
     *
     * @param publishTime the raw date text from the page, may be {@code null}
     * @param articleCode the article URL, used only for log context
     * @return the parsed date, or {@code null} when the text is missing or unparseable
     */
    private Date parsePublishTime(String publishTime, String articleCode) {
        if (publishTime == null) {
            // The date element was not found; parse(null) would throw a NullPointerException.
            log.warn("No publish time found for article {}", articleCode);
            return null;
        }
        try {
            // A new formatter per call: SimpleDateFormat is not safe to share between crawler threads.
            return new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH).parse(publishTime.trim());
        } catch (ParseException e) {
            log.warn("Unparseable publish time '{}' for article {}", publishTime, articleCode, e);
            return null;
        }
    }

    /**
     * Collects the author display names from the contributors section.
     *
     * @param header the article header selection
     * @return the names, each followed by a single space; empty when none are found
     */
    private String extractAuthorNames(Selectable header) {
        List<Selectable> authorNodes = header.xpath("//div[@class='contributors']/span/span/span").nodes();
        StringBuilder names = new StringBuilder();
        for (Selectable node : authorNodes) {
            String name = node.xpath("//a/span/text()").get();
            if (name != null) {
                // Skip nodes without a name so the literal text "null" is never stored.
                names.append(name).append(" ");
            }
        }
        return names.toString();
    }

    /**
     * Builds one JSON object per author node with the keys {@code authorEmailName}
     * and {@code email}.
     *
     * @param header the article header selection
     * @return a JSON array with one entry per author node, possibly empty
     */
    private JSONArray extractAuthorEmails(Selectable header) {
        JSONArray authorEmail = new JSONArray();
        List<Selectable> authorSelectables = header.xpath("//div[@class='contributors']")
                .xpath("//span[@class='authors']")
                .xpath("//span[@role='list']")
                .xpath("//span[@property='author']")
                .nodes();
        for (Selectable authorSelectable : authorSelectables) {
            String givenName = authorSelectable.xpath("//span[@property='givenName']/text()").get();
            String familyName = authorSelectable.xpath("//span[@property='familyName']/text()").get();
            String email = authorSelectable.xpath("//a[@property='email']/text()").get();
            JSONObject entry = new JSONObject();
            // NOTE(review): given and family name are joined without a separator, as in the
            // original; confirm whether a space was intended before changing stored data.
            entry.put("authorEmailName", givenName + "" + familyName);
            entry.put("email", email);
            authorEmail.add(entry);
        }
        return authorEmail;
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the current site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Replaces the crawl configuration.
     *
     * @param site the site configuration to use
     */
    public void setSite(Site site) {
        this.site = site;
    }

    /**
     * Runs this processor stand-alone against a fixed start URL with an
     * {@link ArticlePipeline} and five threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create a spider around this processor.
        Spider.create(new Science4JournalArticlePageProcessor())
                // Seed URL to crawl.
                // NOTE(review): this looks like a listing page rather than an article page; confirm.
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // Run with 5 threads and start crawling.
                .thread(5).run();
    }
}