Science4SpjArticlePageProcessor.java
5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
/**
 * WebMagic {@link PageProcessor} that extracts the metadata of a single article page
 * (title, first paragraph, publish date, authors and author e-mail addresses) and
 * publishes it as an {@link ArticleDO} under the {@code "article"} result field,
 * tagged with {@link ArticleTypeEnum#SCIENCE_SPJ}.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Science4SpjArticlePageProcessor implements PageProcessor {

    /** Pattern used to parse the publish date text (day, English month name, year). */
    private static final String PUBLISH_DATE_PATTERN = "dd MMMM yyyy";

    // Crawl configuration for the target site: retry count, sleep time and user agent.
    // NOTE(review): WebMagic documents setSleepTime in milliseconds, so this is a 5 ms
    // pause between pages — confirm that 5 ms (and not 5 s) is intended.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Core extension point of the crawler; the extraction logic lives here.
     *
     * @param page the downloaded page to extract the article from
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Parses the article page and stores the extracted {@link ArticleDO} in the
     * page's result items under the key {@code "article"}.
     *
     * @param page the downloaded article page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        // The page URL doubles as the article's unique code.
        String articleCode = page.getUrl().get();

        Selectable articleSelectable = html.xpath("//article[@xmlns='http://www.w3.org/1999/xhtml']");
        Selectable headSelectable = articleSelectable.xpath("//header/div");

        // Title: try the "core-lede" block first, then fall back to the <h1> headline.
        String title = headSelectable.xpath("//div[@class='core-lede']/div/text()").get();
        if (StringUtils.isBlank(title)) {
            title = headSelectable.xpath("//h1[@property='name']/text()").get();
        }

        // Description: first paragraph of the article, with a fallback to the
        // article element nested in the "article-container" div.
        String articleDesc = articleSelectable.xpath("//div[@role='paragraph']/text()").get();
        if (StringUtils.isBlank(articleDesc)) {
            articleDesc = html.xpath("//div[@class='article-container']/article").xpath("//div[@role='paragraph']/text()").get();
        }

        String publishTime = headSelectable.xpath("//span[@property='datePublished']/text()").get();
        Date publishTimeDateTime = parsePublishTime(publishTime, articleCode);

        // Collect author names and author e-mail entries in a single pass over the author nodes.
        StringBuilder authorName = new StringBuilder();
        JSONArray authorEmail = new JSONArray();
        List<Selectable> authorNodes = headSelectable.xpath("//span[@property='author']").nodes();
        for (Selectable authorNode : authorNodes) {
            String givenName = authorNode.xpath("//span[@property='givenName']/text()").get();
            String familyName = authorNode.xpath("//span[@property='familyName']/text()").get();
            String fullName = joinName(givenName, familyName);

            // Authors without any name part are left out of the name list.
            // The trailing comma after each name is kept as the stored format.
            if (!fullName.isEmpty()) {
                authorName.append(fullName).append(",");
            }

            String email = authorNode.xpath("//a[@property='email']/text()").get();
            if (StringUtils.isBlank(email)) {
                continue;
            }
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("authorEmailName", fullName);
            jsonObject.put("email", email);
            authorEmail.add(jsonObject);
        }

        String authorNames = authorName.toString();
        String emailInfo = authorEmail.toJSONString();
        // One argument per placeholder: url, publish time, title, authors, e-mail info.
        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorNames, emailInfo);

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.SCIENCE_SPJ.getType())
                .articleCode(articleCode)
                .authorName(authorNames)
                .title(title)
                // Fall back to the raw page text when the date could not be parsed.
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(emailInfo)
                .articleDesc(articleDesc)
                .authorAddress(null)
                .referenceInfo(null).build());
    }

    /**
     * Parses the publish date text taken from the page.
     *
     * @param publishTime raw date text, may be {@code null} or blank when the page has no date
     * @param articleUrl  URL of the page, used only for log context
     * @return the parsed date, or {@code null} when the text is missing or does not match
     *         {@link #PUBLISH_DATE_PATTERN}
     */
    private Date parsePublishTime(String publishTime, String articleUrl) {
        // SimpleDateFormat.parse(null) throws NullPointerException, so guard before parsing.
        if (StringUtils.isBlank(publishTime)) {
            log.warn("No publish time found on page: {}", articleUrl);
            return null;
        }
        // SimpleDateFormat is not thread-safe and the spider may run several threads,
        // so a fresh instance is created per call instead of sharing one.
        SimpleDateFormat formatter = new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH);
        try {
            return formatter.parse(publishTime.trim());
        } catch (ParseException e) {
            log.warn("Unparseable publish time '{}' on page: {}", publishTime, articleUrl, e);
            return null;
        }
    }

    /**
     * Joins given and family name with a single space, skipping blank parts so that a
     * missing part never ends up as the literal text {@code "null"}.
     *
     * @param givenName  given name, may be {@code null} or blank
     * @param familyName family name, may be {@code null} or blank
     * @return the joined name, or an empty string when both parts are blank
     */
    private static String joinName(String givenName, String familyName) {
        boolean givenMissing = StringUtils.isBlank(givenName);
        boolean familyMissing = StringUtils.isBlank(familyName);
        if (givenMissing && familyMissing) {
            return "";
        }
        if (givenMissing) {
            return familyName;
        }
        if (familyMissing) {
            return givenName;
        }
        return givenName + " " + familyName;
    }

    /**
     * @return the crawl configuration used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Replaces the crawl configuration used by this processor.
     *
     * @param site the new site configuration
     */
    public void setSite(Site site) {
        this.site = site;
    }

    /**
     * Stand-alone entry point that runs this processor in a spider.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create a Spider and hand it this processor.
        Spider.create(new Science4SpjArticlePageProcessor())
                // Seed URL the spider starts crawling from.
                .addUrl("https://www.science.org/journal/science/insights?startPage=0")
                .addPipeline(new ArticlePipeline())
                // Run with 5 threads and start crawling.
                .thread(5).run();
    }
}