Univie4PhysnanoArticlePageProcessor.java
4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.DateUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.ArticleTypeEnum;
import com.canrd.webmagic.domain.dto.ArticleDO;
import com.canrd.webmagic.processor.config.Agent;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
/**
 * WebMagic {@link PageProcessor} that extracts a single article detail page and
 * publishes it as an {@link ArticleDO} of type
 * {@link ArticleTypeEnum#UNIVIE_PHYSNANO} under the page field {@code "article"}.
 *
 * <p>Extraction is positional: values are read from fixed indexes of the
 * {@code <dd>} cells inside {@code <dl class="row">}. A cell that is missing on a
 * given page yields a {@code null}/empty field instead of aborting the page.
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Slf4j
@Component
public class Univie4PhysnanoArticlePageProcessor implements PageProcessor {

    /** Pattern the publish-time cell is parsed with. */
    private static final String PUBLISH_DATE_PATTERN = "dd-yyyy";

    /** Cell indexes probed for the publish time, in order of preference. */
    private static final int[] PUBLISH_TIME_CELL_INDEXES = {10, 9};

    /** XPath of the {@code <dd>} cells that carry author / description / address. */
    private static final String SMALL_MARGIN_CELL_XPATH = "//dd[@class='col-sm-9 content-element-margin-small']";

    /** XPath that reads the text of a {@code <dd>} cell. */
    private static final String CELL_TEXT_XPATH = "//dd/text()";

    // Crawl configuration for the target site: retry count, sleep time and user agent.
    // NOTE(review): WebMagic documents the sleep time in milliseconds, so 5 means 5 ms
    // between requests - confirm this is intended.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(5).setUserAgent(Agent.getRandom());

    /**
     * Entry point called by WebMagic for every downloaded page; delegates to the
     * article extraction.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        doArticleContent(page);
    }

    /**
     * Parses the article page and stores the resulting {@link ArticleDO} in the
     * page's result items under the key {@code "article"}.
     *
     * @param page the downloaded article page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        String articleCode = page.getUrl().get();
        String title = html.xpath("//h1[@class=' content-element-margin']/text()").get();

        Selectable detailList = html.xpath("//dl[@class='row']");
        List<Selectable> descCells = detailList.xpath(SMALL_MARGIN_CELL_XPATH).nodes();
        List<Selectable> plainCells = detailList.xpath("//dd[@class='col-sm-9']").nodes();
        // The trailing slash of this selector is kept exactly as in the original code.
        List<Selectable> authorCells = html.xpath("//dl[@class='row']/").xpath(SMALL_MARGIN_CELL_XPATH).nodes();

        String articleDesc = textAt(descCells, 1, "//p/text()");

        // Probe the candidate cells in order; keep the last non-blank raw text so it
        // can be stored verbatim when none of the candidates parses as a date.
        String publishTime = null;
        Date publishTimeDateTime = null;
        for (int cellIndex : PUBLISH_TIME_CELL_INDEXES) {
            String candidate = textAt(plainCells, cellIndex, CELL_TEXT_XPATH);
            if (!StringUtils.isNotBlank(candidate)) {
                continue;
            }
            publishTime = candidate;
            publishTimeDateTime = parsePublishDate(candidate);
            if (publishTimeDateTime != null) {
                break;
            }
        }
        if (publishTimeDateTime == null) {
            log.warn("No parseable publish time on {}; raw value: {}", articleCode, publishTime);
        }

        // A missing cell becomes an empty string rather than the literal text "null".
        String authorName = Objects.toString(textAt(authorCells, 0, CELL_TEXT_XPATH), "");
        String authorAddress = Objects.toString(textAt(authorCells, 3, CELL_TEXT_XPATH), "");

        JSONArray authorEmail = new JSONArray();
        authorEmail.add(extractContact(html));
        String emailInfo = authorEmail.toJSONString();

        log.info("文章链接:{},发布时间:{},标题:{},作者:{},邮箱信息:{}", articleCode, publishTime, title, authorName, emailInfo);

        page.putField("article", ArticleDO.builder()
                .articleType(ArticleTypeEnum.UNIVIE_PHYSNANO.getType())
                .articleCode(articleCode)
                .authorName(authorName)
                .title(title)
                .publishTime(Objects.isNull(publishTimeDateTime) ? publishTime : DateUtil.format(publishTimeDateTime, DateUtil.DATE))
                .emailInfo(emailInfo)
                .articleDesc(articleDesc)
                .authorAddress(authorAddress)
                .referenceInfo(null).build());
    }

    /**
     * Returns the text selected by {@code textXpath} inside the cell at
     * {@code index}, or {@code null} when the cell does not exist or has no match.
     *
     * @param cells     the candidate cells
     * @param index     zero-based position of the wanted cell
     * @param textXpath XPath evaluated against that cell
     * @return the first matching text, or {@code null}
     */
    private static String textAt(List<Selectable> cells, int index, String textXpath) {
        if (cells == null || index < 0 || index >= cells.size()) {
            return null;
        }
        return cells.get(index).xpath(textXpath).get();
    }

    /**
     * Parses a raw publish-time text using {@link #PUBLISH_DATE_PATTERN}.
     *
     * @param raw the non-null raw cell text
     * @return the parsed date, or {@code null} when the text does not match
     */
    private static Date parsePublishDate(String raw) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat formatter = new SimpleDateFormat(PUBLISH_DATE_PATTERN, Locale.ENGLISH);
        try {
            return formatter.parse(raw.trim());
        } catch (ParseException e) {
            log.debug("Publish time candidate '{}' does not match pattern {}", raw, PUBLISH_DATE_PATTERN);
            return null;
        }
    }

    /**
     * Builds the contact JSON object from the sidebar {@code <address>} element.
     * The address text is split on {@code ':'} into {@code authorEmailName} and
     * {@code telephone}; the e-mail is taken from the first {@code <a>} inside the
     * address. Keys whose source is absent are simply left out.
     *
     * @param html the parsed page
     * @return a JSON object with whichever contact keys could be extracted
     */
    private static JSONObject extractContact(Html html) {
        JSONObject contact = new JSONObject();

        String contactText = html.xpath("//div[@class='col-md-3 sidebar content-element-margin']/aside/address/text()").get();
        if (StringUtils.isNotBlank(contactText)) {
            String[] parts = contactText.split(":");
            if (parts.length > 0) {
                contact.put("authorEmailName", parts[0]);
            }
            if (parts.length > 1) {
                contact.put("telephone", parts[1]);
            }
        }

        String anchorHtml = html.xpath("//div[@class='col-md-3 sidebar content-element-margin']/aside/address/a").get();
        if (anchorHtml != null) {
            // Drop the inner markup so the text after the opening <a ...> tag is the address.
            String stripped = anchorHtml.replace("<span>", "").replace("</span>", "").replace("</a>", "");
            if (StringUtils.isNotBlank(stripped)) {
                String[] parts = stripped.split(">");
                if (parts.length > 1) {
                    contact.put("email", parts[1]);
                }
            }
        }
        return contact;
    }

    /**
     * @return the crawl configuration used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Replaces the crawl configuration.
     *
     * @param site the new site configuration
     */
    public void setSite(Site site) {
        this.site = site;
    }
}