NatureSearchPageProcessor.java
5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package com.canrd.webmagic.processor;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.domain.dto.NatureArticleDO;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;
import java.util.List;
import java.util.Objects;
/**
 * Crawls nature.com search-result pages, follows each article link, and
 * extracts article metadata (code, title, publish time, authors, e-mails).
 *
 * @author: xms
 * @date: 2024/4/1 14:19
 * @version: 1.0
 */
@Component
public class NatureSearchPageProcessor implements PageProcessor {

    /** Crawler configuration: 3 retries per request, 100 ms delay between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Core extraction entry point. URLs containing "search" are treated as
     * search-result listing pages; every other URL is parsed as an article
     * detail page.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().get().contains("search")) {
            doArticleList(page);
        } else {
            doArticleContent(page);
        }
    }

    /**
     * Parses one article detail page: extracts the article code (last URL path
     * segment), title, publish time, concatenated author names and the
     * corresponding authors' e-mail addresses, then stores the result as a
     * {@code NatureArticleDO} under the "article" key for the pipeline.
     *
     * @param page downloaded article detail page
     */
    private void doArticleContent(Page page) {
        Html html = page.getHtml();
        // The article code is the last path segment of the URL.
        String[] urlParts = page.getUrl().get().split("/");
        String articleCode = urlParts[urlParts.length - 1];

        Selectable headSelectable = html.xpath("//div[@class='c-article-header']/header");
        List<Selectable> authorEmailSelectables = html.xpath("//p[@id='corresponding-author-list']/a").nodes();

        // Some article layouts nest the title one <div> deeper; fall back if blank.
        String title = headSelectable.xpath("//div/h1/text()").get();
        if (StringUtils.isBlank(title)) {
            title = headSelectable.xpath("//h1/text()").get();
        }

        // NOTE(review): assumes the third <li> of the first <ul> holds the publish
        // <time>; an IndexOutOfBoundsException is thrown if the layout changes.
        String publishTime = headSelectable.xpath("//ul").nodes().get(0)
                .xpath("//li").nodes().get(2)
                .xpath("//li/time/text()").get();

        Selectable authorSelectable = headSelectable.xpath("//ul").nodes().get(1)
                .select(new XpathSelector("li[@class='c-article-author-list__item']"));

        // StringBuilder instead of StringBuffer: no concurrent access here.
        // Names are concatenated with no separator, matching prior behavior.
        StringBuilder authorName = new StringBuilder();
        for (Selectable node : authorSelectable.nodes()) {
            authorName.append(node.xpath("//a/text()"));
        }

        // Collect {authorEmailName, email} pairs for every corresponding author.
        JSONArray emailInfo = new JSONArray();
        for (Selectable authorEmailSelectable : authorEmailSelectables) {
            // The href is a "mailto:user@host" URI; the address follows the last ':'.
            // String.split never returns null, so only the empty-array case needs a guard.
            String[] mailtoParts = authorEmailSelectable.xpath("//a").links().get().split(":");
            String email = mailtoParts.length == 0 ? "" : mailtoParts[mailtoParts.length - 1];
            String authorEmailName = authorEmailSelectable.xpath("//a/text()").get();
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("authorEmailName", authorEmailName);
            jsonObject.put("email", email);
            emailInfo.add(jsonObject);
        }

        System.out.println("code:" + articleCode + ",发布时间:" + publishTime + ",标题:" + title
                + ",作者:" + authorName + ",邮箱信息:" + emailInfo.toJSONString());

        page.putField("article", NatureArticleDO.builder()
                .articleCode(articleCode)
                .authorName(authorName.toString())
                .title(title)
                .publishTime(publishTime)
                .emailInfo(emailInfo.toJSONString())
                .build());
    }

    /**
     * Parses a search-result listing page: selects every article row, extracts
     * each card's link, queues it as a new crawl target, and logs the title.
     *
     * @param page downloaded search listing page
     */
    private void doArticleList(Page page) {
        // Select all article rows on the listing page via CSS, then narrow to
        // the <li> items with an XPath selector.
        Selectable selectable = page.getHtml().$(".app-article-list-row").select(
                new XpathSelector("li[@class='app-article-list-row__item']"));
        List<Selectable> nodes = selectable.nodes();

        // NOTE(review): the loop starts at i = 1, skipping the first row —
        // confirm the first item really is not an article before changing this.
        for (int i = 1; i <= nodes.size() - 1; i++) {
            // NOTE(review): the .get(2)/.get(0) chain assumes a fixed card DOM;
            // it throws IndexOutOfBoundsException if the markup changes.
            Selectable node = nodes.get(i).$(".u-full-height").nodes().get(2).nodes().get(0)
                    .$(".u-full-height")
                    .select(new XpathSelector("a[@class='c-card__link u-link-inherit']"))
                    .nodes().get(0);
            String link = node.$("a", "href").get();
            page.addTargetRequest(link);
            String absoluteLink = node.links().get();
            String title = node.$("a", "text").get();
            System.out.printf("%d、%s,访问地址:%s%n", i, title, absoluteLink);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Standalone entry point: crawls the Nature search results for "battery"
     * with 5 worker threads, persisting articles via {@code NatureArticlePipeline}.
     */
    public static void main(String[] args) {
        Spider.create(new NatureSearchPageProcessor())
                .addUrl("https://www.nature.com/search?q=battery&page=1")
                .addPipeline(new NatureArticlePipeline())
                .thread(5).run();
    }
}