// NatureMaterialController.java 1.91 KB
package com.canrd.webmagic.controller;

import com.canrd.webmagic.common.constant.ServerResult;
import com.canrd.webmagic.processor.MatterPagePcoessor;
import com.canrd.webmagic.processor.NatureMaterialPagePcoessor;
import com.canrd.webmagic.processor.download.SeleniumDownloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.RedisScheduler;

import javax.annotation.Resource;

/**
 * REST controller exposing the endpoint that launches the WebMagic crawl of the
 * Nature / Nature Materials article listings and search result pages.
 */
@RestController
@RequestMapping("/nature-material/article")
@Api("Nature")
public class NatureMaterialController {

    /** Seed pages the spider begins crawling from. */
    private static final String[] START_URLS = {
            "https://www.nature.com/nmat/articles",
            "https://www.nature.com/search?q=battery&journal=nmat&order=relevance",
            "https://www.nature.com/search?q=batteries&journal=nmat&order=relevance",
            "https://www.nature.com/search?q=battery",
            "https://www.nature.com/nature/research-articles"
    };

    /** Host of the Redis instance that backs the spider's URL scheduler. */
    private static final String REDIS_HOST = "127.0.0.1";

    /** Number of worker threads the spider is configured with. */
    private static final int CRAWLER_THREADS = 60;

    @Resource
    private NatureMaterialPagePcoessor natureMaterialPagePcoessor;
    @Resource
    private ArticlePipeline articlePipeline;

    /**
     * Builds a spider over {@link #START_URLS} and launches it in the background.
     *
     * <p>The spider is started with {@code runAsync()} rather than the blocking
     * {@code run()}, so the HTTP request returns as soon as the crawl has been
     * kicked off instead of holding the request thread until the crawl ends.
     *
     * @return a success result once the crawl has been started (not once it has finished)
     */
    @GetMapping("/start")
    @ApiOperation("start")
    public ServerResult start() {
        Spider.create(natureMaterialPagePcoessor)
                // Seed URLs this spider starts crawling from.
                .addUrl(START_URLS)
                // A fresh UUID is assigned to every launch.
                .setUUID(UuidUtil.getTimeBasedUuid().toString())
                .addPipeline(articlePipeline)
                .setScheduler(new RedisScheduler(REDIS_HOST))
                // Use CRAWLER_THREADS worker threads and start crawling without blocking the caller.
                .thread(CRAWLER_THREADS)
                .runAsync();
        return ServerResult.success();
    }
}