package com.canrd.webmagic.controller; import com.canrd.webmagic.common.constant.ServerResult; import com.canrd.webmagic.processor.NatureSearchPageProcessor; import com.canrd.webmagic.processor.download.Downloader; import com.canrd.webmagic.processor.pipeline.ArticlePipeline; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; import us.codecraft.webmagic.Spider; import javax.annotation.Resource; /** * nature-文章信息(NatureArticle)表控制层 * * @author makejava * @since 2024-04-07 18:39:41 */ @RestController @RequestMapping("/nature/article") public class ArticleController { @Resource private NatureSearchPageProcessor natureSearchPageProcessor; @Resource private ArticlePipeline articlePipeline; @Resource private Downloader downloader; /** * @return */ @GetMapping("/start") public ServerResult start(@RequestParam(value = "keyword") String keyword, @RequestParam(value = "indexSize") Integer indexSize) { for (int i = 1; i <= indexSize; i++) { Spider.create(natureSearchPageProcessor) .addUrl("https://www.nature.com/nature/research-articles?sort=PubDate&page=" + i) .addPipeline(articlePipeline) // .setDownloader(downloader.newIpDownloader()) // 开启5个线程执行,并开始爬取 .thread(5).run(); } return ServerResult.success(); } }