// NatureController.java 2.92 KB
package com.canrd.webmagic.controller;

import com.canrd.webmagic.common.constant.ServerResult;
import com.canrd.webmagic.common.utils.KeywordUtil;
import com.canrd.webmagic.common.utils.StringUtils;
import com.canrd.webmagic.processor.NatureSearchPageProcessor;
import com.canrd.webmagic.processor.download.Downloader;
import com.canrd.webmagic.processor.pipeline.ArticlePipeline;
import org.apache.logging.log4j.core.util.UuidUtil;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;

import javax.annotation.Resource;

/**
 * REST controller that launches WebMagic crawls of nature.com article listings
 * and search results.
 *
 * @author makejava
 * @since 2024-04-07 18:39:41
 */
@RestController
@RequestMapping("/nature/article")
public class NatureController {

    /** Number of worker threads each spider is started with. */
    private static final int SPIDER_THREADS = 5;

    /** Listing of Nature research articles sorted by publication date; the page number is appended. */
    private static final String RESEARCH_ARTICLES_URL =
            "https://www.nature.com/nature/research-articles?sort=PubDate&page=";

    /** Base of the nature.com search URL; the keyword and further query parameters are appended. */
    private static final String SEARCH_URL = "https://www.nature.com/search?q=";

    @Resource
    private NatureSearchPageProcessor natureSearchPageProcessor;

    @Resource
    private ArticlePipeline articlePipeline;

    // Only referenced by the currently disabled proxy-downloader option in start().
    @Resource
    private Downloader downloader;

    /**
     * Crawls pages {@code 1..indexSize} of the Nature research-articles listing,
     * running one spider per page, one after another.
     *
     * @param indexSize number of listing pages to crawl; a value below 1 crawls nothing
     * @return a success result, returned after every spider's {@code run()} call has returned
     */
    @GetMapping("/start")
    public ServerResult start(@RequestParam(value = "indexSize") Integer indexSize) {
        for (int page = 1; page <= indexSize; page++) {
            Spider.create(natureSearchPageProcessor)
                    .addUrl(RESEARCH_ARTICLES_URL + page)
                    .addPipeline(articlePipeline)
                    // Proxy downloader is currently disabled:
                    // .setDownloader(downloader.newIpDownloader())
                    // Crawl with 5 threads and start immediately.
                    .thread(SPIDER_THREADS)
                    .run();
        }

        return ServerResult.success();
    }

    /**
     * Runs a nature.com search crawl for every keyword returned by
     * {@link KeywordUtil#getKeyWordList()} on each of the pages {@code 1..indexSize}.
     * One spider is run per (page, keyword) pair, each with its own time-based UUID.
     *
     * <p>NOTE(review): unlike {@link #start(Integer)}, no pipeline is attached here —
     * confirm that is intentional.
     *
     * @param indexSize number of result pages to crawl per keyword; a value below 1 crawls nothing
     * @param orderBy   value for the {@code order} query parameter; omitted from the URL when blank
     * @param journal   value for the {@code journal} query parameter; omitted from the URL when blank
     * @return a success result, returned after every spider's {@code run()} call has returned
     */
    @GetMapping("/search")
    public ServerResult search(@RequestParam(value = "indexSize") Integer indexSize,
                               @RequestParam(value = "orderBy") String orderBy,
                               @RequestParam(value = "journal") String journal) {
        for (int page = 1; page <= indexSize; page++) {
            for (String keyword : KeywordUtil.getKeyWordList()) {
                // Build a fresh URL for every request. Reusing one buffer across
                // iterations would glue each keyword onto the previous URL.
                String url = buildSearchUrl(keyword, page, orderBy, journal);
                Spider.create(natureSearchPageProcessor)
                        .addUrl(url)
                        .setUUID(UuidUtil.getTimeBasedUuid().toString())
                        // Crawl with 5 threads and start immediately.
                        .thread(SPIDER_THREADS)
                        .run();
            }
        }
        return ServerResult.success();
    }

    /**
     * Builds the search URL for a single keyword and result page.
     * Parameter order is {@code q}, optional {@code order}, {@code page}, optional {@code journal}.
     * Values are appended verbatim; no URL-encoding is applied.
     */
    private static String buildSearchUrl(String keyword, int page, String orderBy, String journal) {
        StringBuilder url = new StringBuilder(SEARCH_URL).append(keyword);
        if (StringUtils.isNotBlank(orderBy)) {
            url.append("&order=").append(orderBy);
        }
        url.append("&page=").append(page);
        if (StringUtils.isNotBlank(journal)) {
            url.append("&journal=").append(journal);
        }
        return url.toString();
    }
}