WebMagic + Spring Boot multi-site vertical data crawling (with queue-jump support)

Introduction to WebMagic

WebMagic is a simple and flexible Java crawler framework. Based on WebMagic, you can quickly develop an efficient, easy-to-maintain crawler. (From the official site.)
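
Before any customization, a minimal WebMagic crawler looks like the sketch below; the URL and the extracted field name are placeholders, not part of this project:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class MinimalProcessor implements PageProcessor {

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        // extract the page title with an XPath selector
        page.putField("title", page.getHtml().xpath("//title/text()").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new MinimalProcessor())
                .addUrl("https://example.com")  // placeholder URL
                .thread(2)
                .run();
    }
}

The sections below replace each of these building blocks (Downloader, Scheduler, Pipeline, listener, Spider) with a custom implementation.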

WebMagic in practice

Custom Downloader

/**
 * Custom downloader that records URLs that fail to download in Redis.
 */
@Slf4j
@Component
public class SpiderDownloader extends HttpClientDownloader {

    private static final String DOWNLOAD_START_MILLS = "download_start_mills";
    private static final String DOWNLOAD_EXPAND_MILLS = "download_expand_mills";

    @Override
    public Page download(Request request, Task task) {
        request.putExtra(DOWNLOAD_START_MILLS, System.currentTimeMillis());
        return super.download(request, task);
    }

    @Override
    protected void onSuccess(Request request) {
        super.onSuccess(request);

        calcExpandMills(request);
        log.info("download expand: {} ms, url: {}", request.getExtra(DOWNLOAD_EXPAND_MILLS), request.getUrl());
    }

    @Override
    protected void onError(Request request) {
        super.onError(request);

        calcExpandMills(request);
        log.info("download error!!! expand: {} ms, url: {}", request.getExtra(DOWNLOAD_EXPAND_MILLS), request.getUrl());

        // 将下载失败的url记录到redis

    }

    /**
     * Compute the download duration in milliseconds.
     * @param request the request stamped with the start time
     */
    private void calcExpandMills(Request request) {
        long downloadEndMills = System.currentTimeMillis();
        Object downloadStartMills = request.getExtra(DOWNLOAD_START_MILLS);
        if(downloadStartMills != null) {
            long expandMills = downloadEndMills - Long.parseLong(downloadStartMills.toString());
            request.putExtra(DOWNLOAD_EXPAND_MILLS, expandMills);
        }
    }
}
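
The onError hook above leaves the Redis write unimplemented. A minimal sketch of what it might look like with Redisson — the injected RedissonClient field and the key name spider:failed_urls are assumptions, not part of the original:

    // Hypothetical helper inside SpiderDownloader, assuming a RedissonClient is injected:
    // @Resource
    // private RedissonClient redissonClient;

    private void recordFailedUrl(Request request) {
        // key name is an assumption; pick whatever fits your key space
        RList<String> failedUrls = redissonClient.getList("spider:failed_urls");
        failedUrls.add(request.getUrl());
    }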

Custom Scheduler (based on Redis / Redisson)

/**
 * Custom scheduler: persists pushed requests to Redis queues, deduplicates URLs,
 * and polls requests (jump queue first).
 */
@Slf4j
@Component
@NoArgsConstructor
public class SpiderRedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {

    /**
     * Key prefix for the regular request queue
     */
    private static final String QUEUE_PREFIX = "queue_";

    /**
     * Key prefix for the jump (priority) request queue
     */
    private static final String QUEUE_JUMP = "jump_";

    /**
     * Key prefix for the URL deduplication set
     */
    private static final String SET_PREFIX = "set_";

    @Resource
    private CrawlerRuleService crawlerRuleService;

    @Autowired
    private RedissonClient redissonClient;


    protected String getSetKey(Task task) {
        return StringUtils.join(RedisKeyConst.spiderKeySpace, SET_PREFIX, task.getUUID());
    }

    protected String getQueueKey(Task task, boolean isJump) {
        return StringUtils.join(RedisKeyConst.spiderKeySpace, (isJump ? QUEUE_JUMP : QUEUE_PREFIX), task.getUUID());
    }

    @Override
    public void resetDuplicateCheck(Task task) {
        RSet<Object> urlSet = redissonClient.getSet(getSetKey(task));
        urlSet.delete();
    }

    @Override
    public void push(Request request, Task task) {
        if (this.shouldReserved(request) || this.noNeedToRemoveDuplicate(request) || !isDuplicate(request, task)) {
            this.pushWhenNoDuplicate(request, task);
        }
    }

    @Override
    public boolean isDuplicate(Request request, Task task) {
        RSetCache<Object> urlSet = redissonClient.getSetCache(getSetKey(task));
        boolean seen = urlSet.contains(request.getUrl());
        if (!seen) {
            // first sighting: add the URL to the Redis set with a 60-minute TTL,
            // after which the page becomes eligible for recrawl
            urlSet.add(request.getUrl(), 60, TimeUnit.MINUTES);
            return false;
        }
        return true;
    }

    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
        if (null == requestInfo) {
            // requests without the REQUEST_INFO extra are silently dropped
            return;
        }
        boolean jump = requestInfo.isJump();
        // push the request into the matching Redis deque:
        // content pages go to the tail, list/detail pages to the head
        RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task, jump));
        Integer type = requestInfo.getType();
        if (CrawlerTypeEnum.CONTENT.getType().equals(type)) {
            deque.addLast(request);
            return;
        }
        deque.addFirst(request);
    }

    @Override
    public Request poll(Task task) {
        // poll from the jump (priority) deque first, then fall back to the regular deque
        RDeque<Object> jumpDeque = redissonClient.getDeque(getQueueKey(task, true));
        Request request = pollWithStatus(jumpDeque);
        if(null != request){
            return request;
        }
        RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task, false));
        return pollWithStatus(deque);
    }

    private Request pollWithStatus(RDeque<Object> deque) {
        while (true) {
            Request request = (Request) deque.pollFirst();
            if (request == null) {
                return null;
            }
            CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
            if (null == requestInfo) {
                // malformed request: skip it and keep polling
                continue;
            }
            CrawlerRule crawlerInfo = crawlerRuleService.getByRuleId(requestInfo.getRuleId());
            if (null == crawlerInfo) {
                // rule no longer exists: skip the request
                continue;
            }
            int openStatus = crawlerInfo.getOpenStatus();
            if (openStatus == 0) {
                log.info("url: {}, rule: {} is disabled, skipping", request.getUrl(), crawlerInfo.getRuleName());
                continue;
            }
            return request;
        }
    }


    @Override
    public int getLeftRequestsCount(Task task) {
        RDeque<Object> jumpDeque = redissonClient.getDeque(getQueueKey(task,true));
        RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task,false));
        return jumpDeque.size() + deque.size();
    }

    public int getLeftRequestsCount(String domain){
        if(StrUtil.isBlank(domain)){
            return 0;
        }
        RDeque<Object> deque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_PREFIX + domain);
        return deque.size();
    }


    @Override
    public int getTotalRequestsCount(Task task) {
        RSetCache<Object> urlSet = redissonClient.getSetCache(getSetKey(task));
        return urlSet.size();
    }

    public int getTotalRequestsCount(String domain) {
        if(StrUtil.isBlank(domain)){
            return 0;
        }
        RSetCache<Object> urlSet = redissonClient.getSetCache(RedisKeyConst.spiderKeySpace + SET_PREFIX + domain);
        return urlSet.size();
    }


    public void delAllRequest(String domain){
        RDeque<Object> deque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_PREFIX + domain);
        deque.delete();
        RSetCache<Object> urlSet = redissonClient.getSetCache(RedisKeyConst.spiderKeySpace + SET_PREFIX + domain);
        urlSet.delete();
    }
}
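
Two ordering rules are worth calling out: poll drains the jump deque before the regular one, and within a deque, content-page requests are appended to the tail while list/detail requests go to the head, so index pages get expanded promptly. Note also that pushWhenNoDuplicate silently drops any request lacking the REQUEST_INFO extra. A minimal push sketch, assuming a scheduler and a spider (which implements Task) are in scope; the URL and rule id are placeholders:

Request request = new Request("https://example.com/novel/123");        // placeholder URL
CrawlerRequestDto info = CrawlerRequestDto.builder()
        .url("https://example.com/novel/123")
        .ruleId("rule-1")                                              // placeholder rule id
        .type(CrawlerTypeEnum.DETAIL.getType())
        .build();
request.putExtra(RequestConst.REQUEST_INFO, info);                     // without this extra, the push is dropped
scheduler.push(request, spider);                                       // goes to the head of the regular deque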

Custom Pipeline

/**
 * Custom pipeline: implements Pipeline to persist extracted data.
 */
@Slf4j
@Component
public class SpiderSavePipLine implements Pipeline {

    @SneakyThrows
    @Override
    public void process(ResultItems resultItems, Task task) {
        Request request = resultItems.getRequest();
        CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
        if(null == requestInfo){
            return;
        }
        Integer type = requestInfo.getType();
        if (CrawlerTypeEnum.DETAIL.getType().equals(type)) {
            // save detail-page info (left empty here; a sketch follows this class)
        }
        if (CrawlerTypeEnum.CONTENT.getType().equals(type)) {
            // save content-page info
        }
    }
}
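
Both branches are left empty in the original. A hypothetical sketch of the detail branch, assuming the PageProcessor stored fields in ResultItems under names like novelName and author (the field names and the persistence service are assumptions):

        if (CrawlerTypeEnum.DETAIL.getType().equals(type)) {
            // field names are assumptions; they must match what the processor putField'd
            String novelName = resultItems.get("novelName");
            String author = resultItems.get("author");
            log.info("saving detail page: {} by {}, url: {}", novelName, author, request.getUrl());
            // novelInfoService.save(novelName, author, ...);  // hypothetical persistence service
        }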

Custom SpiderListener


/**
 * Custom listener: counts successful and failed requests, and collects the failed ones.
 */
@Component
public class SpiderEventListener implements SpiderListener {
    private final AtomicInteger successCount = new AtomicInteger(0);
    private final AtomicInteger failCount = new AtomicInteger(0);

    private final List<Request> failRequests = new CopyOnWriteArrayList<>();

    @Resource
    private RedissonClient redissonClient;

    @Resource
    private CrawlerRuleService crawlerRuleService;

    @Override
    public void onSuccess(Request request) {
        successCount.incrementAndGet();
        CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
        if(null == requestInfo){
            return;
        }
        String countDownSpace = requestInfo.getCountDownSpace();
        if (StrUtil.isNotBlank(countDownSpace)) {
            if (null == redissonClient) {
                return;
            }
            // notify any caller blocked on this batch's distributed latch
            RCountDownLatch latch = redissonClient.getCountDownLatch(countDownSpace);
            latch.countDown();
        }
    }

    @Override
    public void onError(Request request) {
        failRequests.add(request);
        failCount.incrementAndGet();
    }

    public AtomicInteger getSuccessCount() {
        return successCount;
    }

    public AtomicInteger getFailCount() {
        return failCount;
    }

    public List<Request> getFailRequests() {
        return failRequests;
    }


    /**
     * Look up rule info by rule id.
     * @param ruleId the rule id
     * @return the rule, or null if it cannot be found
     */
    private CrawlerRule getCrawlerInfo(String ruleId) {
        if (null == ruleId) {
            return null;
        }
        CrawlerRule crawlerRule = crawlerRuleService.getByRuleId(ruleId);
        if (null == crawlerRule || StrUtil.isBlank(crawlerRule.getId())) {
            return null;
        }
        return crawlerRule;
    }
}
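
The countDownSpace mechanism turns a crawl batch into something a caller can block on: the caller sizes a distributed RCountDownLatch, tags each request's CrawlerRequestDto with the latch name, and awaits; the listener above counts the latch down on every success. A caller-side sketch (the latch name and count are placeholders):

// Wait for 5 tagged requests to finish, with a timeout.
RCountDownLatch latch = redissonClient.getCountDownLatch("spider:batch:abc"); // placeholder name
latch.trySetCount(5); // number of requests tagged with countDownSpace = "spider:batch:abc"
// ... push the 5 requests ...
boolean finished = latch.await(10, TimeUnit.MINUTES); // throws InterruptedException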

Custom PageProcessor

@Slf4j
@Component
public class SpiderProcessor implements PageProcessor {

    @Resource
    private NovelProcessorFactory novelProcessorFactory;

    @Resource
    private CrawlerRuleService crawlerRuleService;


    @Value("${spider.retryTimes:3000}")
    private int retryTimes = 3000;

    @Value("${spider.sleepTime:3000}")
    private int sleepTime = 3000;

    @Value("${spider.timeOut:60000}")
    private int timeOut = 60000;


    @Override
    public void process(Page page) {
        CrawlerRequestDto requestInfo =
                page.getRequest().getExtra(RequestConst.REQUEST_INFO);
        if (null == requestInfo) {
            return;
        }
        int type = requestInfo.getType();
        CrawlerRule crawlerInfo = getCrawlerInfo(requestInfo.getRuleId());
        if (null == crawlerInfo) {
            return;
        }
        int openStatus = crawlerInfo.getOpenStatus();
        if (openStatus == 0) {
            // the rule has been disabled
            log.info("rule: {} is disabled", crawlerInfo.getRuleName());
            return;
        }
        NovelProcessor novelProcessor = novelProcessorFactory.getProcessor(type);
        if (null == novelProcessor) {
            return;
        }
        novelProcessor.process(page, requestInfo, crawlerInfo);
    }

    /**
     * Look up rule info by rule id.
     * @param ruleId the rule id
     * @return the rule, or null if it cannot be found
     */
    private CrawlerRule getCrawlerInfo(String ruleId) {
        if (null == ruleId) {
            return null;
        }
        CrawlerRule crawlerRule = crawlerRuleService.getByRuleId(ruleId);
        if (null == crawlerRule || StrUtil.isBlank(crawlerRule.getId())) {
            return null;
        }
        return crawlerRule;
    }


    @Override
    public Site getSite() {
        Site site = Site.me().setRetryTimes(retryTimes).setSleepTime(sleepTime).setTimeOut(timeOut);
        site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        site.addHeader("Accept-Encoding", "gzip, deflate");
        site.addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        site.addHeader("Cache-Control", "max-age=0");
        site.addHeader("Connection", "keep-alive");
//        site.addHeader("Cookie", "Hm_lvt_42e120beff2c918501a12c0d39a4e067=1566530194,1566819135,1566819342,1566963215; Hm_lpvt_42e120beff2c918501a12c0d39a4e067=1566963215");
//        site.addHeader("Host", "www.yousuu.com");
        site.addHeader("Upgrade-Insecure-Requests", "1");
        site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");

        return site;
    }
}

Picking a processor for each crawl type

@Component
public class NovelProcessorFactory {

    private static final Map<Integer, NovelProcessor> NOVEL_PROCESSOR_MAP = new HashMap<>();

    @Autowired
    private void setProcessorStrategy(
           NovelListProcessor novelListProcessor,
           NovelInfoProcessor novelInfoProcessor,
           NovelContentProcessor novelContentProcessor
    ){
        NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
                CrawlerTypeEnum.LIST.getType(),novelListProcessor);
        NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
                CrawlerTypeEnum.DETAIL.getType(),novelInfoProcessor);
        NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
                CrawlerTypeEnum.CONTENT.getType(),novelContentProcessor);
    }

    public NovelProcessor getProcessor(Integer crawlerType) {
        return NOVEL_PROCESSOR_MAP.get(crawlerType);
    }
}
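
The factory maps each CrawlerTypeEnum value to a strategy bean. The NovelProcessor interface itself is not shown in the post; judging from the call site in SpiderProcessor.process, it presumably looks like this (a reconstruction, not the original source):

public interface NovelProcessor {

    /**
     * Handle one page of the given crawl type: extract data into the page's
     * ResultItems and/or push follow-up requests.
     */
    void process(Page page, CrawlerRequestDto requestInfo, CrawlerRule crawlerInfo);
}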

Custom Spider

public class SpiderStartContainer extends Spider {

    private SpiderEventListener listener;

    private RedissonClient redissonClient;

    private int status = 0; // mirrors WebMagic's stat (0 = init, 1 = running, 2 = stopped); 3 marks "closed"

    public SpiderStartContainer(PageProcessor pageProcessor, SpiderEventListener listener, RedissonClient redissonClient) {
        super(pageProcessor);
        this.redissonClient = redissonClient;

        List<SpiderListener> spiderListeners = this.getSpiderListeners();
        if(CollectionUtils.isEmpty(spiderListeners)) {
            spiderListeners = new ArrayList<>();
        }
        this.listener = listener;
        spiderListeners.add(listener);
        this.setSpiderListeners(spiderListeners);
    }

    public static SpiderStartContainer create(PageProcessor pageProcessor,SpiderEventListener listener,RedissonClient redissonClient) {
        return new SpiderStartContainer(pageProcessor,listener, redissonClient);
    }

    @Override
    public void run() {
        super.run();
        if(scheduler != null && scheduler instanceof DuplicateRemover) {
            ((DuplicateRemover) scheduler).resetDuplicateCheck(this);
        }
        // when the crawl finishes, handle the requests that failed to download
        List<Request> failRequests = listener.getFailRequests();
        if (CollectionUtils.isNotEmpty(failRequests)) {
            // retry/report logic is left empty in the original
        }
    }

    @Override
    public void close() {
        super.close();
        this.status = 3;
    }

    public void spiderClose(String countDownSpace) {
        this.close();
        spiderCountDown(countDownSpace);
    }

    public void spiderStop(String countDownSpace){
        this.stop();
        this.status = this.stat.intValue();
        spiderCountDown(countDownSpace);
    }

    public void spiderStart(){
        spiderStart(null);
    }


    public void spiderStart(String countDownSpace){
        logger.info("===================== check spider:"+this.getUUID()+" status =====================");
        if(this.getSpiderStatus() == 3){
            logger.info("===================== spider:"+this.getUUID()+" is close! spider will be restart! =====================");
            this.spiderStop(null);
        }
        while (this.getSpiderStatus() == 3){
            logger.info("===================== spider:"+this.getUUID()+" is to stop =====================");
        }
        if(this.stat.intValue() == 2 || this.stat.intValue() == 0){
            this.start();

            this.status = this.stat.intValue();
        }
        logger.info("===================== spider:"+this.getUUID()+" is start =====================");
        spiderCountDown(countDownSpace);
    }



    public void spiderJumpQueue(Request request){
        CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
        if(null == requestInfo){
            return;
        }
        requestInfo.setJump(true);
        this.addRequest(request);
        spiderStart();
    }

    public String getSpiderUUID() {
        return this.getUUID();
    }

    public Integer getSpiderStatus() {
        // 0 = init, 1 = running, 2 = stopped (WebMagic's Spider.stat); 3 = closed (custom)
        if (status == 3) {
            return status;
        }
        return stat.get();
    }

    public int getLeftRequestsCount(String domain){
        SpiderRedisScheduler scheduler = (SpiderRedisScheduler)this.getScheduler();
        if(null != scheduler){
            return scheduler.getLeftRequestsCount(domain);
        }
        return 0;
    }

    public int getTotalRequestsCount(String domain){
        SpiderRedisScheduler scheduler = (SpiderRedisScheduler)this.getScheduler();
        if(null != scheduler){
            return scheduler.getTotalRequestsCount(domain);
        }
        return 0;
    }

    private void spiderCountDown(String countDownSpace){
        if(null == redissonClient){
            return;
        }
        if(StrUtil.isNotBlank(countDownSpace)){
            RCountDownLatch latch = redissonClient.getCountDownLatch(countDownSpace);
            latch.countDown();
        }
    }

}

Spider producer (one cached spider instance per domain):

@Slf4j
@Configuration
public class SpiderStartContainerFactory {
    @Value("${spider.threadNum:0}")
    private int threadNum;
    /**
     * Shared executor used by every spider instance
     */
    private static ExecutorService executorService;

    @Resource
    private SpiderProcessor spiderProcessor;

    @Resource
    private SpiderEventListener spiderEventListener;

    @Resource
    private SpiderDownloader spiderDownloader;

    @Resource
    private SpiderRedisScheduler spiderRedisScheduler;

    @Resource
    private SpiderSavePipLine spiderSavePipLine;

    @Resource
    private RedissonClient redissonClient;

    private static final Map<String, SpiderStartContainer> startContainerMap = Collections.synchronizedMap(new HashMap<>());

    public SpiderStartContainer getStartContainer(String domain) {
        // computeIfAbsent is atomic on the synchronized map, so concurrent callers
        // for the same domain always receive the same spider instance
        return startContainerMap.computeIfAbsent(domain, key -> {
            SpiderStartContainer spiderStartContainer =
                    SpiderStartContainer.create(spiderProcessor, spiderEventListener, redissonClient);
            spiderStartContainer.setDownloader(spiderDownloader);
            spiderStartContainer.setScheduler(spiderRedisScheduler);
            spiderStartContainer.setPipelines(Collections.singletonList(spiderSavePipLine));
            spiderStartContainer.setExecutorService(executorService);
            return spiderStartContainer;
        });
    }

    public SpiderStartContainer getStartContainer(Request request) {
        CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
        if(null == requestInfo){
            return null;
        }
        UrlBuilder urlBuilder = UrlBuilder.ofHttp(requestInfo.getUrl(), CharsetUtil.CHARSET_UTF_8);
        String domain = urlBuilder.getHost();
        return getStartContainer(domain);
    }

    @PostConstruct
    public void initExecutorService() {
        // default thread count: CPU cores * 2
        if (threadNum == 0) {
            threadNum = Runtime.getRuntime().availableProcessors() * 2;
        }
        log.info("================ spider thread num: {} ================", threadNum);
        SpiderStartContainerFactory.executorService = Executors.newFixedThreadPool(threadNum);
    }
}
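
All the @Value keys used above and in SpiderProcessor (spider.threadNum, spider.retryTimes, spider.sleepTime, spider.timeOut) can be overridden in application.properties; the values below are illustrative, not recommendations:

spider.threadNum=8
spider.retryTimes=3
spider.sleepTime=3000
spider.timeOut=60000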

Starting the crawl

/**
 * Crawl bootstrap: runs at application startup and implements the remote
 * interfaces so the spiders can be controlled via remote calls.
 */
@Slf4j
@Component
public class SpiderStart implements ApplicationRunner, SpiderContainerRemote,SpiderJobStartRemote {

    @Autowired
    private SpiderStartContainerFactory spiderStartContainerFactory;

    @Resource
    private CrawlerRuleService crawlerRuleService;

    private int status;

    @Override
    public void run(ApplicationArguments args){
        this.start();
    }

    @Override
    public void start() {
        List<CrawlerRule> allRules = crawlerRuleService.getAll();
        List<CrawlerRule> collect = allRules.stream()
                .filter(rule -> rule.getOpenStatus() == 1)
                .collect(Collectors.toList());
        for (CrawlerRule crawlerRule : collect) {
            CrawlerListRule listRule = crawlerRule.getListRule();
            String sourceUrl = listRule.getSourceUrl();
            int pageStartRule = listRule.getPageStartRule();
            String url = sourceUrl.replace(RequestConst.PAGE_REPLACE, String.valueOf(pageStartRule));
            Request request = new Request(url);
            CrawlerRequestDto requestInfo = CrawlerRequestDto.builder()
                    .url(url)
                    .ruleId(crawlerRule.getRuleId())
                    .type(CrawlerTypeEnum.LIST.getType())
                    .build();
            request.putExtra(RequestConst.REQUEST_INFO,requestInfo);
            SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(getDomain(url));
            spiderStartContainer.addRequest(request);
            spiderStartContainer.spiderStart();
            status = 1;
        }
    }



    @Override
    public int getStarStatus() {
        return this.status;
    }



    public int getStatus() {
        return status;
    }

    public void setStatus(int status) {
        this.status = status;
    }

    private String getDomain(String url){
        UrlBuilder urlBuilder = UrlBuilder.ofHttp(url, CharsetUtil.CHARSET_UTF_8);
        return urlBuilder.getHost();
    }

    @Override
    public Integer getSpiderStatus(String domain) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        if(spiderStartContainer == null){
            return null;
        }
        return spiderStartContainer.getSpiderStatus();
    }

    @Override
    public void spiderClose(String domain, String countDownSpace) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        spiderStartContainer.spiderClose(countDownSpace);
    }

    @Override
    public void spiderStop(String domain, String countDownSpace) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        spiderStartContainer.spiderStop(countDownSpace);
    }

    @Override
    public void spiderStart(String domain) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        spiderStartContainer.spiderStart();
    }

    @Override
    public void spiderStart(String domain, String countDownSpace) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        spiderStartContainer.spiderStart(countDownSpace);
    }

    @Override
    public void spiderJumpQueue(Request request) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(getDomain(request.getUrl()));
        spiderStartContainer.spiderJumpQueue(request);
    }

    @Override
    public void spiderJumpQueue(Request request, String domain) {
        SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
        spiderStartContainer.spiderJumpQueue(request);
    }
}
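
Putting queue-jumping together: a caller builds a tagged Request and hands it to spiderJumpQueue, which sets jump=true (routing it to the priority deque that poll drains first) and starts or wakes the matching domain's spider. A sketch assuming a SpiderStart bean named spiderStart; the URL and rule id are placeholders:

Request request = new Request("https://example.com/novel/123/1.html"); // placeholder URL
CrawlerRequestDto info = CrawlerRequestDto.builder()
        .url("https://example.com/novel/123/1.html")
        .ruleId("rule-1")                                              // placeholder rule id
        .type(CrawlerTypeEnum.CONTENT.getType())
        .build();
request.putExtra(RequestConst.REQUEST_INFO, info);
spiderStart.spiderJumpQueue(request); // flags jump=true and (re)starts the matching spider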

Source code

GitHub (Birds Novel / 飞鸟小说): https://github.com/caobinrg/birds-novel
Gitee (Birds Novel / 飞鸟小说): https://gitee.com/caobinrg/birds-novel
