关于otter监控告警使用

Experience-摆渡

已于 2025-03-02 15:02:29 修改

阅读量1.1k

点赞数 24

分类专栏： otter专题文章标签：分布式中间件 otter

于 2024-11-30 17:08:19 首次发布

本文链接：https://blog.csdn.net/wukuncsdn/article/details/144155787

版权

otter专题专栏收录该内容

5 篇文章

订阅专栏

一、背景

近期在使用otter完成单机房单向同步时，常常遇到channel假死的情况，导致Pipeline同步停止、系统表数据同步中断，影响生产环境中用户数据查询相关的功能。虽然事后能够通过先停用channel再重新启用channel来重启同步任务，恢复需要同步的数据，但该问题时常出现却不能及时发现，让人头疼不已。查阅相关资料后发现，早有同学遇到过类似的问题，初步认为是otter的调度算法引发死锁，进而导致任务停止，但目前仍然没有实锤，且otter开源团队尚未对该问题给出官方解答。具体问题描述可参考：https://github.com/alibaba/otter/issues/911

为了解决该问题，曾尝试在线下复现，但以失败告终，于是换了一个思路：能否及时发现同步停止的问题呢？按照这个思路，我看到官方目前其实支持五种同步告警：延迟、Pipeline延迟、Process延迟、Position延迟、异常监控告警，同时还提供了告警自我恢复机制。通过测试其中的Pipeline延迟告警并开启自我恢复机制，发现确实能够及时地完成告警，而且在触发自我恢复阈值的情况下，系统能够自动完成channel的stop，然后再自动start，使channel的同步任务恢复正常。下面就针对otter监控告警机制进行说明。

二、otter监控机制解析：

首先看一下otter监控机制的流程：1. 首先我们通过otter的控制台为channel下的Pipeline配置监控。2. otter-manager在启动时会启用一个单线程的定时任务线程池，定时任务每120秒执行一次。2.1 该线程池任务在执行时会查询当前所有启用的监控记录；2.2 然后对监控规则按照Pipeline进行分组；2.3 遍历分组后的具体规则；2.4 查询Pipeline信息，然后根据Pipeline中的channelId查询zookeeper上对应channel的状态，若channel的状态节点为null或者处于停止状态，则不执行后续监控逻辑；2.5 根据不同的监控类型执行具体的判断逻辑：若当前统计的数据满足监控告警的条件则执行告警逻辑，若开启了自我恢复机制则尝试恢复channel的同步任务。源码分析如下：

#一、SelfMonitor中的start()开启监控
/**
 * Starts the self-monitor if it is not running yet.
 *
 * <p>The scheduler and the periodic job are each created at most once; calling this method again
 * while both exist changes nothing. The job runs with a fixed delay of {@code interval} seconds
 * (the accompanying article states 120 seconds).
 */
private synchronized void start() {
        // Create the scheduler on first use only; pool size is DEFAULT_POOL
        // (a single thread according to the original note).
        if (executor == null) {
            executor = new ScheduledThreadPoolExecutor(DEFAULT_POOL,
                                                       new NamedThreadFactory("Self-Monitor"),
                                                       new ThreadPoolExecutor.CallerRunsPolicy());
        }

        if (future != null) {
            // The periodic job is already registered.
            return;
        }

        Runnable exploreTask = new Runnable() {

            public void run() {
                try {
                    // Delegates to the monitor (GlobalMonitor#explore() per the original note).
                    monitor.explore();
                } catch (Exception e) {
                    // Log and continue: an exception escaping run() would suppress
                    // all subsequent executions of a fixed-delay task.
                    log.error("self-monitor failed.", e);
                }
            }
        };
        future = executor.scheduleWithFixedDelay(exploreTask, interval, interval, TimeUnit.SECONDS);
    }

#二、GlobalMonitor#explore() 获取监控列表，默认是并行执行监控任务
/**
 * Runs one monitoring pass.
 *
 * <p>First checks every enabled alarm rule, then, when {@code recoveryPaused} is set, runs the
 * automatic-recovery pass over all channel ids. Both passes run concurrently or serially
 * depending on {@code needConcurrent}.
 */
public void explore() {
        // Enabled alarm rules, grouped by pipeline (per the original note).
        Map<Long, List<AlarmRule>> rulesByPipeline = alarmRuleService.getAlarmRules(AlarmRuleStatus.ENABLE);
        if (CollectionUtils.isEmpty(rulesByPipeline)) {
            log.warn("no enabled alarm rule at all. Check the rule setting please!");
        } else if (needConcurrent) {
            concurrentProcess(rulesByPipeline);
        } else {
            // Serial execution.
            serialProcess(rulesByPipeline);
        }

        // Automatic recovery is optional; it runs even when no alarm rule is enabled.
        if (!recoveryPaused) {
            return;
        }

        List<Long> allChannelIds = channelService.listAllChannelId();
        if (needConcurrent) {
            concurrentProcess(allChannelIds);
        } else {
            // Serial execution.
            serialProcess(allChannelIds);
        }
    }

#三、GlobalMonitor#concurrentProcess()将每个Pipeline的监控列表提交给线程池去执行
/**
 * Checks the alarm rules of every entry in parallel and waits until all checks have finished.
 *
 * <p>One task per map entry is submitted to {@code executor}; each task hands the entry's rule
 * list to {@code pipelineMonitor.explore}. Failures do not stop the wait: they are collected and
 * reported together once every task has been accounted for.
 *
 * <p>If the waiting thread is interrupted, the interruption is recorded as a failure and the
 * thread's interrupt status is restored before this method exits, so callers can still observe it.
 *
 * @param rules alarm rules grouped by key (pipeline, according to the caller's note)
 * @throws IllegalStateException if at least one task failed or the wait was interrupted; the
 *         message contains the stack traces of all collected failures
 */
private void concurrentProcess(Map<Long, List<AlarmRule>> rules) {
        ExecutorCompletionService<Object> completionExecutor = new ExecutorCompletionService<Object>(executor);
        // Only the number of submitted tasks is needed later, not the futures themselves.
        int submitted = 0;
        for (Entry<Long, List<AlarmRule>> entry : rules.entrySet()) {
            final List<AlarmRule> alarmRules = entry.getValue();
            completionExecutor.submit(new Callable<Object>() {

                @Override
                public Object call() throws Exception {
                    pipelineMonitor.explore(alarmRules);
                    return null;
                }
            });
            submitted++;
        }

        List<Throwable> exceptions = new ArrayList<Throwable>();
        boolean interrupted = false;
        for (int done = 0; done < submitted; done++) {
            try {
                completionExecutor.take().get();
            } catch (InterruptedException e) {
                // Defer re-interrupting until after the loop; doing it here would make every
                // following take() fail immediately instead of waiting for its task.
                interrupted = true;
                exceptions.add(e);
            } catch (ExecutionException e) {
                exceptions.add(e);
            }
        }

        if (interrupted) {
            // Restore the interrupt status that catching InterruptedException cleared.
            Thread.currentThread().interrupt();
        }

        if (!exceptions.isEmpty()) {
            StringBuilder sb = new StringBuilder(exceptions.size() + " exception happens in global monitor\n");
            sb.append("exception stack start :\n");
            for (Throwable t : exceptions) {
                sb.append(ExceptionUtils.getStackTrace(t));
            }
            sb.append("exception stack end \n");
            throw new IllegalStateException(sb.toString());
        }
    }

#四、PipelineMonitor#explore(List<AlarmRule> rules)完成不同类型的监控分发执行
/**
 * Dispatches the alarm rules of one pipeline to the monitor responsible for each rule type.
 *
 * <p>The pipeline is taken from the first rule only, so all rules are expected to belong to the
 * same pipeline. Nothing is checked when the owning channel has no status or is stopped. Rules
 * for which {@code checkEnable} returns {@code false} are skipped.
 *
 * @param rules alarm rules of a single pipeline; {@code null} or an empty list is a no-op
 */
public void explore(List<AlarmRule> rules) {
        // Without at least one rule there is no pipeline to look up.
        if (rules == null || rules.isEmpty()) {
            return;
        }

        Long pipelineId = rules.get(0).getPipelineId();
        Pipeline pipeline = pipelineService.findById(pipelineId);
        // Ignore alarms while the channel is stopped or has no status.
        ChannelStatus status = arbitrateManageService.channelEvent().status(pipeline.getChannelId());
        if (status == null || status.isStop()) {
            return;
        }

        List<AlarmRule> delayTimeRules = new LinkedList<AlarmRule>();
        List<AlarmRule> pipelineTimeoutRules = new LinkedList<AlarmRule>();
        List<AlarmRule> processTimeoutRules = new LinkedList<AlarmRule>();
        List<AlarmRule> positionTimeoutRules = new LinkedList<AlarmRule>();

        // One timestamp for the whole pass so every rule is judged against the same instant.
        Date now = new Date();
        for (AlarmRule rule : rules) {
            switch (rule.getMonitorName()) {
                case DELAYTIME:
                    if (checkEnable(rule, now)) {
                        delayTimeRules.add(rule);
                    }
                    break;
                case EXCEPTION:
                    // Exception rules are not dispatched by this method.
                    // NOTE(review): presumably handled by a separate monitor - confirm.
                    break;
                case PIPELINETIMEOUT:
                    if (checkEnable(rule, now)) {
                        pipelineTimeoutRules.add(rule);
                    }
                    break;
                case PROCESSTIMEOUT:
                    if (checkEnable(rule, now)) {
                        processTimeoutRules.add(rule);
                    }
                    break;
                case POSITIONTIMEOUT:
                    if (checkEnable(rule, now)) {
                        positionTimeoutRules.add(rule);
                    }
                    break;
                default:
                    break;
            }
        }

        if (!delayTimeRules.isEmpty()) {
            delayStatRuleMonitor.explore(delayTimeRules);
        }

        if (!pipelineTimeoutRules.isEmpty()) {
            pipelineTimeoutRuleMonitor.explore(pipelineTimeoutRules);
        }

        if (!processTimeoutRules.isEmpty()) {
            processTimeoutRuleMonitor.explore(processTimeoutRules);
        }

        if (!positionTimeoutRules.isEmpty()) {
            positionTimeoutRuleMonitor.explore(positionTimeoutRules);
        }
    }

#五、具体的监控实现类完成监控规则检查、实施告警和自我恢复机制处理