2021-06-16 datax,running异常一直0 byteSpeed

本文档展示了DataX中AbstractScheduler类的代码片段,主要关注当任务运行异常,长时间保持RUNNING状态且byteSpeed为0时,如何进行超时处理,将状态设为FAILED并记录日志。该机制确保了任务执行的可靠性,防止任务卡死导致的调度问题。

业务类AbstractScheduler,具体代码

下面展示一些 内联代码片

//2021-06-16 wx 为了设置一个时间段,加入一直是running,而且byteSpeed为0,说明卡住了,设置5分钟之后就直接返回failed
long finalTime = System.currentTimeMillis()+300000;
/*----------------------------------------2021-06-16 wx-------功能:卡在running部分,不往下走了,设置5分钟之后就直接抛出failed------------------------------------------*/
                Long byteSpeed =  nowJobContainerCommunication.getLongCounter(CommunicationTool.BYTE_SPEED);
                if(now>finalTime&&nowJobContainerCommunication.getState() == State.RUNNING && byteSpeed==0){//如果超出了规定时间,还是再running那就赋予FAILED
                    nowJobContainerCommunication.setState(State.FAILED);
                    LOG.info("Scheduler out off time---------------------------- running 异常,超时处理!!!");//调度 卡住超时
                }
                /*----------------------------------------2021-06-16 wx-------功能:卡在running部分,不往下走了,设置5分钟之后就直接抛出failed------------------------------------------*/

业务类AbstractScheduler

下面展示一些 内联代码片

package com.alibaba.datax.core.job.scheduler;

import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.core.statistics.communication.Communication;
import com.alibaba.datax.core.statistics.communication.CommunicationTool;
import com.alibaba.datax.core.statistics.container.communicator.AbstractContainerCommunicator;
import com.alibaba.datax.core.util.ErrorRecordChecker;
import com.alibaba.datax.core.util.FrameworkErrorCode;
import com.alibaba.datax.core.util.container.CoreConstant;
import com.alibaba.datax.dataxservice.face.domain.enums.State;
import org.apache.commons.lang.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

public abstract class AbstractScheduler {
    private static final Logger LOG = LoggerFactory
            .getLogger(AbstractScheduler.class);

    private ErrorRecordChecker errorLimit;

    private AbstractContainerCommunicator containerCommunicator;

    private Long jobId;

    public Long getJobId() {
        return jobId;
    }

    public AbstractScheduler(AbstractContainerCommunicator containerCommunicator) {
        this.containerCommunicator = containerCommunicator;
    }

    public void schedule(List<Configuration> configurations) {
        Validate.notNull(configurations,
                "scheduler配置不能为空");
        int jobReportIntervalInMillSec = configurations.get(0).getInt(
                CoreConstant.DATAX_CORE_CONTAINER_JOB_REPORTINTERVAL, 30000);
        int jobSleepIntervalInMillSec = configurations.get(0).getInt(
                CoreConstant.DATAX_CORE_CONTAINER_JOB_SLEEPINTERVAL, 10000);

        this.jobId = configurations.get(0).getLong(
                CoreConstant.DATAX_CORE_CONTAINER_JOB_ID);

        errorLimit = new ErrorRecordChecker(configurations.get(0));

        /**
         * 给 taskGroupContainer 的 Communication 注册
         */
        this.containerCommunicator.registerCommunication(configurations);

        int totalTasks = calculateTaskCount(configurations);
        startAllTaskGroup(configurations);

        Communication lastJobContainerCommunication = new Communication();

        long lastReportTimeStamp = System.currentTimeMillis();

        long finalTime = System.currentTimeMillis()+300000;//2021-06-16 wx 为了设置一个时间段,加入一直是running,而且byteSpeed为0,说明卡住了,设置5分钟之后就直接返回failed
        try {
            while (true) {
                /**
                 * step 1: collect job stat
                 * step 2: getReport info, then report it
                 * step 3: errorLimit do check
                 * step 4: dealSucceedStat();
                 * step 5: dealKillingStat();
                 * step 6: dealFailedStat();
                 * step 7: refresh last job stat, and then sleep for next while
                 *
                 * above steps, some ones should report info to DS
                 *
                 */
                Communication nowJobContainerCommunication = this.containerCommunicator.collect();
                nowJobContainerCommunication.setTimestamp(System.currentTimeMillis());
                LOG.debug(nowJobContainerCommunication.toString());

                //汇报周期
                long now = System.currentTimeMillis();
                if (now - lastReportTimeStamp > jobReportIntervalInMillSec) {
                    Communication reportCommunication = CommunicationTool
                            .getReportCommunication(nowJobContainerCommunication, lastJobContainerCommunication, totalTasks);

                    this.containerCommunicator.report(reportCommunication);
                    lastReportTimeStamp = now;
                    lastJobContainerCommunication = nowJobContainerCommunication;
                }

                errorLimit.checkRecordLimit(nowJobContainerCommunication);



                if (nowJobContainerCommunication.getState() == State.SUCCEEDED) {
                    LOG.info("Scheduler accomplished all tasks.");//调度 完成了所有任务
                    break;
                }

                /*----------------------------------------2021-06-16 wx-------功能:卡在running部分,不往下走了,设置5分钟之后就直接抛出failed------------------------------------------*/
                Long byteSpeed =  nowJobContainerCommunication.getLongCounter(CommunicationTool.BYTE_SPEED);
                if(now>finalTime&&nowJobContainerCommunication.getState() == State.RUNNING && byteSpeed==0){//如果超出了规定时间,还是再running那就赋予FAILED
                    nowJobContainerCommunication.setState(State.FAILED);
                    LOG.info("Scheduler out off time---------------------------- running 异常,超时处理!!!");//调度 卡住超时
                }
                /*----------------------------------------2021-06-16 wx-------功能:卡在running部分,不往下走了,设置5分钟之后就直接抛出failed------------------------------------------*/

                if (isJobKilling(this.getJobId())) {
                    dealKillingStat(this.containerCommunicator, totalTasks);
                } else if (nowJobContainerCommunication.getState() == State.FAILED) {
                    dealFailedStat(this.containerCommunicator, nowJobContainerCommunication.getThrowable());
                }

                Thread.sleep(jobSleepIntervalInMillSec);// 循环睡10s
            }
        } catch (InterruptedException e) {
            // 以 failed 状态退出
            LOG.error("捕获到InterruptedException异常!", e);

            throw DataXException.asDataXException(
                    FrameworkErrorCode.RUNTIME_ERROR, e);
        }

    }

    protected abstract void startAllTaskGroup(List<Configuration> configurations);

    protected abstract void dealFailedStat(AbstractContainerCommunicator frameworkCollector, Throwable throwable);

    protected abstract void dealKillingStat(AbstractContainerCommunicator frameworkCollector, int totalTasks);

    private int calculateTaskCount(List<Configuration> configurations) {
        int totalTasks = 0;
        for (Configuration taskGroupConfiguration : configurations) {
            totalTasks += taskGroupConfiguration.getListConfiguration(
                    CoreConstant.DATAX_JOB_CONTENT).size();
        }
        return totalTasks;
    }

//    private boolean isJobKilling(Long jobId) {
//        Result<Integer> jobInfo = DataxServiceUtil.getJobInfo(jobId);
//        return jobInfo.getData() == State.KILLING.value();
//    }

    protected  abstract  boolean isJobKilling(Long jobId);
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值