yarn flink,spark实时程序监控

背景: 生产上的spark和flink程序偶尔会挂掉。虽然已经基于阿里云配置了邮箱告警,但是由于程序配置了重启策略,重启后的程序再次挂掉时不会再被监控到,所以需要自己实现对yarn的监控。

实现方案

        通过yarn提供的api,可以获取与集群yarn web ui相同的应用数据。按Application Type进行过滤,每2分钟调用一次api,对比前后2次处于RUNNING状态的实时程序,判断上一次列表中的程序是否都还存在,不存在则发送邮件告警。

 1.环境配置

   导入依赖,导入相关hadoop配置文件 yarn-site.xml,core-site.xml,hdfs-site.xml

        <!-- Hadoop/YARN client libraries. All four artifacts share one version through the
             ${hadoop.version} property, which has to be defined in the enclosing POM. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- JavaMail; presumably what util.MailUtils uses to send the alert mail (verify against
             the MailUtils source, which is not part of this snippet). -->
        <dependency>
            <groupId>javax.mail</groupId>
            <artifactId>mail</artifactId>
            <version>1.4.7</version>
        </dependency>

实现代码

   比对前后2次处于RUNNING状态的job,将后一次缺失的job标记为失败并发送邮件。发送邮件的util放在了github,具体配置可以参考我之前的博客: https://blog.csdn.net/yumingzhu1/article/details/88576291

import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import util.MailUtils;

import java.io.IOException;
import java.net.ConnectException;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Monitors the Flink and Spark applications running on YARN.
 *
 * <p>{@link #main} polls the ResourceManager at a fixed interval, compares the names of the
 * applications that are currently RUNNING with the names seen by the previous poll, and sends a
 * mail listing every name that has disappeared.
 *
 * <p>The previous-poll snapshot is kept in a plain static list without synchronization, so the
 * polling method is meant to be driven from a single thread.
 *
 * @author eamon.yu
 * @since 2021-07-13
 */
public class YarnMonitor {

    /** YARN application type string matched for Flink jobs. */
    private static final String FLINK_APP_TYPE = "Apache Flink";

    /** YARN application type string matched for Spark jobs. */
    private static final String SPARK_APP_TYPE = "SPARK";

    /** Name of the Spark application that is excluded from monitoring. */
    private static final String IGNORED_APP_NAME = "Thrift JDBC/ODBC Server";

    /** Pause between two polls of the ResourceManager. */
    private static final long POLL_INTERVAL_SECONDS = 120;

    /**
     * Names of the applications that the previous successful poll saw in state RUNNING.
     */
    static List<String> lastRunJob = new ArrayList<>();

    /**
     * Creates, initializes and starts a YARN client.
     *
     * <p>Every caller must invoke {@code stop()} on the returned client, normally in a
     * {@code finally} block.
     */
    private static YarnClient startClient() {
        YarnClient client = YarnClient.createYarnClient();
        client.init(new YarnConfiguration());
        client.start();
        return client;
    }

    /**
     * Looks up the application ID of a Flink job by its name.
     *
     * <p>Only applications in state RUNNING, ACCEPTED or SUBMITTED are considered.
     *
     * @param jobName name of the job to look for
     * @return the ID of the first matching application, or {@code null} when no application
     *     matches or the ResourceManager query fails (the failure is printed to stderr)
     */
    public static String getAppId(String jobName) {
        YarnClient client = startClient();
        try {
            EnumSet<YarnApplicationState> liveStates = EnumSet.of(
                    YarnApplicationState.RUNNING,
                    YarnApplicationState.ACCEPTED,
                    YarnApplicationState.SUBMITTED);
            for (ApplicationReport report : client.getApplications(liveStates)) {
                String name = report.getName();
                if (name != null && name.equals(jobName)
                        && FLINK_APP_TYPE.equals(report.getApplicationType())) {
                    return report.getApplicationId().toString();
                }
            }
        } catch (YarnException | IOException e) {
            e.printStackTrace();
        } finally {
            // Runs on every exit path, including the early return and query failures.
            client.stop();
        }
        return null;
    }

    /**
     * Fetches the current YARN state of an application.
     *
     * @param appId application ID in its string form, as accepted by
     *     {@code ApplicationId.fromString}
     * @return the application's state, or {@code null} when the report cannot be fetched
     *     (the failure is printed to stderr)
     */
    public static YarnApplicationState getState(String appId) {
        YarnClient client = startClient();
        try {
            ApplicationId applicationId = ApplicationId.fromString(appId);
            return client.getApplicationReport(applicationId).getYarnApplicationState();
        } catch (YarnException | IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Also releases the client when fromString rejects a malformed ID.
            client.stop();
        }
    }

    /**
     * Entry point: polls YARN forever and mails an alert whenever monitored jobs disappear.
     *
     * @param args unused
     * @throws Exception if the sleep between two polls is interrupted
     */
    public static void main(String[] args) throws Exception {
        while (true) {
            String value = yarnContainsSparkOrFlinkJob();
            if (StringUtils.isNotBlank(value)) {
                String content = value + "\t 可能出现异常,尽快处理";
                System.out.println(content);
                try {
                    MailUtils.send(content);
                } catch (Exception e) {
                    // A mail failure must not end the monitoring loop; the alert text has
                    // already been printed above.
                    e.printStackTrace();
                }
            }
            TimeUnit.SECONDS.sleep(POLL_INTERVAL_SECONDS);
        }
    }

    /**
     * Checks whether an application with the given name is currently RUNNING on YARN.
     *
     * <p>Prints the number of RUNNING applications and, for every match, its ID, name, queue
     * and user to stdout.
     *
     * @param appName application name to look for
     * @return {@code true} if at least one RUNNING application has that name; {@code false}
     *     otherwise, including when the query fails
     */
    public static boolean yarnIsContains(String appName) {
        YarnClient yarnClient = startClient();
        boolean isContains = false;
        try {
            List<ApplicationReport> applications =
                    yarnClient.getApplications(EnumSet.of(YarnApplicationState.RUNNING));
            System.out.println(applications.size());
            for (ApplicationReport application : applications) {
                String name = application.getName();
                if (name != null && name.equals(appName)) {
                    System.out.println("ApplicationId ============> " + application.getApplicationId());
                    System.out.println("name ============> " + application.getName());
                    System.out.println("queue ============> " + application.getQueue());
                    System.out.println("user ============> " + application.getUser());
                    System.out.println(applications);
                    isContains = true;
                }
            }
        } catch (YarnException | IOException e) {
            e.printStackTrace();
        } finally {
            yarnClient.stop();
        }
        return isContains;
    }

    /**
     * Polls YARN for RUNNING Flink and Spark applications and reports the ones that were
     * running at the previous poll but are not running any more.
     *
     * <p>Applications are compared by name. After a successful query {@link #lastRunJob} is
     * replaced with the names seen by this poll; when the query fails it is left unchanged.
     *
     * @return the missing application names joined with {@code ","}, or {@code null} when
     *     nothing is missing or the query failed
     */
    public static String yarnContainsSparkOrFlinkJob() {
        YarnClient yarnClient = startClient();
        try {
            List<ApplicationReport> applications = yarnClient.getApplications(
                    Sets.newHashSet(FLINK_APP_TYPE, SPARK_APP_TYPE),
                    EnumSet.of(YarnApplicationState.RUNNING));

            List<String> thisRunningJob = new ArrayList<>();
            for (ApplicationReport application : applications) {
                // The Spark Thrift JDBC/ODBC Server is not a monitored job.
                if (application.getYarnApplicationState() == YarnApplicationState.RUNNING
                        && !IGNORED_APP_NAME.equals(application.getName())) {
                    thisRunningJob.add(application.getName());
                }
            }

            // Anything seen last time that is absent now is treated as failed.
            List<String> missing = new ArrayList<>(lastRunJob);
            missing.removeAll(thisRunningJob);

            lastRunJob.clear();
            lastRunJob.addAll(thisRunningJob);

            return missing.isEmpty() ? null : String.join(",", missing);
        } catch (YarnException e) {
            e.printStackTrace();
        } catch (ConnectException ignored) {
            // Deliberately not handled (kept from the original): the snapshot is untouched,
            // so the next successful poll still compares against the last good state.
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            yarnClient.stop();
        }
        return null;
    }
}

   /** Names of the applications that the previous successful poll saw in state RUNNING. */
   static List<String> lastRunJob = new ArrayList<>();

    /**
     * Polls YARN for RUNNING Flink and Spark applications and reports the ones that were
     * running at the previous poll but are not running any more.
     *
     * <p>Applications are compared by name. After a successful query {@code lastRunJob} is
     * replaced with the names seen by this poll; when the query fails it is left unchanged.
     *
     * @return the missing application names joined with {@code ","}, or {@code null} when
     *     nothing is missing or the query failed
     */
    public static String yarnContainsSparkOrFlinkJob() {
        Configuration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();

        try {
            List<ApplicationReport> applications = yarnClient.getApplications(
                    Sets.newHashSet("Apache Flink", "SPARK"),
                    EnumSet.of(YarnApplicationState.RUNNING));

            List<String> thisRunningJob = new ArrayList<>();
            for (ApplicationReport application : applications) {
                // The Spark Thrift JDBC/ODBC Server is not a monitored job. The literal is
                // the receiver so that an application without a name cannot cause an NPE.
                if (application.getYarnApplicationState() == YarnApplicationState.RUNNING
                        && !"Thrift JDBC/ODBC Server".equals(application.getName())) {
                    thisRunningJob.add(application.getName());
                }
            }

            // Anything seen last time that is absent now is treated as failed.
            List<String> missing = new ArrayList<>(lastRunJob);
            missing.removeAll(thisRunningJob);

            // The snapshot is refreshed in one place, whether or not something went missing.
            lastRunJob.clear();
            lastRunJob.addAll(thisRunningJob);

            return missing.isEmpty() ? null : String.join(",", missing);
        } catch (YarnException e) {
            e.printStackTrace();
        } catch (ConnectException ignored) {
            // Deliberately not handled (kept from the original): the snapshot is untouched,
            // so the next successful poll still compares against the last good state.
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            yarnClient.stop();
        }
        return null;
    }
    /**
     * Entry point: polls YARN every 120 seconds, forever, and mails an alert whenever
     * {@code yarnContainsSparkOrFlinkJob()} reports missing jobs.
     *
     * @param args unused
     * @throws Exception if the sleep between two polls is interrupted
     */
    public static void main(String[] args) throws Exception {
        while (true) {
            String value = yarnContainsSparkOrFlinkJob();
            if (StringUtils.isNotBlank(value)) {
                String content = value + "\t 可能出现异常,尽快处理";
                System.out.println(content);
                try {
                    MailUtils.send(content);
                } catch (Exception e) {
                    // A mail failure must not end the monitoring loop; the alert text has
                    // already been printed above.
                    e.printStackTrace();
                }
            }
            TimeUnit.SECONDS.sleep(120);
        }
    }

github 对应地址: https://github.com/yumingzhu/testYarn

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值