背景: 生产上的 spark 和 flink 程序会挂掉。虽然已基于阿里云配置了邮箱告警,但由于程序的重启策略配置,重启后的程序下次再挂掉时不会再被监控到,所以需要手动监控 yarn。
实现方案
yarn 提供的api, 可以访问集群yarn web ui 接口数据,对Application Type进行过滤 ,每2分钟调用api,对比前后2次运行的实时程序,判断第一次的列表是否都还存在,不存在则发送邮件
1.环境配置
导入依赖,导入相关hadoop配置文件 yarn-site.xml,core-site.xml,hdfs-site.xml
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
<version>1.4.7</version>
</dependency>
实现代码
比对前后 2 次 RUNNING 的 job,将后一次中缺失的 job 标记为失败并发送邮件。发送邮件的 util 放在了 github,具体配置可以参考我之前的博客:https://blog.csdn.net/yumingzhu1/article/details/88576291
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import util.MailUtils;
import java.io.IOException;
import java.net.ConnectException;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Monitors Spark and Flink applications running on YARN.
 *
 * <p>Polls the YARN ResourceManager every two minutes, compares the currently
 * RUNNING applications against the previous poll, and sends an alert mail for
 * every application that has disappeared (treated as failed). This catches
 * jobs whose restart policy masks repeated failures from the cluster alerting.
 *
 * @author eamon.yu
 * @date 2021-07-13
 */
public class YarnMonitor {

    /**
     * Names of the Spark/Flink applications that were RUNNING during the
     * previous poll. Mutated only from {@link #yarnContainsSparkOrFlinkJob()};
     * the class is intended to be driven by the single-threaded {@link #main}.
     */
    static List<String> lastRunJob = new ArrayList<>();

    /**
     * Looks up the applicationId of the Flink job with the given name.
     *
     * <p>Only applications in RUNNING, ACCEPTED or SUBMITTED state are
     * considered, and only those whose application type is "Apache Flink".
     *
     * @param jobName YARN application name to search for
     * @return the applicationId as a string, or {@code null} when no matching
     *         Flink job is found or the ResourceManager call failed
     */
    public static String getAppId(String jobName) {
        YarnClient client = YarnClient.createYarnClient();
        client.init(new Configuration());
        client.start();
        try {
            // States in which a job is still considered "alive".
            EnumSet<YarnApplicationState> appStates = EnumSet.of(
                    YarnApplicationState.RUNNING,
                    YarnApplicationState.ACCEPTED,
                    YarnApplicationState.SUBMITTED);
            for (ApplicationReport appReport : client.getApplications(appStates)) {
                if (appReport.getName().equals(jobName)
                        && "Apache Flink".equals(appReport.getApplicationType())) {
                    return appReport.getApplicationId().toString();
                }
            }
        } catch (YarnException | IOException e) {
            // Best-effort lookup: log and fall through to the null return.
            e.printStackTrace();
        } finally {
            // Always release the client, even when getApplications throws.
            closeQuietly(client);
        }
        return null;
    }

    /**
     * Fetches the current state of the application with the given id.
     *
     * @param appId application id string, e.g. {@code application_1600000000000_0001}
     * @return the application's YARN state, or {@code null} if the lookup failed
     */
    public static YarnApplicationState getState(String appId) {
        YarnClient client = YarnClient.createYarnClient();
        client.init(new Configuration());
        client.start();
        YarnApplicationState yarnApplicationState = null;
        try {
            ApplicationId applicationId = ApplicationId.fromString(appId);
            yarnApplicationState =
                    client.getApplicationReport(applicationId).getYarnApplicationState();
        } catch (YarnException | IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(client);
        }
        return yarnApplicationState;
    }

    /** Closes the client, logging (but not propagating) any IOException. */
    private static void closeQuietly(YarnClient client) {
        try {
            client.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: polls YARN every 2 minutes and mails an alert whenever a
     * previously RUNNING job has disappeared.
     *
     * @param args unused
     * @throws Exception if the sleep is interrupted or mail sending fails
     */
    public static void main(String[] args) throws Exception {
        while (true) {
            String value = yarnContainsSparkOrFlinkJob();
            if (StringUtils.isNotBlank(value)) {
                String content = value + "\t 可能出现异常,尽快处理";
                System.out.println(content);
                MailUtils.send(content);
            }
            TimeUnit.SECONDS.sleep(120);
        }
    }

    /**
     * Returns whether an application named {@code appName} is currently
     * RUNNING on YARN. Prints diagnostic details for every match.
     *
     * @param appName YARN application name to look for
     * @return {@code true} if a RUNNING application with that name exists
     */
    public static boolean yarnIsContains(String appName) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(new YarnConfiguration());
        yarnClient.start();
        boolean isContains = false;
        try {
            List<ApplicationReport> applications =
                    yarnClient.getApplications(EnumSet.of(YarnApplicationState.RUNNING));
            System.out.println(applications.size());
            for (ApplicationReport application : applications) {
                if (application.getName().equals(appName)) {
                    System.out.println("ApplicationId ============> " + application.getApplicationId());
                    System.out.println("name ============> " + application.getName());
                    System.out.println("queue ============> " + application.getQueue());
                    System.out.println("user ============> " + application.getUser());
                    System.out.println(applications);
                    isContains = true;
                }
            }
        } catch (YarnException | IOException e) {
            e.printStackTrace();
        } finally {
            yarnClient.stop();
        }
        return isContains;
    }

    /**
     * Compares the currently RUNNING Spark/Flink jobs against the previous
     * poll and reports the ones that vanished.
     *
     * <p>Side effect: {@link #lastRunJob} is replaced with this poll's result.
     *
     * @return comma-separated names of jobs that were RUNNING last time but
     *         are missing now (presumed failed), or {@code null} when nothing
     *         is missing or the ResourceManager was unreachable
     */
    public static String yarnContainsSparkOrFlinkJob() {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(new YarnConfiguration());
        yarnClient.start();
        List<String> thisRunningJob = new ArrayList<>();
        try {
            List<ApplicationReport> applications = yarnClient.getApplications(
                    Sets.newHashSet("Apache Flink", "SPARK"),
                    EnumSet.of(YarnApplicationState.RUNNING));
            for (ApplicationReport application : applications) {
                if (application.getYarnApplicationState().equals(YarnApplicationState.RUNNING)) {
                    // Spark's Thrift JDBC/ODBC Server is a long-lived service,
                    // not one of the monitored jobs — skip it.
                    if (!application.getName().equals("Thrift JDBC/ODBC Server")) {
                        thisRunningJob.add(application.getName());
                    }
                }
            }
            // Any job present last time but missing now counts as failed.
            lastRunJob.removeAll(thisRunningJob);
            if (!lastRunJob.isEmpty()) {
                String value = String.join(",", lastRunJob);
                lastRunJob.clear();
                lastRunJob.addAll(thisRunningJob);
                return value;
            }
            lastRunJob.addAll(thisRunningJob);
        } catch (ConnectException e) {
            // ResourceManager temporarily unreachable; skip this round quietly
            // (deliberate best-effort — the next poll will retry).
        } catch (YarnException | IOException e) {
            e.printStackTrace();
        } finally {
            yarnClient.stop();
        }
        return null;
    }
}
// NOTE(review): everything below duplicates the `lastRunJob` field and the
// `yarnContainsSparkOrFlinkJob`/`main` methods already defined inside
// YarnMonitor above, and it sits OUTSIDE any class body, so it is not valid
// top-level Java. This looks like an accidental copy-paste in the article —
// confirm and remove the duplicate rather than compiling it.
static List<String> lastRunJob = new ArrayList<>();
/**
 * Compares the currently RUNNING Spark/Flink jobs against the previous poll.
 *
 * NOTE(review): the original `@param runningJob` tag referred to a parameter
 * that does not exist — this method takes no arguments.
 *
 * @return comma-separated names of jobs missing since the last poll, or null
 */
public static String yarnContainsSparkOrFlinkJob() {
Configuration conf = new YarnConfiguration();
YarnClient yarnClient = YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();
List<String> thisRunningJob = new ArrayList<>();
try {
List<ApplicationReport> applications = yarnClient.getApplications(Sets.newHashSet("Apache Flink", "SPARK"),
EnumSet.of(YarnApplicationState.RUNNING));
Map<String,String> map=new HashMap<>();
for (ApplicationReport application : applications) {
if (application.getYarnApplicationState().equals(YarnApplicationState.RUNNING)) {
// Spark's Thrift JDBC/ODBC Server is a long-lived service, not a monitored job — skip it.
if (!application.getName().equals("Thrift JDBC/ODBC Server")) {
thisRunningJob.add(application.getName());
map.put(application.getName(),application.getApplicationId().toString());
}
}
}
// Any job present in the previous poll but missing now counts as failed.
lastRunJob.removeAll(thisRunningJob);
if (lastRunJob.size() > 0) {
String value = String.join(",", lastRunJob);
lastRunJob.clear();
lastRunJob.addAll(thisRunningJob);
return value;
}
lastRunJob.addAll(thisRunningJob);
} catch (YarnException e) {
e.printStackTrace();
} catch (ConnectException e) {
// Deliberately ignored: ResourceManager temporarily unreachable; next poll retries.
} catch (IOException e) {
e.printStackTrace();
} finally {
yarnClient.stop();
}
return null;
}
/**
 * Entry point: polls YARN every 2 minutes and mails an alert when a
 * previously RUNNING job has disappeared.
 */
public static void main(String[] args) throws Exception {
while (true) {
String value = yarnContainsSparkOrFlinkJob();
if (StringUtils.isNotBlank(value)) {
String content = value + "\t 可能出现异常,尽快处理";
System.out.println(content);
MailUtils.send(content);
}
TimeUnit.SECONDS.sleep(120);
}
}
github 对应地址: https://github.com/yumingzhu/testYarn