Yarn任务监控--dingding告警

【目的】:每隔一小时监控一次 Yarn 上的任务;如果失败任务数有新增,就推送告警并 @所有人。

【方法】:通过 Yarn REST API 每隔一小时采集一次集群指标,并把采集结果保存下来、推送出去。

然后把当前失败任务数与上一小时的失败任务数做对比:如果当前值大于上一小时的值,就发出告警并 @所有人。

【效果图】

话不多说 直接上代码:

首先上yarn api采集并保存的代码: 

       url_hdfs_namenode = f"http://{ip}/api/v2/instance/product/Hadoop/service/yarn_resourcemanager"

        response_yarn_list = session.get(url=url_hdfs_namenode).json()['data']['list']

        try:

            for yarn in response_yarn_list:

                if yarn['status'] == 'running':

                    yarn_data = session.get(f"http://{yarn['ip']}:18088/ws/v1/cluster/metrics",

                                            verify=False).json()['clusterMetrics']

                    print('yarn', '   ', 'resource_manager', '   ', '提交的任务数目', '   ', yarn['ip'], '   ',

                          yarn_data['appsSubmitted'])

                    print('yarn', '   ', 'resource_manager', '   ', '完成的任务数目', '   ', yarn['ip'], '   ',

                          yarn_data['appsCompleted'])

                    print('yarn', '   ', 'resource_manager', '   ', '挂起的任务数目', '   ', yarn['ip'], '   ',

                          yarn_data['appsPending'])

                    print('yarn', '   ', 'resource_manager', '   ', '运行的任务数目', '   ', yarn['ip'], '   ',

                          yarn_data['appsRunning'])

                    print('yarn', '   ', 'resource_manager', '   ', '失败的任务数目', '   ', yarn['ip'], '   ',

                          yarn_data['appsFailed'])

                    print('yarn', '   ', 'resource_manager', '   ', '集群总内存大小(GB)', '   ', yarn['ip'], '   ',

                          yarn_data['totalMB'] / 1024)

                    print('yarn', '   ', 'resource_manager', '   ', '集群已用内存大小(GB)', '   ', yarn['ip'], '   ',

                          yarn_data['allocatedMB'] / 1024)

                    print('yarn', '   ', 'resource_manager', '   ', '集群可用内存大小(GB)', '   ', yarn['ip'], '   ',

                          yarn_data['availableMB'] / 1024)

                    print('yarn', '   ', 'resource_manager', '   ', '集群总虚拟核数', '   ', yarn['ip'], '   ',

                          yarn_data['totalVirtualCores'])

                    print('yarn', '   ', 'resource_manager', '   ', '集群已分配虚拟核数', '   ', yarn['ip'], '   ',

                          yarn_data['allocatedVirtualCores'])

                    print('yarn', '   ', 'resource_manager', '   ', '集群可用虚拟核数', '   ', yarn['ip'], '   ',

                          yarn_data['availableVirtualCores'])





                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '提交的任务数目', yarn['ip'], yarn_data['appsSubmitted']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '完成的任务数目', yarn['ip'], yarn_data['appsCompleted']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '挂起的任务数目', yarn['ip'], yarn_data['appsPending']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '运行的任务数目', yarn['ip'], yarn_data['appsRunning']])

                    csv.writer(f).writerow(['yarn', 'resource_manager', '失败的任务数目', yarn['ip'], yarn_data['appsFailed']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群总内存大小(GB)', yarn['ip'], yarn_data['totalMB'] / 1024])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群已用内存大小(GB)', yarn['ip'], yarn_data['allocatedMB'] / 1024])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群可用内存大小(GB)', yarn['ip'], yarn_data['availableMB'] / 1024])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群总虚拟核数', yarn['ip'], yarn_data['totalVirtualCores']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群已分配虚拟核数', yarn['ip'], yarn_data['allocatedVirtualCores']])

                    csv.writer(f).writerow(

                        ['yarn', 'resource_manager', '集群可用虚拟核数', yarn['ip'], yarn_data['availableVirtualCores']])





                break









        except Exception as e:

            print(e)

接下来对比当前小时与上一小时的失败任务数:

def _read_failed_count(path):
    """Return the failed-task count stored in the file at *path*, or None.

    The count is the last comma-separated field of the first line that
    contains the ``失败的任务数目`` label. None is returned when no line
    carries that label.
    """
    with open(path, 'r') as f:
        for line in f:
            if '失败的任务数目' in line:
                return int(line.split(',')[-1])
    return None


def filed_task_add():
    """Compare the current hour's failed-task count with the previous hour's.

    Scans ``./yarn/`` for files whose names contain the tag returned by
    ``get_file_time_H()`` (current) or ``get_prev_hour_time_H()`` (previous)
    and reads the failed-task count from each of them.

    Returns:
        ``(True, current, previous)`` when the current count is greater than
        the previous one, otherwise ``(False, current)``. ``current`` is
        ``None`` when no current count could be read.
    """
    time_H = get_file_time_H()
    prev_hour = get_prev_hour_time_H()

    # Start from None so that a missing file or a missing metric line is
    # detected explicitly instead of surfacing as an UnboundLocalError.
    failed_tasks_current = None
    failed_tasks_prev = None

    for filename in os.listdir('./yarn/'):
        if time_H in filename:
            count = _read_failed_count(os.path.join('./yarn/', filename))
            if count is not None:
                failed_tasks_current = count
                print(f"当前失败的任务数:{failed_tasks_current}")
        elif prev_hour in filename:
            count = _read_failed_count(os.path.join('./yarn/', filename))
            if count is not None:
                failed_tasks_prev = count
                print(f"上一次失败任务数: {failed_tasks_prev}")

    # Without both values there is nothing to compare; report "no increase".
    if failed_tasks_current is None or failed_tasks_prev is None:
        print("缺少当前或上一小时的失败任务数, 无法比较.")
        return False, failed_tasks_current

    if failed_tasks_current > failed_tasks_prev:
        print(f"任务失败数增加: \n 上一次失败任务数:{failed_tasks_prev} \n当前任务失败数: {failed_tasks_current}")
        return True, failed_tasks_current, failed_tasks_prev

    print("任务失败并没有增加.")
    return False, failed_tasks_current

dingding发送代码就不上啦 大家都很熟悉啦

完成~

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值