#!/usr/bin/python3
# encoding: utf-8
#filename: process-check-self-healing.py
#author: gaohaixiang
#writetime:202206141535
import re
import time
import subprocess
import os
"""
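# Usage notes:
This script bundles several checks and self-healing actions; adapt it to your actual needs.
Restart when CPU or memory usage is observed above a threshold on several consecutive samples,
or restart the service when it is not "active" -- customize as required.
If one service consists of several processes, their usage figures are added up and the sum is treated as the usage of the service.
The script suits one service with several processes, or one service with one process.
It is NOT suitable for several services with several processes: all matching services would be killed and restarted.
Process vs. service, as used in this script:
A process has to be killed and started with explicit commands.
A service can be stopped and started with systemctl, e.g. systemctl restart httpd
How top is sampled:
"top -c -b -d 2 -n 3 -w 512"
# Takes 3 samples, 2 seconds apart; -c shows the full COMMAND line, -b runs top in batch (non-interactive) mode, -w sets the output width (512 is the maximum)
The summed mem and cpu values are divided by 3 to obtain the average over the 3 samples.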
"""
# top命令执行
def TOP_monitor(topCMD):
    """Run the given ``top`` command and return its standard output as text.

    Args:
        topCMD: The command to execute, handed to ``subprocess.Popen``
            without a shell (normally an argument list such as
            ``["top", "-c", "-b", "-n", "3"]``).

    Returns:
        str: Everything the command wrote to standard output.
    """
    top_info = subprocess.Popen(topCMD, stdout=subprocess.PIPE)
    out, _ = top_info.communicate()
    # Decode as UTF-8 with replacement instead of 'unicode-escape': the
    # latter interprets backslashes that appear in command lines (and raises
    # UnicodeDecodeError on sequences such as "\x" followed by non-hex
    # characters) and mis-decodes multi-byte UTF-8 text.
    topCMDresult = out.decode("utf-8", errors="replace")
    print("topCMDresult", topCMDresult)
    print(topCMD)
    return topCMDresult
# 从top命令结果获取内存使用率
def TOP_MEM_monitor(topCMDresult, PRO_NAME):
    """Sum the memory column of every ``top`` row that matches a process name.

    Each line of ``topCMDresult`` that matches ``PRO_NAME`` is split on
    whitespace and its 10th field (index 9) is added to the total. With the
    column order noted by the original author (PID, user, priority, nice,
    virtual, resident, shared, state, cpu, mem, time, command) that field is
    the memory percentage.

    Args:
        topCMDresult: Text output of ``top`` (may contain several samples).
        PRO_NAME: Regular expression searched for in each line.

    Returns:
        float: Sum of the memory column over all matching rows; ``0.0`` when
        nothing matches. Callers divide by the number of samples themselves.
    """
    name_pattern = re.compile(PRO_NAME)
    mem_total = 0.0
    for top_line in topCMDresult.split("\n"):
        if not name_pattern.search(top_line):
            continue
        fields = top_line.split()
        try:
            mem_total += float(fields[9])
        except (IndexError, ValueError):
            # A matching line that is not a regular process row (too few
            # columns or a non-numeric value) is ignored instead of
            # aborting the whole check.
            continue
    return mem_total
# 从top命令结果获取CPU使用率
def TOP_CPU_monitor(topCMDresult, PRO_NAME):
    """Sum the CPU column of every ``top`` row that matches a process name.

    Each line of ``topCMDresult`` that matches ``PRO_NAME`` is split on
    whitespace and its 9th field (index 8) is added to the total. With the
    column order noted by the original author (PID, user, priority, nice,
    virtual, resident, shared, state, cpu, mem, time, command) that field is
    the CPU percentage.

    Args:
        topCMDresult: Text output of ``top`` (may contain several samples).
        PRO_NAME: Regular expression searched for in each line.

    Returns:
        float: Sum of the CPU column over all matching rows; ``0.0`` when
        nothing matches. Callers divide by the number of samples themselves.
    """
    name_pattern = re.compile(PRO_NAME)
    cpu_total = 0.0
    for top_line in topCMDresult.split("\n"):
        if not name_pattern.search(top_line):
            continue
        fields = top_line.split()
        print(fields)
        try:
            cpu_total += float(fields[8])
        except (IndexError, ValueError):
            # A matching line that is not a regular process row (too few
            # columns or a non-numeric value) is ignored instead of
            # aborting the whole check.
            continue
    return cpu_total
# 当前进程数量获取
def PRO_NUM_check(PRO_NAME):
    """Count the processes whose ``ps -ef`` line contains ``PRO_NAME``.

    The count comes from a shell pipeline (``ps | grep | grep -v grep |
    wc -l``), so ``PRO_NAME`` is interpreted by the shell and by ``grep``.

    Args:
        PRO_NAME: Text to look for in the process list.

    Returns:
        int: Number of matching lines reported by ``wc -l``.

    Raises:
        ValueError: If the pipeline output is not a plain integer.
    """
    count_text = subprocess.getoutput(
        "ps -ef |grep %s |grep -v grep | wc -l" % PRO_NAME
    )
    return int(count_text)
# 当前服务状态获取
def PRO_system_check(PRO_NAME):
    """Return the state shown on the ``Active:`` line of ``systemctl status``.

    Args:
        PRO_NAME: Name of the systemd unit to query.

    Returns:
        str: Second whitespace-separated field of the ``Active:`` line
        (for example ``active``), or whatever else the pipeline printed.
    """
    status_pipeline = (
        "systemctl status %s |grep 'Active:'|awk '{print $2}'" % PRO_NAME
    )
    return subprocess.getoutput(status_pipeline)
# 使用命令启动进程
def PRO_CMD_start(PRO_CMD_start_cmd):
    """Launch a process by running a shell command line.

    The command is run through ``os.popen`` and the handle is closed right
    away, which waits for the launching shell to exit. This avoids leaking
    the pipe and the child handle, and guarantees that the shell has already
    spawned the target by the time the caller counts processes.

    Long-running programs must therefore be put in the background by the
    command itself (``nohup ... &``), otherwise this call blocks until the
    program ends.

    Args:
        PRO_CMD_start_cmd: Shell command line that starts the process.

    Returns:
        None
    """
    with os.popen(PRO_CMD_start_cmd):
        # Nothing is read from the pipe; leaving the block closes it and
        # reaps the shell.
        pass
# 重启服务
def PRO_system_restart(PRO_NAME):
    """Restart a systemd unit with ``systemctl restart``.

    Args:
        PRO_NAME: Name of the systemd unit to restart.

    Returns:
        tuple: ``(exit_status, output)`` exactly as returned by
        ``subprocess.getstatusoutput`` -- the exit status comes first.
    """
    restart_command = "systemctl restart %s " % PRO_NAME
    exit_status, output = subprocess.getstatusoutput(restart_command)
    return exit_status, output
# 杀死进程
def PRO_CMD_kill(PRO_NAME):
    """Send SIGKILL to every process whose ``ps -ef`` line contains ``PRO_NAME``.

    The PIDs are collected with ``ps | grep | grep -v grep | awk`` and handed
    to ``xargs kill -9``, so any process with the name anywhere in its
    command line is killed.

    Args:
        PRO_NAME: Text to look for in the process list.

    Returns:
        tuple: ``(exit_status, output)`` exactly as returned by
        ``subprocess.getstatusoutput`` -- the exit status comes first.
    """
    kill_pipeline = (
        "ps -ef |grep %s|grep -v grep|awk '{print $2}'|xargs kill -9" % PRO_NAME
    )
    exit_status, output = subprocess.getstatusoutput(kill_pipeline)
    return exit_status, output
# 系统总CPU数量获取
def GET_CPU_NUM_total():
    """Return the number of logical CPUs as a decimal string.

    ``/proc/cpuinfo`` is read directly and only lines whose key is exactly
    ``processor`` are counted, so other lines that merely contain the word
    (for example in a model name) are not counted. If the file is missing
    or has no such lines, ``os.cpu_count()`` is used instead, so the result
    is always a positive integer in string form.

    Returns:
        str: CPU count, e.g. ``"8"`` (a string, as callers convert it with
        ``int()``).
    """
    count = 0
    try:
        with open("/proc/cpuinfo", errors="replace") as cpuinfo:
            for line in cpuinfo:
                # Lines look like "processor : 0"; compare the key only.
                if line.partition(":")[0].strip() == "processor":
                    count += 1
    except OSError:
        count = 0
    if count < 1:
        count = os.cpu_count() or 1
    return str(count)
# 系统总内存获取
def GET_MEM_NUM_total():
    """Return the ``MemTotal`` value from ``/proc/meminfo`` as a string.

    Returns:
        str: Second field of the ``MemTotal:`` line as printed by the shell
        pipeline (presumably kB, per proc(5) -- confirm before doing
        arithmetic on it).
    """
    return subprocess.getoutput(
        "cat /proc/meminfo |grep 'MemTotal:'|awk '{print $2}'"
    )
def _start_until_running(start_cmd, pro_name, wanted, force_start=False,
                         pause_seconds=1):
    """Start the process until at least ``wanted`` instances are running.

    Args:
        start_cmd: Shell command line passed to ``PRO_CMD_start``.
        pro_name: Name passed to ``PRO_NUM_check`` to count instances.
        wanted: Minimum number of instances required.
        force_start: When true, issue one start before the first count
            (used right after a kill, when the process list may still show
            the dying instance).
        pause_seconds: Delay between a start attempt and the next count.

    Returns:
        int: The last observed number of running instances.
    """
    # Count first so that an already-running process is not started twice.
    running = 0 if force_start else PRO_NUM_check(pro_name)
    while running < wanted:
        PRO_CMD_start(start_cmd)
        # Pause so the loop does not spin and the new process can show up.
        time.sleep(pause_seconds)
        running = PRO_NUM_check(pro_name)
    return running


if __name__ == '__main__':
    starttime = time.time()
    print ("Process is running...")
    # NOTE: sourcing /etc/profile in a child shell cannot change this
    # process's environment, so it is not attempted here.

    # Name of the monitored process.
    PRO_NAME = "prometheus_new"
    # top command: 3 samples, 2 seconds apart; -c full command line,
    # -b batch mode, -w output width (512 is the maximum).
    topCMD = ["top", "-c", "-b", "-n", "3", "-d", "2","-w","512"]
    # Number of process instances expected to be running.
    pro_num = 1
    # Command line used to start the process.
    PRO_CMD_start_cmd = "cd /data/prometheus && nohup ./prometheus_new --config.file=prometheus_new.yml --storage.tsdb.path=/data/prometheus/data --storage.tsdb.retention=7d --web.listen-address=0.0.0.0:9090 --storage.tsdb.min-block-duration=2h --storage.tsdb.max-block-duration=2h --log.level=info --web.enable-lifecycle --web.enable-admin-api >> prometheus.log 2>&1 &"
    # Fraction of total system CPU the process may use before a restart.
    Limit_cpu_used_total = 0.9
    # Fraction of total system memory the process may use (used only by the
    # optional memory variants sketched at the end of this block).
    Limit_mem_used_total = 0.9

    log_path = "/data/processlog/process-check-self-healing.log"
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # The context manager closes the log even if a check raises.
    with open(log_path, "a+") as ff:
        # Process-count check: start the process until enough instances run.
        pro_nums = _start_until_running(PRO_CMD_start_cmd, PRO_NAME, pro_num)
        ff.write(time.ctime() + "__________" + "prometheus num is __________ %s \n" % pro_nums)

        # CPU check: restart the process when it uses more than the limit.
        topCMDresult = TOP_monitor(topCMD)
        # The top output holds 3 samples, so divide the sum by 3.
        PRO_CPU_imformations = TOP_CPU_monitor(topCMDresult,PRO_NAME)/3
        ff.write(time.ctime() + "__________" + "prometheus cpu used __________ %s \n" % PRO_CPU_imformations)
        # Total CPU count; guard against unusable values so the division
        # below cannot fail.
        try:
            cpu_num_output = int(GET_CPU_NUM_total())
        except ValueError:
            cpu_num_output = 0
        if cpu_num_output < 1:
            cpu_num_output = os.cpu_count() or 1
        # top reports CPU per core in percent, so full capacity is
        # cpu count * 100.
        if PRO_CPU_imformations/(cpu_num_output * 100) > Limit_cpu_used_total:
            PRO_CMD_kill(PRO_NAME)
            ff.write(time.ctime() + "__________" + "prometheus is killed __________ \n")
            _start_until_running(PRO_CMD_start_cmd, PRO_NAME, pro_num,
                                 force_start=True)

        # --------------------------------------------------------------
        # Optional variants (disabled). Enable and adapt as needed.
        #
        # systemd service instead of a bare process -- restart the unit
        # until it reports "active":
        #     while PRO_system_check(PRO_NAME) != 'active':
        #         PRO_system_restart(PRO_NAME)
        #
        # CPU limit exceeded -> restart the service instead of killing:
        #     if PRO_CPU_imformations/(cpu_num_output * 100) > Limit_cpu_used_total:
        #         PRO_system_restart(PRO_NAME)
        #         while PRO_system_check(PRO_NAME) != 'active':
        #             PRO_system_restart(PRO_NAME)
        #
        # Memory limit exceeded (top reports memory as a percentage, so
        # scale it to a fraction before comparing with the limit):
        #     PRO_MEM_imformations = TOP_MEM_monitor(topCMDresult,PRO_NAME)/3
        #     if PRO_MEM_imformations/100 > Limit_mem_used_total:
        #         # process variant
        #         PRO_CMD_kill(PRO_NAME)
        #         _start_until_running(PRO_CMD_start_cmd, PRO_NAME, pro_num,
        #                              force_start=True)
        #         # service variant
        #         # PRO_system_restart(PRO_NAME)
        # --------------------------------------------------------------

    endtime = time.time()
    print (endtime-starttime)
# Monitor a process's CPU and memory usage and restart the process when needed.
# First published 2022-06-15 15:59:28