# Monitor a process's CPU and memory usage and restart the process when needed

#!/usr/bin/python3
# encoding: utf-8
#filename: process-check-self-healing.py
#author: gaohaixiang
#writetime:202206141535

import re
import time
import subprocess
import os


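"""
# Usage notes:
This script bundles several checks and self-healing actions; adapt it to your actual needs.

Restart when CPU or memory usage is repeatedly measured above a threshold,
or restart the service when it is detected as not active; customize as required.

If one service has several processes, the usage of all of them is added up and the
total is treated as the usage of the service.

The script suits one service with several processes, or one service with one process.
It is not suitable for several services with several processes, because all of those
services would be killed and restarted together.

Process vs. service in this script:
A process has to be killed and started with explicit commands.
A service can be stopped and started with systemctl, e.g. systemctl restart httpd

About the top invocation:
"top -c -b -d 2 -n 3 -w 512"
# Takes 3 samples, 2 seconds apart; -c shows the full COMMAND, -b runs top in batch mode (non-interactive, all processes are listed), -w sets the output width (512 is the maximum).
The summed mem and cpu values are divided by 3 to obtain the average over the 3 samples.

"""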

def TOP_monitor(topCMD):
    """Run the given ``top`` command and return its standard output as text.

    Args:
        topCMD: Command passed to ``subprocess.run`` without a shell, normally
            an argument list such as
            ``["top", "-c", "-b", "-n", "3", "-d", "2", "-w", "512"]``.

    Returns:
        str: The command's stdout decoded as UTF-8. Bytes that are not valid
        UTF-8 are replaced with U+FFFD instead of raising.

    Side effects:
        Prints the captured output and the command to stdout (debug trace).
    """
    completed = subprocess.run(topCMD, stdout=subprocess.PIPE)

    # Decode as UTF-8, not 'unicode-escape': the latter rewrites backslash
    # sequences that appear in command lines (and raises on malformed ones)
    # and decodes every non-ASCII byte as Latin-1.
    topCMDresult = completed.stdout.decode('utf-8', errors='replace')

    print("topCMDresult",topCMDresult)
    print(topCMD)
    return topCMDresult

def TOP_MEM_monitor(topCMDresult,PRO_NAME):
    """Sum the %MEM column of every ``top`` output line that matches a process.

    Args:
        topCMDresult: Text output of ``top`` (may contain several iterations;
            every matching line of every iteration is added).
        PRO_NAME: Regular-expression pattern searched anywhere in each line.

    Returns:
        float: Sum of the 10th whitespace-separated field (index 9) over all
        matching lines, or 0.0 when nothing matches. Matching lines that are
        not complete process rows (fewer than 10 fields, or a non-numeric
        value in that field) are skipped instead of raising.
    """
    # Expected row layout:
    # PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
    mem_column = 9

    mem_total = 0.0
    for top_line in topCMDresult.split("\n"):
        if not re.search(PRO_NAME, top_line):
            continue
        fields = top_line.split()
        try:
            mem_total += float(fields[mem_column])
        except (IndexError, ValueError):
            # The pattern matched a line that is not a process row
            # (e.g. a header or a truncated line); ignore it.
            continue
    return mem_total

def TOP_CPU_monitor(topCMDresult,PRO_NAME):
    """Sum the %CPU column of every ``top`` output line that matches a process.

    Args:
        topCMDresult: Text output of ``top`` (may contain several iterations;
            every matching line of every iteration is added).
        PRO_NAME: Regular-expression pattern searched anywhere in each line.

    Returns:
        float: Sum of the 9th whitespace-separated field (index 8) over all
        matching lines, or 0.0 when nothing matches. Matching lines that are
        not complete process rows (fewer than 9 fields, or a non-numeric
        value in that field) are skipped instead of raising.

    Side effects:
        Prints the split fields of every matching line (debug trace).
    """
    # Expected row layout:
    # PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
    cpu_column = 8

    cpu_total = 0.0
    for top_line in topCMDresult.split("\n"):
        if not re.search(PRO_NAME, top_line):
            continue
        fields = top_line.split()
        print(fields)
        try:
            cpu_total += float(fields[cpu_column])
        except (IndexError, ValueError):
            # The pattern matched a line that is not a process row
            # (e.g. a header or a truncated line); ignore it.
            continue
    return cpu_total

def PRO_NUM_check(PRO_NAME):
    """Count the ``ps -ef`` lines that match ``PRO_NAME``.

    Every line of ``ps -ef`` matching the pattern is counted, except lines
    containing the word "grep"; this includes any unrelated process whose
    command line happens to contain the pattern.

    Args:
        PRO_NAME: grep pattern identifying the process.

    Returns:
        int: Number of matching lines.

    Raises:
        ValueError: If the shell output is not a plain integer, e.g. when an
            error message from the pipeline is mixed into the output.
    """
    import shlex

    # Quote the pattern so spaces or shell metacharacters in the name cannot
    # split the argument or inject extra shell commands.
    PRO_NUM_CMD = "ps -ef |grep %s |grep -v grep | wc -l" % shlex.quote(PRO_NAME)
    PRO_NUM = subprocess.getoutput(PRO_NUM_CMD)
    return int(PRO_NUM)

def PRO_system_check(PRO_NAME):
    """Return the state word from the ``Active:`` line of ``systemctl status``.

    Args:
        PRO_NAME: Name of the systemd unit to query.

    Returns:
        str: Second field of the ``Active:`` line (e.g. ``active``). Any
        error text the shell pipeline emits is part of the returned string
        as well, so compare against the exact expected state.
    """
    import shlex

    # Quote the unit name so spaces or shell metacharacters cannot split the
    # argument or inject extra shell commands.
    PRO_system_check_cmd = "systemctl status %s |grep 'Active:'|awk '{print $2}'" % shlex.quote(PRO_NAME)
    checkCMDoutput = subprocess.getoutput(PRO_system_check_cmd)
    return checkCMDoutput

def PRO_CMD_start(PRO_CMD_start_cmd):
    """Launch a process by running a shell command, without waiting for it.

    Args:
        PRO_CMD_start_cmd: Shell command line that starts the process
            (typically ending in ``&`` with its own output redirection).

    Returns:
        None. The command's exit status is not collected.
    """
    # os.popen() attached the command's stdout to a pipe whose reading end was
    # discarded immediately, so a command that wrote to stdout without its own
    # redirection could fail on a closed pipe. Send stdout to /dev/null instead;
    # nobody could read that pipe anyway.
    subprocess.Popen(PRO_CMD_start_cmd, shell=True, stdout=subprocess.DEVNULL)

def PRO_system_restart(PRO_NAME):
    """Restart a systemd unit with ``systemctl restart``.

    Args:
        PRO_NAME: Name of the systemd unit to restart.

    Returns:
        tuple: ``(exit_status, output)`` exactly as produced by
        ``subprocess.getstatusoutput`` -- the integer exit status first, the
        captured text second. (The local names used to be swapped; the order
        of the returned tuple itself is unchanged.)
    """
    import shlex

    # Quote the unit name so spaces or shell metacharacters cannot split the
    # argument or inject extra shell commands.
    PRO_system_restart_cmd = "systemctl restart %s " % shlex.quote(PRO_NAME)
    restartCMDstatus, restartCMDoutput = subprocess.getstatusoutput(PRO_system_restart_cmd)
    return restartCMDstatus, restartCMDoutput

def PRO_CMD_kill(PRO_NAME):
    """Send SIGKILL to every process whose ``ps -ef`` line matches ``PRO_NAME``.

    Every matching line (except lines containing the word "grep") is selected,
    so any process whose command line contains the pattern is killed.

    Args:
        PRO_NAME: grep pattern identifying the process(es) to kill.

    Returns:
        tuple: ``(exit_status, output)`` exactly as produced by
        ``subprocess.getstatusoutput`` -- the integer exit status first, the
        captured text second. (The local names used to be swapped; the order
        of the returned tuple itself is unchanged.)
    """
    import shlex

    # shlex.quote: keep spaces/metacharacters in the name from splitting the
    # argument or injecting shell commands.
    # xargs -r: do not run `kill -9` with no PIDs when nothing matches.
    PRO_CMD_kill_cmd = "ps -ef |grep %s|grep -v grep|awk '{print $2}'|xargs -r kill -9" % shlex.quote(PRO_NAME)
    CMDstatus, CMDoutput = subprocess.getstatusoutput(PRO_CMD_kill_cmd)
    return CMDstatus, CMDoutput

def GET_CPU_NUM_total():
    """Count the lines of /proc/cpuinfo that contain the word "processor".

    Returns:
        str: Raw output of the shell pipeline (a decimal count when the
        pipeline succeeds); callers convert it with ``int()`` themselves.
    """
    return subprocess.getoutput("cat /proc/cpuinfo |grep processor|wc -l")

def GET_MEM_NUM_total():
    """Read the value of the ``MemTotal:`` line from /proc/meminfo.

    Returns:
        str: Raw output of the shell pipeline, i.e. the second field of the
        ``MemTotal:`` line when the pipeline succeeds (presumably in kB, as
        /proc/meminfo normally reports -- confirm before doing arithmetic).
    """
    return subprocess.getoutput(
        "cat /proc/meminfo |grep 'MemTotal:'|awk '{print $2}'"
    )

if __name__ == '__main__':
    # Entry point: make sure the monitored process is running, then restart
    # it when its averaged CPU usage exceeds the configured share of the
    # machine. Further optional checks are kept below as disabled templates.
    starttime = time.time()
    print ("Process is running...")

    # NOTE: the former `subprocess.getoutput("source /etc/profile")` call was
    # removed. It ran in a throw-away child shell, so it could not change the
    # environment of this script; export required variables before starting
    # the script instead.

    # ---- Configuration ----
    # Name (grep / regex pattern) of the monitored process
    PRO_NAME = "prometheus_new"
    # Number of top samples; the summed usage is divided by this to average
    top_samples = 3
    # top command: -c full COMMAND, -b batch mode, -n samples,
    # -d seconds between samples, -w output width
    topCMD = ["top", "-c", "-b", "-n", str(top_samples), "-d", "2","-w","512"]
    # Expected number of processes
    pro_num = 1
    # Command that starts the process
    PRO_CMD_start_cmd = "cd /data/prometheus && nohup ./prometheus_new --config.file=prometheus_new.yml --storage.tsdb.path=/data/prometheus/data --storage.tsdb.retention=7d --web.listen-address=0.0.0.0:9090 --storage.tsdb.min-block-duration=2h --storage.tsdb.max-block-duration=2h --log.level=info --web.enable-lifecycle --web.enable-admin-api >> prometheus.log 2>&1 &"
    # Maximum share (0-1) of the machine's total CPU the process may use
    Limit_cpu_used_total = 0.9
    # Maximum share (0-1) of the machine's total memory the process may use
    Limit_mem_used_total = 0.9
    # Seconds to wait after a start attempt before counting processes again,
    # so a freshly launched background process is not started a second time
    restart_wait_seconds = 2

    # `with` guarantees the log file is closed even if a check raises.
    with open("/data/processlog/process-check-self-healing.log","a+") as ff:

        # ---- Optional check (disabled): systemd service state ----
        # Restart the service until it reports "active".
        # while PRO_system_check(PRO_NAME) != 'active':
        #     PRO_system_restart(PRO_NAME)
        #     time.sleep(restart_wait_seconds)

        # ---- Check: process count ----
        # Count first, so an already running process is not started again;
        # then keep starting until the expected number of processes exists.
        pro_nums = PRO_NUM_check(PRO_NAME)
        ff.write(time.ctime() + "__________" + "prometheus num is __________ %s \n" % pro_nums)
        while pro_nums < pro_num:
            PRO_CMD_start(PRO_CMD_start_cmd)
            time.sleep(restart_wait_seconds)
            pro_nums = PRO_NUM_check(PRO_NAME)
            ff.write(time.ctime() + "__________" + "prometheus num is __________ %s \n" % pro_nums)

        # ---- Check: CPU usage above the limit -> kill and restart ----
        topCMDresult = TOP_monitor(topCMD)
        # Average %CPU of all matching processes over the samples
        PRO_CPU_imformations = TOP_CPU_monitor(topCMDresult,PRO_NAME)/top_samples
        ff.write(time.ctime() +  "__________" + "prometheus cpu used __________ %s \n" % PRO_CPU_imformations)
        # Number of CPUs; each one contributes 100 (%) to the total capacity
        cpu_num_output = int(GET_CPU_NUM_total())
        if PRO_CPU_imformations/(cpu_num_output * 100) > Limit_cpu_used_total:
            PRO_CMD_kill(PRO_NAME)
            ff.write(time.ctime() + "__________" + "prometheus is killed __________  \n")
            # The process was just killed: start it at least once, then
            # retry until the expected number of processes is back.
            pro_nums = 0
            while pro_nums < pro_num:
                PRO_CMD_start(PRO_CMD_start_cmd)
                time.sleep(restart_wait_seconds)
                pro_nums = PRO_NUM_check(PRO_NAME)

        # ---- Optional variant (disabled): CPU above the limit -> restart service ----
        # topCMDresult = TOP_monitor(topCMD)
        # PRO_CPU_imformations = TOP_CPU_monitor(topCMDresult,PRO_NAME)/top_samples
        # cpu_num_output = int(GET_CPU_NUM_total())
        # if PRO_CPU_imformations/(cpu_num_output * 100) > Limit_cpu_used_total:
        #     PRO_system_restart(PRO_NAME)
        #     while PRO_system_check(PRO_NAME) != 'active':
        #         time.sleep(restart_wait_seconds)
        #         PRO_system_restart(PRO_NAME)

        # ---- Optional check (disabled): memory above the limit -> kill and restart ----
        # top reports %MEM as a percentage, so scale it to a 0-1 share
        # before comparing it with Limit_mem_used_total.
        # topCMDresult = TOP_monitor(topCMD)
        # PRO_MEM_imformations = TOP_MEM_monitor(topCMDresult,PRO_NAME)/top_samples
        # if PRO_MEM_imformations/100 > Limit_mem_used_total:
        #     PRO_CMD_kill(PRO_NAME)
        #     pro_nums = 0
        #     while pro_nums < pro_num:
        #         PRO_CMD_start(PRO_CMD_start_cmd)
        #         time.sleep(restart_wait_seconds)
        #         pro_nums = PRO_NUM_check(PRO_NAME)

        # ---- Optional variant (disabled): memory above the limit -> restart service ----
        # topCMDresult = TOP_monitor(topCMD)
        # PRO_MEM_imformations = TOP_MEM_monitor(topCMDresult,PRO_NAME)/top_samples
        # if PRO_MEM_imformations/100 > Limit_mem_used_total:
        #     PRO_system_restart(PRO_NAME)
        #     while PRO_system_check(PRO_NAME) != 'active':
        #         time.sleep(restart_wait_seconds)
        #         PRO_system_restart(PRO_NAME)

    endtime = time.time()
    print (endtime-starttime)







# (Web-page residue removed here: like/favorite/comment counters and payment-dialog text that were not part of the script.)