监控Java cpu、内存自动抓取 jstack 日志脚本-改进版

原创已于 2023-06-13 13:45:25 修改 · 945 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#java #linux #centos

于 2020-10-09 23:35:30 首次发布

运维专栏收录该内容

8 篇文章

订阅专栏

本文介绍了一款改进版的JVM性能监控工具，通过Python脚本实现对Linux系统下Java应用的CPU和内存使用情况进行实时监控，并在达到预设阈值时自动抓取jstack日志及发送短信预警。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

监控Java cpu、内存自动抓取 jstack 日志脚本-改进版

概述
环境

概述

linux 系统应用性能监控
之前写过一个单文件版的 linux 监控工具《监控Java cpu、内存自动抓取 jstack 日志脚本-升级版》，但用起来不方便：配置都要在源码里修改，不利于部署。
本次重新组织了项目结构，把配置独立成配置文件，并新增短信预警通知。

环境

python 2.7 ; centos 7

总体文件

file

配置文件

在这里插入图片描述

核心功能

源码详细注释
片段1

        # Build the shell pipelines that read the CPU and memory percentages of one process.
        # top -b -p pid -n1 : batch-mode top output for process pid, refreshed only once.
        # sed -n '8p'       : keep only line 8 of that output.
        # awk '{printf $9}' : print column 9 of that line (CPU); column 10 is used for memory.

        cup_comm = 'top -b -p  ' + pid + '  -n1|sed -n \'8p\'|awk \'{printf $9}\''
        # Memory: same pipeline, but column 10
        mem_comm = 'top -b -p  ' + pid + '  -n1|sed -n \'8p\'|awk \'{printf $10}\''
        # NOTE(review): both strings always contain literal text, so this check can never be true
        if(cup_comm=='' or mem_comm==''):
            return
        # Run the pipeline and keep the CPU percentage as a string
        cpu_perc_str = cm.getoutput(cup_comm)
        # Run the pipeline and keep the memory percentage as a string
        mem_perc_str = cm.getoutput(mem_comm)

片段2

    ps_comm = 'ps aux --sort=-rss|head -10'  # first 10 lines of ps output, sorted by resident memory (descending)
    os_str = os.popen(ps_comm).read()  # run the shell command and read its whole output
    arr1 = os_str.split('\n')
    # Write the current timestamp to ps_result.txt ahead of this round's entries
    write_time = 'echo ' + time_str_format + ' >> ' + log_dir + 'ps_result.txt'
    os.system(write_time)
    # Skip the first row (index 0) and the empty string left after the final newline
    for i in arr1[1:len(arr1)-1]:
        # 1. Collapse every run of spaces into a single comma
        str1 = re.sub(' +', ',', i)
        # 2. Split the row on those commas
        str2 = str1.split(',')
        # Field 3 is used as the memory percentage of the process
        mem_percent = str2[3]
        print '内存的百分比: %s' % mem_percent
        if(float(mem_percent) > 10.0):   # alert threshold in percent; adjust to your needs

            write_ps = 'echo ' + str1 + ' >> ' + log_dir + 'ps_result.txt'
            senMsg(i)  # send the SMS alert
            os.system(write_ps)

代码

ConfigInfo.py

# -*- coding: utf-8 -*-

class ConfigInfo():
    """Settings read from ``monitor.properties`` in the current working directory.

    Each ``key=value`` line whose key is listed in ``_KEYS`` is stored on the
    instance under the mapped attribute name. Blank lines, lines starting
    with ``#``, lines without ``=`` and unknown keys are ignored.

    Attributes:
        ports, pids, phones: lists of strings parsed from comma-separated
            values. Empty items are dropped, and the attribute is ``[]``
            when the key is missing or its value is empty.
        sendMsgIs, baseLogDir, msgInterval, frequency, sendMsgUrl: the raw
            string value. These are only set when the key is present in the
            file, so a missing key still surfaces as ``AttributeError``.
    """

    # property key -> (attribute name, True when the value is a comma-separated list)
    _KEYS = {
        'application.ports': ('ports', True),
        'process.pids': ('pids', True),
        'short.message.is': ('sendMsgIs', False),
        'base.log.dir': ('baseLogDir', False),
        'short.message.interval': ('msgInterval', False),
        'short.message.phone': ('phones', True),
        'monitor.frequency': ('frequency', False),
        'short.message.url': ('sendMsgUrl', False),
    }

    def __init__(self):
        # List settings default to "nothing configured" so callers can always iterate them.
        for attr, is_list in self._KEYS.values():
            if is_list:
                setattr(self, attr, [])

        with open('monitor.properties', 'r') as dataFile:
            for line in dataFile:
                line = line.strip()  # also removes the trailing \n / \r
                if not line or line.startswith('#'):
                    continue
                # Split on the first '=' only, so values such as URLs may contain '='.
                key, sep, value = line.partition('=')
                if not sep:
                    continue
                target = self._KEYS.get(key.strip())
                if target is None:
                    continue
                attr, is_list = target
                value = value.strip()
                if is_list:
                    # Dropping empty items keeps "8085," from producing an empty port/pid.
                    value = [item.strip() for item in value.split(',') if item.strip()]
                setattr(self, attr, value)


if __name__ == '__main__':
    # Manual smoke test: load the properties file and print a few parsed settings.
    loaded = ConfigInfo()
    for attr_name in ('ports', 'pids', 'sendMsgIs', 'baseLogDir'):
        print(getattr(loaded, attr_name))

Server.py

# -*- coding: utf-8 -*-
import time
import commands as cm
import re
import os
import sys
from ConfigInfo import ConfigInfo
import ShortMsg

lastSendTime = 0  # Unix time (seconds) of the last SMS that was sent; 0 means none sent yet


def senMsg(sendTxt):
    """Send an SMS alert unless alerts are disabled or one was sent too recently.

    Nothing is sent when ``config.sendMsgIs`` equals the string ``'False'``,
    or when fewer than ``config.msgInterval`` minutes have passed since the
    previous SMS. Otherwise the text is handed to ``ShortMsg.senMsg`` and the
    send time is recorded in the module-level ``lastSendTime``.

    Args:
        sendTxt: message text to deliver.
    """
    global lastSendTime

    if config.sendMsgIs == 'False':
        print('sendMsgIs is false')
        return

    currentTime = int(time.time())
    # Throttle only when something was sent before. Once the interval has
    # elapsed, execution must fall through to the send below (an ``else``
    # on the ``lastSendTime`` test used to make that impossible).
    if lastSendTime != 0 and currentTime - lastSendTime < int(config.msgInterval) * 60:
        print('时间间隔小于短信发送间隔， 短信未发送')
        return

    ShortMsg.senMsg(sendTxt, config.phones, config.sendMsgUrl)
    lastSendTime = currentTime

# 执行方法
def _capture_diagnostics(pid, label, perc_str, time_str, is_java, fallback_file):
    """Save a jstack dump (Java pid) or a one-line note (other pid), then send an SMS."""
    if is_java:
        # Save the jstack thread dump into a time-stamped log file
        comm = ('jstack -l ' + pid + ' >> ' + log_dir + 'gc_' + time_str +
                '_' + label + '_' + perc_str + '.log')
    else:
        text = '_' + label + '_' + perc_str + 'time_' + time_str
        comm = 'echo ' + text + ' >> ' + log_dir + fallback_file
    os.system(comm)
    senMsg(comm)  # the SMS text is the command that was executed


def executeFun(pids, cpu_limit=1000.1, mem_limit=40.1):
    """Check CPU and memory usage of each pid and capture diagnostics when too high.

    For every distinct pid, one ``top -b -n1`` sample is taken. If the CPU
    value exceeds ``cpu_limit``, or otherwise if the memory value exceeds
    ``mem_limit``, a jstack dump is appended under ``log_dir`` when the pid is
    listed by ``jps``; for any other process a short note is appended to
    ``other_cup.txt`` / ``other_mem.txt``. An SMS is requested through
    ``senMsg`` in both cases. Finally ``*.log`` files older than 7 days are
    removed from ``log_dir``.

    Args:
        pids: iterable of pid strings; blanks and duplicates are tolerated.
        cpu_limit: CPU percentage above which diagnostics are captured.
        mem_limit: memory percentage above which diagnostics are captured.
    """
    for pid in set(entry.strip() for entry in pids):  # set() drops duplicates
        if not pid.isdigit():
            # A blank or malformed entry must not stop monitoring of the other pids,
            # and must never be spliced into a shell command.
            if pid:
                print('skip invalid pid: %s' % pid)
            continue

        # One top sample supplies both values, so they describe the same instant.
        # top -b -p pid -n1 : batch-mode output for this pid, a single refresh
        # sed -n '8p'       : keep line 8 (assumed to be the process row)
        # awk               : print columns 9 and 10 (assumed %CPU and %MEM)
        top_comm = 'top -b -p  ' + pid + '  -n1|sed -n \'8p\'|awk \'{print $9, $10}\''
        fields = cm.getoutput(top_comm).split()
        if len(fields) != 2:
            # No process row came back for this pid; go on to the next one
            continue
        cpu_perc_str, mem_perc_str = fields
        try:
            cpu_perc = float(cpu_perc_str)
            mem_perc = float(mem_perc_str)
        except ValueError:
            print('unexpected top output for pid %s: %s %s' % (pid, cpu_perc_str, mem_perc_str))
            continue

        print(cpu_perc)
        print(mem_perc)
        # Compact timestamp used in the log file names
        time_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
        print(time_str)

        # The pid counts as a Java process only when a jps row starts with exactly
        # this pid; a substring match would confuse e.g. 123 with 1234.
        jps_rows = cm.getoutput('jps').splitlines()
        pid_exists = any(row.split()[:1] == [pid] for row in jps_rows)

        if cpu_perc > cpu_limit:
            _capture_diagnostics(pid, 'cpu', cpu_perc_str, time_str, pid_exists, 'other_cup.txt')
        elif mem_perc > mem_limit:
            _capture_diagnostics(pid, 'mem', mem_perc_str, time_str, pid_exists, 'other_mem.txt')

    # Delete *.log files older than 7 days; once per call is enough
    rm_log_comm = 'find ' + log_dir + ' -mtime +7 -type f -name "*.log" -exec rm -f {} \\;'
    os.system(rm_log_comm)

def getPidsByPorts(ports):
    """Return the pids of the processes listening on the given TCP ports.

    Args:
        ports: iterable of port-number strings. Blank entries are skipped and
            non-numeric entries are reported and skipped.

    Returns:
        A list of pid strings without duplicates. The result is always a
        list, possibly empty, so callers can safely ``extend`` it.
    """
    pidList = []
    for port in ports:
        port = port.strip()
        if port == '':
            # A blank entry (e.g. from a trailing comma) must not abort the scan
            continue
        if not port.isdigit():
            # Never splice anything but a plain number into the shell command
            print('skip invalid port: %s' % port)
            continue
        # netstat -ntpl : listening TCP sockets, numeric addresses, with PID/Program
        # awk           : keep rows whose local address (column 4) ends in ":<port>"
        #                 and print column 7 (PID/Program name)
        pid_comm = 'netstat -ntpl|awk \'$4 ~ /:' + port + '$/ {print $7}\''
        found = False
        for entry in cm.getoutput(pid_comm).splitlines():
            # Keep the part before "/"; anything that is not a plain number
            # (a "-" placeholder, warning text merged into the output) is ignored.
            pid = entry.split('/')[0].strip()
            if pid.isdigit():
                found = True
                if pid not in pidList:
                    pidList.append(pid)
        if not found:
            print('this port: %s no pid: ' % port)
    return pidList

def _append_lines(path, lines):
    """Append each string in *lines* to *path*, one per line; I/O errors are printed, not raised."""
    try:
        with open(path, 'a') as out:
            for text in lines:
                out.write(text + '\n')
    except (IOError, OSError) as err:
        print('write %s failed: %s' % (path, err))


def globalMonitor(mem_limit=30.0):
    """Record system-wide memory usage and alert on memory-hungry processes.

    Appends the current timestamp (``time_str_format``) and the comma-joined
    output of ``free`` to ``free_result.txt`` under ``log_dir``. Then it reads
    the first 10 lines of ``ps aux --sort=-rss`` and, for each process row
    whose memory percentage exceeds ``mem_limit``, appends the row to
    ``ps_result.txt`` and requests an SMS through ``senMsg``.

    Rows are written directly from Python rather than through ``echo`` in a
    shell, so characters such as ``;``, ``|``, ``*`` or ``$`` in a process
    command line are stored literally instead of being interpreted.

    Args:
        mem_limit: memory percentage above which a process is reported.
    """
    # free: every output line except the empty string after the final newline
    free_rows = os.popen('free').read().split('\n')[:-1]
    free_lines = [time_str_format]
    # Collapse every run of spaces into a single comma
    free_lines.extend(re.sub(' +', ',', row) for row in free_rows)
    _append_lines(log_dir + 'free_result.txt', free_lines)

    # ps: first 10 lines, sorted by resident memory (descending)
    ps_rows = os.popen('ps aux --sort=-rss|head -10').read().split('\n')
    ps_log = log_dir + 'ps_result.txt'
    _append_lines(ps_log, [time_str_format])

    # Skip the first row and the empty string left after the final newline
    for row in ps_rows[1:-1]:
        joined = re.sub(' +', ',', row)
        cols = joined.split(',')
        if len(cols) < 4:
            # Too short to carry the pid and memory columns
            continue
        ps_top_pid = cols[1]       # pid
        mem_percent = cols[3]      # memory percentage
        ps_top_exedir = cols[10:]  # command and its arguments
        print('PID: %s' % ps_top_pid)
        print('内存的百分比: %s' % mem_percent)
        print('执行目录: %s' % ps_top_exedir)
        print('-------------------------------')
        try:
            mem_value = float(mem_percent)
        except ValueError:
            continue
        if mem_value > mem_limit:
            # Log first, so the record exists even if sending the SMS fails
            _append_lines(ps_log, [joined])
            senMsg(row)  # send the SMS alert


if __name__ == '__main__':
    # Load the settings once; the module-level names below are read by the functions above.
    config = ConfigInfo()
    log_dir = config.baseLogDir  # directory that receives the log files
    # File names are appended to log_dir by plain concatenation, so it must end with "/"
    if log_dir and not log_dir.endswith('/'):
        log_dir += '/'
    frequency = int(config.frequency)  # seconds between two monitoring rounds

    while True:
        # Timestamp written ahead of each round's entries in the result files
        time_str_format = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        globalMonitor()
        # "or []" keeps the loop alive should the port lookup yield nothing at all
        pids = list(getPidsByPorts(config.ports) or [])
        pids.extend(config.pids)  # add the explicitly configured pids
        executeFun(pids)

        # Wait before the next round
        time.sleep(frequency)

ShortMsg.py

# -*- coding: utf-8 -*-
import time
import urllib
import urllib2
# If short.message.is is set to True in monitor.properties, adapt the function below to your SMS gateway.

def senMsg(sendTxt, phones, url, timeout=10):
    """Send *sendTxt* to every phone number through an HTTP GET on the SMS gateway.

    The request carries two query parameters: ``mobilelist`` (the phone
    number) and ``content`` (the text followed by the current local time).
    A failure for one number is printed and the remaining numbers are still
    tried, so a gateway problem does not stop the caller.

    Args:
        sendTxt: message text.
        phones: iterable of phone-number strings.
        url: gateway address; it may already contain a query string.
        timeout: seconds to wait for the gateway before giving up.
    """
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    content = sendTxt + ' ' + time_str
    # Plain concatenation: %-formatting the URL would fail if it contained a '%'
    separator = '&' if '?' in url else '?'
    for phone in phones:
        params = urllib.urlencode({'mobilelist': phone, 'content': content})
        try:
            response = urllib2.urlopen(url + separator + params, timeout=timeout)
            try:
                print('senMsg response: %s' % response.read())
            finally:
                response.close()
        except IOError as err:
            # urllib2.URLError and socket errors/timeouts are IOError subclasses on Python 2
            print('senMsg to %s failed: %s' % (phone, err))

monitor.properties

# 监控刷新的频率， 单位秒
monitor.frequency=10
# 要监控的进程端口号
application.ports=8085,8086
# 要监控的进程 id 号
process.pids=12345,23562
# 是否发送短信
short.message.is=False
# 发送短信的手机号码
short.message.phone=12345678901,12345678902
# 短信地址
short.message.url=http://localhost:9191/sms/phoneMsg/msmpush
# 上次发送短信和当前发送短信最小时间间隔， 单位分钟
short.message.interval=60
# 日志存放目录
base.log.dir=/hnpwqx/gclogs/app_server/