Prometheus 提供了一些常用的 exporter,但在进程监控方面,我们往往需要更加详细的指标,例如程序是否存活、内存、CPU、句柄数、线程数等等。
下面是一个示例,演示如何在机器上监控指定进程,并通过 HTTP 端口暴露指标,供 Prometheus 抓取。
#!/usr/bin/python
# -*- coding:utf-8 -*-
import glob
import logging
import os
import psutil
import requests
import socket
import struct
import time
import fcntl
from prometheus_client import Gauge, start_http_server
# Static settings for the exporter.  In the code visible here only the
# "project" entry is read (it selects the /data/<project>_* directories).
DataBase = dict(
    user="prometheus",
    password="AgTKZu4RkOZvqKZA",
    project="ht",
    host="127.0.0.1",
)

# Metric-name suffixes: one Gauge per suffix is created for every role.
monitor = [
    '_cpu',
    '_memory',
    '_threads',
    '_fs',
    '_status',
    '_ctime',
]

# Console logger: a stream handler emitting DEBUG and above with a
# timestamped "time - name - level - message" layout.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger = logging.getLogger("LOG FORMAT")
logger.setLevel(logging.DEBUG)
logger.addHandler(ch)
class GameServer(object):
    """Read-only facts about the host this exporter runs on.

    Every property is re-evaluated on each access; nothing is cached.
    """

    @property
    def channel(self):
        """Return the paths matching ``/data/<project>_*``.

        ``<project>`` comes from ``DataBase["project"]``.  According to the
        original author's note, each such "channel" directory holds its own
        login servers and game servers.

        :return: list of matching paths (empty when nothing matches).
        """
        return glob.glob("/data/%s_*" % DataBase["project"])

    @property
    def mysql(self):
        """Return the resident memory (RSS) of the MySQL process in MiB.

        The pid is read from the first ``*.pid`` file found under
        ``/data/mysql_data/data/``.

        :return: RSS divided by 1024 twice, or 0 when there is no pid file,
                 the pid file cannot be read or parsed, or the process is
                 not running.
        """
        pid_files = glob.glob("/data/mysql_data/data/*.pid")
        if not pid_files:
            # No pid file at all: treat MySQL as not running.
            return 0
        try:
            with open(pid_files[0]) as f:
                pid = int(f.read())
            p = psutil.Process(pid)
            if p.is_running():
                return p.memory_info().rss / 1024 / 1024
        except (IOError, OSError, ValueError, psutil.Error):
            # Unreadable/garbled pid file, or a stale pid whose process is
            # gone or inaccessible.
            logger.exception("unable to inspect mysql via %s", pid_files[0])
        return 0

    @property
    def local_ip(self):
        """Return the IPv4 address of interface ``eth0`` as a dotted string.

        :raises IOError/OSError: when the ioctl fails (for example when the
                 host has no ``eth0`` interface).
        """
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            ifreq = fcntl.ioctl(
                s.fileno(),
                0x8915,  # SIOCGIFADDR
                # struct.pack('256s', ...) needs bytes; a bytes literal is
                # accepted on both Python 2 and Python 3.
                struct.pack('256s', b'eth0'[:15])
            )
        finally:
            # The socket is only a handle for the ioctl; always release it.
            s.close()
        return socket.inet_ntoa(ifreq[20:24])

    @property
    def wan_ip(self):
        """Return this host's public IPv4 from the Tencent Cloud metadata service.

        :return: the address as a native string, or ``'0.0.0.0'`` when the
                 request fails or answers with an HTTP error status.
        """
        try:
            r = requests.get('http://metadata.tencentyun.com/meta-data/public-ipv4', timeout=5)
            # Without this an HTTP error page would be used as the address.
            r.raise_for_status()
            data = r.content.strip()
            if not isinstance(data, str):
                # Python 3: content is bytes; label values should be text.
                data = data.decode('ascii')
        except Exception:
            # Deliberately best-effort: the exporter must still start when
            # the metadata service is unreachable.
            logger.exception("unable to query the public IP")
            data = '0.0.0.0'
        return data
class Action(object):
    """Stateless helpers that turn server pid files into gauge samples."""

    @staticmethod
    def _idle_metrics():
        """Return the all-zero sample reported for a server that is not running."""
        return {'cpu': 0, 'memory': 0, 'threads': 0, 'status': 0, 'fs': 0, 'ctime': 0}

    @staticmethod
    def step_1(channels):
        """Map every server directory found in *channels* to its recorded pid.

        A server is a sub-directory whose name contains an underscore (and
        does not start with a dot).  Its pid file is expected at
        ``<server>/run/<server name with underscores removed>.pid``.

        :param channels: iterable of channel directory paths.
        :return: dict ``{server_name: pid}`` where ``pid`` is the first line
                 of the pid file as a stripped string, or the integer ``0``
                 when the pid file is missing or unreadable.  Servers with
                 the same name in different channels share one key (the
                 later one wins).
        :raises OSError: when a channel directory cannot be listed.
        """
        temp_dict = dict()
        for channel in channels:
            # List the directory rather than os.chdir() into it, so the
            # process-wide working directory is left untouched.
            for server in os.listdir(channel):
                # Same selection as the shell pattern "*_*".
                if "_" not in server or server.startswith("."):
                    continue
                server_dir = os.path.join(channel, server)
                if not os.path.isdir(server_dir):
                    continue
                pid_file = os.path.join(
                    server_dir, "run", "%s.pid" % server.replace("_", ""))
                if not os.path.exists(pid_file):
                    temp_dict[server] = 0
                    continue
                try:
                    with open(pid_file, 'r') as f:
                        temp_dict[server] = f.readline().strip()
                except (IOError, OSError):
                    logger.exception("unable to read pid file %s", pid_file)
                    temp_dict[server] = 0
        return temp_dict

    @staticmethod
    def step_2(m_data=None):
        """Sample the monitored metrics for every server in *m_data*.

        :param m_data: dict ``{server_name: pid}`` as produced by ``step_1``;
                       ``None`` is treated as an empty dict.
        :return: dict ``{server_name: sample}`` where each sample has the
                 keys ``status``, ``cpu``, ``memory``, ``threads``, ``fs``
                 and ``ctime``.  Every server always gets a sample: it is
                 all zeros when the pid is missing, not a positive integer,
                 not running, or cannot be inspected.
        """
        monitor_data = dict()
        for server, pid in (m_data or {}).items():
            d = Action._idle_metrics()
            try:
                pid = int(pid)
            except (TypeError, ValueError):
                # Empty or garbled pid file: report the server as down
                # (status 0) instead of crashing the exporter.
                pid = 0
            if pid > 0:
                try:
                    p = psutil.Process(pid)
                    if p.is_running():
                        # Built as one literal so a failure part-way
                        # through leaves the all-zero sample in place.
                        d = {
                            'status': 1,
                            # Blocks for 0.1s to measure CPU usage.
                            'cpu': p.cpu_percent(interval=0.1),
                            'memory': p.memory_info().rss / 1024 / 1024,
                            'threads': p.num_threads(),
                            'fs': p.num_fds(),
                            'ctime': p.create_time(),
                        }
                except Exception:
                    # Best-effort per server: one unreadable process must
                    # not prevent the others from being sampled.
                    logger.exception("unable to sample %s (pid %s)", server, pid)
            monitor_data[server] = d
        return monitor_data

    @staticmethod
    def create_series(role=None):
        """Create one Prometheus Gauge per entry of ``monitor`` for *role*.

        Each gauge is named ``<text of role before the first "_"><suffix>``,
        described as ``<role><suffix>``, and carries the labels ``role``,
        ``serverid``, ``local_ip`` and ``wan_ip``.

        :param role: role name (a string; the ``None`` default is not usable).
        :return: dict ``{role + suffix: Gauge}``.
        """
        prefix = role.split("_")[0]
        func = dict()
        for item in monitor:
            func[role + item] = Gauge(
                prefix + item, role + item,
                ['role', 'serverid', 'local_ip', 'wan_ip'])
        return func
def main():
    """Serve the metrics on port 8111 and refresh them every 60 seconds.

    Host facts (IPs and channel directories) are read once at startup.
    Each cycle re-reads the pid files, samples the processes and updates
    the gauges.  Gauges for a role are created the first time that role is
    seen, so servers added after startup are picked up as well.
    """
    refresh_seconds = 60

    gs = GameServer()
    wan_ip = gs.wan_ip
    local_ip = gs.local_ip
    channels = gs.channel
    start_http_server(8111)

    series = dict()
    known_roles = set()
    while True:
        try:
            s1 = Action.step_1(channels=channels)
            s2 = Action.step_2(m_data=s1)
            for role, data in s2.items():
                parts = role.split("_")
                key_role, key_id = parts[0], parts[1]
                if key_role not in known_roles:
                    series.update(Action.create_series(key_role))
                    known_roles.add(key_role)
                for m in monitor:
                    gauge = series[key_role + m]
                    gauge.labels(
                        role=key_role, serverid=key_id,
                        local_ip=local_ip, wan_ip=wan_ip,
                    ).set(data.get(m.strip("_")))
        except Exception:
            # Top-level boundary: log and keep the exporter alive.
            logger.exception("metrics refresh failed")
        # Sleep outside the try block so that a failing cycle cannot turn
        # this loop into a busy spin.
        time.sleep(refresh_seconds)


if __name__ == '__main__':
    main()
启动程序后,使用 ip + 端口(本例为 8111)访问 web 页面,就可以得到你想要的数据了。
在 Prometheus 页面,我们用采集到的数据检查一下。
可以看到,我们自己的 exporter 也采集到了数据。
这样我们就能在 Grafana 里进行展现了。
好了,自定义的 exporter 就完成了,下次我们说说 Prometheus 如何和 Consul 搭配来完成我们的自动发现。