压测内存泄露监控:比较压测结果,推送异常告警
一、问题:
游戏各个服务器存在内存泄露,需要在压测时监控定位问题。
二、 解决方案:
- 定时获取 Prometheus 数据
- 比较两次数据,内存增长率超过阈值(1%)时,推送结果到微信
memory_check.py
#!venv/bin/python
# -*- coding: utf-8 -*-
from apscheduler.schedulers.blocking import BlockingScheduler
from lib.statistics import Base
from util.toolkit import tk
from util.wechat_webhook import WechatWebHook
# Baseline sample captured by the first run of mem_task; '' means "not captured yet".
default_data = ''

# Relative growth (0.01 == 1%) above which a warning is pushed.
_MEM_GROWTH_THRESHOLD = 0.01

_GAME_KEY = 'auto-game-01-mem'
_GATEWAY_KEY = 'auto-gateway-01-mem'


def _mem_growth_rate(current, baseline, key):
    """Return the relative change of ``key``'s sample between two payloads."""
    # Index [1][1] is kept from the original code: second entry of the series,
    # second field of that entry (presumably the sample value of a
    # [timestamp, value] pair as returned by Prometheus -- TODO confirm).
    current_value = float(current[key][1][1])
    baseline_value = float(baseline[key][1][1])
    return (current_value - baseline_value) / baseline_value


def mem_task():
    """Compare current memory usage with the baseline and push a warning.

    The first successful run stores its payload in the module-level
    ``default_data``; every run then computes the relative growth of the
    game and gateway series against that baseline. When either exceeds
    ``_MEM_GROWTH_THRESHOLD`` the formatted percentages are printed and
    sent through ``WechatWebHook``.
    """
    global default_data
    print(f'default_data:::{default_data}')

    # Read the clock once so start and end are derived from the same value.
    now = tk.current_time
    payload = Base(start=now - 100, end=now, step=100).node_mem
    if not payload:
        # Nothing to compare; do not overwrite the baseline with an empty result.
        print('current_data::: no memory data returned, skipping this run')
        return

    if default_data == '':
        default_data = payload
    print(f'current_data::: {type(payload)}, {payload}')
    print(payload[_GAME_KEY])

    game_rate = _mem_growth_rate(payload, default_data, _GAME_KEY)
    gateway_rate = _mem_growth_rate(payload, default_data, _GATEWAY_KEY)

    if game_rate > _MEM_GROWTH_THRESHOLD or gateway_rate > _MEM_GROWTH_THRESHOLD:
        game_pct = "%.2f%%" % (game_rate * 100)
        gateway_pct = "%.2f%%" % (gateway_rate * 100)
        print(f'auto服内存警告::: game服 {game_pct};gateway服 {gateway_pct}')
        msg = {
            'game': game_pct,
            'gateway': gateway_pct,
        }
        # Push the warning.
        # NOTE(review): method name is spelled `push_waning_msg` as in the
        # original call -- confirm against util.wechat_webhook.
        WechatWebHook.push_waning_msg(msg)
if __name__ == '__main__':
    # Cron fields for the job: every day of the week at 01:30:00.
    cron_fields = {
        'day_of_week': 'mon-sun',
        'hour': 1,
        'minute': 30,
        'second': 0,
    }
    scheduler = BlockingScheduler()
    scheduler.add_job(func=mem_task, trigger='cron', **cron_fields)
    # Alternative trigger kept from the original for quick local runs:
    # scheduler.add_job(func=mem_task, trigger='interval', seconds=2)
    scheduler.start()
Base.py
import requests
from conf.settings import PROMETHEUS_URI
from util.logger import logger
from typing import Optional, Dict, List, Union
from conf.settings import SERVER_INFO
class Base:
    """Prometheus ``query_range`` helper bound to one time window.

    An instance stores ``start``, ``end`` and ``step`` and sends them with
    every query issued through ``_get_metrics``.
    """

    # Seconds to wait for Prometheus before giving up, so a stalled server
    # cannot block the caller indefinitely.
    # NOTE(review): 10s is a guess -- tune to the deployment.
    REQUEST_TIMEOUT = 10

    def __init__(self, start: int, end: int, step: int = 15):
        """Store the query window.

        Args:
            start: Sent as the ``start`` parameter of ``query_range``.
            end: Sent as the ``end`` parameter of ``query_range``.
            step: Sent as the ``step`` parameter of ``query_range``.
        """
        self.start = start
        self.end = end
        self.step = step

    # Return type used by the metric properties.
    # NOTE(review): callers index each series as series[i][j], so the alias
    # looks one list level too shallow -- confirm before tightening it.
    T = Optional[Dict[str, List[Union[int, str]]]]

    def _init_data(self, *args, **kwargs):
        """No-op placeholder; the body is empty in this class."""
        ...

    @property
    def hostname(self):
        """Return every ``hostname`` in ``SERVER_INFO`` joined with ``|``."""
        return "|".join(s["hostname"] for s in SERVER_INFO)

    def _get_metrics(self, query: str) -> Optional[List[Dict[str, dict]]]:
        """Run ``query`` against Prometheus ``/api/v1/query_range``.

        Args:
            query: PromQL expression to evaluate over the stored window.

        Returns:
            The ``data.result`` entry of the JSON response, or ``None`` when
            the request fails or the status code is not 200 (the failure is
            logged).
        """
        params = {
            "query": query,
            "start": self.start,
            "end": self.end,
            "step": self.step,
        }
        try:
            response = requests.get(
                f"{PROMETHEUS_URI}/api/v1/query_range",
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts are reported the same way as a
            # non-200 answer instead of propagating to the caller.
            logger.error(f"获取prometheus接口失败:::{exc}")
            return None
        if response.status_code != 200:
            logger.error(f"获取prometheus接口失败:::{response}:::{response.text}")
            return None
        return response.json().get('data', {}).get('result')

    @property
    def node_mem(self) -> T:
        """Fetch the memory-usage percentage series per host.

        Returns:
            A dict mapping ``"<hostName>-mem"`` to that series' ``values``
            list, or ``None`` when ``_get_metrics`` returned ``None``.
        """
        # Both metrics use the same label selector; build it once.
        selector = (
            '{'
            'origin_prometheus=~"",'
            'job=~"perf-staging-prometheus",'
            f'hostName=~"({self.hostname})"'
            '}'
        )
        metrics = self._get_metrics(
            query=(
                f'(1 - (node_memory_MemAvailable_bytes{selector} '
                '/ '
                f'(node_memory_MemTotal_bytes{selector})))* 100'
            )
        )
        if metrics is None:
            # The failure was already logged; iterating None would raise TypeError.
            return None
        mem_stats = {f"{m['metric'].get('hostName')}-mem": m['values'] for m in metrics}
        return mem_stats
三、异常告警