1 获取Prometheus target数据
调用http://<prometheus.address>/api/v1/targets
并解析。
def getTargetsStatus(address, timeout=10):
    '''
    :param address: Prometheus base address, e.g. http://host:9090
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: None
    :description: Query <address>/api/v1/targets and print how many active
                  targets are healthy, followed by the instance label of
                  every target whose health is not 'up'.
    '''
    url = address.rstrip('/') + '/api/v1/targets'
    try:
        response = requests.request('GET', url, timeout=timeout)
    except requests.RequestException:
        # An unreachable or hanging server is reported the same way as a
        # non-200 answer instead of crashing the caller.
        response = None

    if response is None or response.status_code != 200:
        print('\033[31m\033[1m' + 'Get targets status failed!' + '\033[0m')
        return

    targets = response.json()['data']['activeTargets']
    downList = [target['labels']['instance']
                for target in targets if target['health'] != 'up']
    totalNum = len(targets)
    aliveNum = totalNum - len(downList)

    print('-----------------------TargetsStatus--------------------------')
    print(str(aliveNum) + ' in ' + str(totalNum) + ' Targets are alive !!!')
    print('--------------------------------------------------------------')
    for down in downList:
        print('\033[31m\033[1m' + down + '\033[0m' + ' down !!!')
    print('-----------------------TargetsStatus--------------------------')
2 获取Prometheus 监控信息(cpu、mem、disks)
调用http://<prometheus.address>/api/v1/query?query=<expr>
并解析,其中expr为prometheus的查询语句。
### Per-resource state for disk, CPU and memory usage above the threshold.
### Each dict maps a metric label (instance, or instance:mountpoint for the
### filesystem records) to a tuple (startTime, lastTime, describe, value),
### as written by getCurrentUsageGreater.
diskUsageDict = {}
cpuUsageDict = {}
memUsageDict = {}
### Sampling interval, in seconds
monitorInterval = 5
### How long (in seconds) usage must stay above the threshold before it is printed
diskAlertTime = 5
cpuAlertTime = 300
memAlertTime = 300
### Alert thresholds, in percent
diskThreshold = 80
cpuThreshold = 60
memThreshold = 70
def queryUsage(address, expr, timeout=10):
    '''
    :param address: Prometheus base address, e.g. http://host:9090
    :param expr: Prometheus query expression
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: the decoded JSON response, or an empty dict on any failure
    :description: Run an instant query against <address>/api/v1/query.
    '''
    url = address.rstrip('/') + '/api/v1/query'
    try:
        # Pass the expression through ``params`` so requests URL-encodes it;
        # characters such as '+', '&' or '#' would otherwise corrupt the query.
        response = requests.get(url=url, params={'query': expr}, timeout=timeout)
        return json.loads(response.content.decode('utf8', 'ignore'))
    except Exception as e:
        # Deliberately broad: this runs inside a long-lived monitoring loop,
        # so a failed poll is reported and skipped rather than raised.
        print(e)
        return {}
def orderUsageDict(usageDict, currentTime, monitorInterval):
    '''
    :param usageDict: resource usage dict; each value is a tuple whose
                      second item is the time the entry was last updated
    :param currentTime: timestamp of the current sampling round
    :param monitorInterval: sampling interval, in seconds
    :return: None (usageDict is modified in place)
    :description: Drop entries that are no longer continuously above the
                  threshold, i.e. those whose last update is at least one
                  monitorInterval older than currentTime.
    '''
    # Collect first, delete afterwards: a dict must not change size while
    # it is being iterated.
    staleKeys = [key for key, record in usageDict.items()
                 if currentTime - record[1] >= monitorInterval]
    for key in staleKeys:
        del usageDict[key]
def getCurrentUsageGreater(address, record, threshold, usageDict, monitorInterval):
    '''
    :param address: Prometheus address
    :param record: Prometheus rules record, e.g. node:cpu_usage:ratio
    :param threshold: threshold the record is compared against
    :param usageDict: resource usage dict, updated in place; maps a metric
                      label to (startTime, lastTime, describe, value)
    :param monitorInterval: sampling interval, in seconds
    :return: None
    :description: Query the series whose value is >= threshold, record when
                  each one was first and last seen above it, and drop the
                  entries that are no longer above it.
    '''
    expr = record + '>=' + str(threshold)
    usage = queryUsage(address=address, expr=expr)
    if 'data' not in usage:
        # The query failed, so nothing is known about this round: keep the
        # existing state untouched.
        return

    results = usage['data']['result']
    if not results:
        # A successful but empty answer means nothing exceeds the threshold
        # any more. There is no sample timestamp to prune against, so the
        # streaks are ended explicitly; otherwise stale entries would
        # survive and inflate the duration of a later breach.
        usageDict.clear()
        return

    describe = record.split(':')[1]
    # Filesystem records are tracked per mountpoint, not only per instance.
    perMountpoint = record in ('node:fs_usage:ratio', 'node:fs_root_usage:ratio')
    currentTime = 0
    for metric in results:
        labels = metric['metric']
        metricLabel = labels['instance']
        if perMountpoint:
            metricLabel = metricLabel + ':' + labels['mountpoint']
        utctime = metric['value'][0]
        value = metric['value'][1]
        # Keep the original start time while the streak continues.
        if metricLabel in usageDict:
            startTime = usageDict[metricLabel][0]
        else:
            startTime = utctime
        usageDict[metricLabel] = (startTime, utctime, describe, value)
        currentTime = utctime
    orderUsageDict(usageDict=usageDict, currentTime=currentTime, monitorInterval=monitorInterval)
def printUsageDict(usageDict, alertTime):
    '''
    :param usageDict: resource usage dict; maps a metric label to
                      (startTime, lastTime, describe, value)
    :param alertTime: minimum duration, in seconds, before an entry is printed
    :return: None
    :description: Print every entry that has stayed above its threshold for
                  at least alertTime seconds.
    '''
    # One template instead of '+' concatenation with a trailing '%', which
    # only worked because '%' binds tighter than '+'.
    template = ('%s ----- %s\033[31m\033[1m %s\033[0m'
                ' ----- lasted for\033[31m\033[1m %.2f \033[0mseconds')
    for key, value in usageDict.items():
        deltaT = value[1] - value[0]
        if deltaT >= alertTime:
            print(template % (key, value[2], value[3], deltaT))
def monitorUsageGreater(address):
    '''
    :param address: Prometheus address
    :return: never returns; loops until the process is interrupted
    :description: Poll disk, memory and CPU usage every monitorInterval
                  seconds and print the entries that have stayed above
                  their threshold for longer than their alert time.
    '''
    while True:
        # Each record is paired with its own threshold, state dict and alert
        # time. Memory and CPU previously had their thresholds swapped.
        # The table is rebuilt every cycle so changes to the module-level
        # settings take effect.
        checks = (
            ('node:fs_usage:ratio', diskThreshold, diskUsageDict, diskAlertTime),
            ('node:memory_usage:ratio', memThreshold, memUsageDict, memAlertTime),
            ('node:cpu_usage:ratio', cpuThreshold, cpuUsageDict, cpuAlertTime),
        )
        for record, threshold, usageDict, alertTime in checks:
            getCurrentUsageGreater(address, record, threshold, usageDict, monitorInterval)
            printUsageDict(usageDict, alertTime=alertTime)
        time.sleep(monitorInterval)