http://sfau.lt/b5T0zO
由于目前服务器规模较小,部署 Zabbix、Nagios 等开源监控系统的必要性并不高,再加上配置和维护所需的时间成本,因此决定用自己编写的脚本,配合 SaltStack 来实现监控。
监控原理很简单,server端负责处理监控信息,agent 端负责收集信息,并统一发送到服务器端。
脚本目录
├── weixin.py
├── __init__.py
└── main.py
main.py
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
import time, socket, threading,json
from weixin import senddata,gettoken
def tcplink(sock, addr):
    """Serve one agent connection: read its whole payload, then process it.

    Args:
        sock: connected socket as returned by ``accept()``.
        addr: ``(host, port)`` tuple of the peer; used only for log output.

    Returns:
        The bytes received from the agent (empty if nothing was sent).
    """
    print('New Connection from %s:%s...' % addr)
    chunks = []
    try:
        while True:
            data = sock.recv(1024)
            # An empty read means the peer closed the connection; 'exit' is
            # the explicit end-of-message marker.
            if not data or data == b'exit':
                break
            chunks.append(data)
        # TCP is a byte stream: one report may arrive split across several
        # recv() calls, so join every chunk before parsing instead of
        # keeping only the last one.
        res = b''.join(chunks)
        if res:
            handler(res)
    finally:
        # Always release the socket, even when the handler raises.
        sock.close()
    print('Connection from %s:%s closed.' % addr)
    return res
# Error reporting
def report(data):
    """Print the alert lines and push them through the weixin helpers.

    Args:
        data: iterable of alert lines; each becomes one line of the message.
    """
    # Build the body in a single pass; %-formatting also accepts non-string
    # items, which plain concatenation would reject with TypeError.
    content = ''.join('%s\n' % (line,) for line in data)
    print(content)
    # NOTE(review): these look like redacted placeholders -- real credentials
    # must be supplied before alerts can be delivered.
    corpid = 'xxxxxxxxxxxx'
    corpsecret = 'xxxxxxxxxxxxxxxxx'
    accesstoken = gettoken(corpid, corpsecret)
    msg = senddata(accesstoken, content)
    print(msg)
    print(data)
# Alert limits for resource reports: a metric is reported once its value
# exceeds the limit. The order also fixes the order of the alert lines.
# (cpu_load could be improved by scaling the limit with the CPU core count.)
_RESOURCE_LIMITS = (
    ("cpu_use", 95),
    ("cpu_load", 3),
    ("mem_use", 85),
    ("disk_use", 75),
)


# Process a client message and check it against the thresholds
def handler(res):
    """Parse one JSON message from an agent and raise an alert if needed.

    Two message types are understood, selected by the ``type`` field:
    ``1`` is a server resource report (``ip``, ``name``, ``cpu_use``,
    ``cpu_load``, ``mem_use``, ``disk_use``) and ``2`` is a service status
    report (``status`` equal to 1 triggers an alert).

    Args:
        res: JSON text (str or bytes) received from the agent.

    Returns:
        True when the message was understood and processed, False when it
        was malformed, incomplete, or of an unknown type.
    """
    try:
        data = json.loads(res)
    except (ValueError, TypeError) as e:
        print(e)
        print("Data type wrong.")
        return False
    # Valid JSON that is not an object (a list, a number, ...) has no fields.
    if not isinstance(data, dict):
        print("Data type wrong.")
        return False

    m_type = data.get('type')

    # Server resource monitoring
    if m_type == 1:
        try:
            ip = data['ip']
            name = data['name']
            readings = [(key, data[key], limit)
                        for key, limit in _RESOURCE_LIMITS]
        except KeyError as e:
            # An incomplete report must not crash the connection thread.
            print("Missing field: %s" % (e,))
            return False

        message = ["ip: %s" % ip, "name: %s" % name]
        print(" ".join("%s" % (v,)
                       for v in [ip] + [value for _, value, _ in readings]))
        try:
            for key, value, limit in readings:
                if value > limit:
                    message.append("%s: %s" % (key, value))
        except TypeError as e:
            # Non-numeric metric values cannot be compared with the limits.
            print(e)
            print("Data type wrong.")
            return False
        # The first two entries are always ip/name; anything beyond them is
        # a metric over its limit.
        if len(message) > 2:
            report(message)
        return True

    # Service monitoring
    if m_type == 2:
        print("service eyes...")
        print(data)
        message = ["oops some service down!"]
        if data.get("status") == 1:
            message.append("message: %s" % data)
            report(message)
        return True

    print("Unknown message type: %s" % (m_type,))
    return False
# Server entry point: accept agent connections and serve each in its own thread.
if __name__=="__main__":
    print("Monitor Service Listening on 9999 port.")
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Allow an immediate restart while old connections linger in TIME_WAIT;
    # without this, bind() can fail with "address already in use".
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        s.bind(('0.0.0.0', 9999))
        s.listen(5)
        while True:
            sock, addr = s.accept()
            t = threading.Thread(target=tcplink, args=(sock, addr))
            t.start()
    except KeyboardInterrupt:
        print("Monitor Service stopped.")
    finally:
        # Release the listening socket on any exit path.
        s.close()
weixin.py
import requests
import json
import sys
def gettoken(corp_id, corp_secret):
    """Fetch an access token from the qyapi.weixin.qq.com gettoken endpoint.

    Args:
        corp_id: value sent as the ``corpid`` query parameter.
        corp_secret: value sent as the ``corpsecret`` query parameter.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.RequestException: the request failed or returned an HTTP
            error status.
        ValueError: the response body is not valid JSON.
        KeyError: the response carries no ``access_token`` (for example an
            API-level error payload).
    """
    gettoken_url = 'https://qyapi.weixin.qq.com/cgi-bin/gettoken'
    try:
        # params= URL-encodes the credentials; the timeout keeps a stalled
        # API call from blocking the caller forever.
        response = requests.get(
            gettoken_url,
            params={'corpid': corp_id, 'corpsecret': corp_secret},
            timeout=10
        )
        # requests only raises HTTPError when asked to check the status.
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        # There is no token to return, so let the caller see the failure
        # instead of continuing with an unbound response.
        raise
    token_json = response.json()
    if 'access_token' not in token_json:
        raise KeyError('access_token missing from response: %r' % (token_json,))
    return token_json['access_token']
# Build a message for the qyapi.weixin.qq.com message/send endpoint.
# NOTE(review): this definition is truncated in the source -- the send_values
# dict is never closed and no request is issued in the visible lines, so the
# rest of the function must be restored before it can run.
def senddata(access_token,content):
# The access token is passed as a query-string parameter of the send URL.
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=' + access_token
send_values = {
# presumably a '|'-separated list of recipient ids -- confirm against the API docs
"touser":"187xxxxxxxx|185xxxxxxxx",
"msgtype":"text",
"agentid":"17",
monitor.py客户端
# monitor.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import socket
import psutil
import os
# Collect host metrics
def getMonitor():
    """Sample this host's resource usage and return it as a JSON string.

    The report is a type-1 (server resource) message with ``ip``, ``name``,
    ``cpu_load`` (5-minute load average), ``cpu_use`` (mean utilisation over
    all CPUs, sampled for 5 seconds), ``mem_use`` (percent of memory in use)
    and ``disk_use`` (percent of the root filesystem in use).

    Returns:
        The JSON-encoded report.
    """
    # Local import keeps this block self-contained.
    import json

    # Host information
    name = socket.getfqdn(socket.gethostname())
    try:
        ip = socket.gethostbyname(name)
    except socket.error:
        # An unresolvable hostname should not stop the report from being sent.
        ip = 'unknown'

    # Memory: report the share in USE (the server alerts on high values);
    # available/total alone would be the share that is free.
    mem = psutil.virtual_memory()
    mem_use = int(100.0 * (mem.total - mem.available) / mem.total)

    # CPU: 5-minute load average plus utilisation averaged over all CPUs
    cpu_load = os.getloadavg()[1]
    per_cpu = psutil.cpu_percent(interval=5, percpu=True)
    cpu_use = sum(per_cpu) / float(len(per_cpu))

    # Disk
    disk_use = psutil.disk_usage('/').percent

    data = {
        # Message type 1 = server resource report; the server dispatches on it.
        "type": 1,
        "ip": ip,
        "name": name,
        "cpu_load": cpu_load,
        "cpu_use": cpu_use,
        "mem_use": mem_use,
        "disk_use": disk_use,
    }
    # json.dumps always yields valid JSON; rewriting the quotes of str(dict)
    # breaks on unicode prefixes or values that contain quotes.
    payload = json.dumps(data)
    print(payload)
    return payload
# Agent entry point: sample the metrics, then push them to the monitor server.
if __name__ == "__main__":
    # Collect first, so the connection is not held open while sampling.
    data = getMonitor()
    # Sockets carry bytes: encode text, pass an existing byte string through.
    payload = data if isinstance(data, bytes) else data.encode('utf-8')
    s = None
    try:
        # Open the connection; the timeout keeps a periodic run from piling
        # up behind an unreachable server.
        s = socket.create_connection(('server_ip', 9999), 10)
        # sendall() retries until the whole payload has been written.
        s.sendall(payload)
    except socket.error as e:
        print(e)
    finally:
        if s is not None:
            s.close()
运行方式:
客户端
在 SaltStack 服务器(master)上通过 crontab 每 5 分钟执行一次监控脚本:
*/5 * * * * salt '*' cmd.script salt://scripts/monitor.py python_shell=true
服务端:将 main.py 作为常驻系统进程运行,侦听 TCP 9999 端口。