脚本背景:
我所在的公司是运营CDN业务的IDC公司,客户域名的流量图经常会出现毛刺,但是服务的域名非常多,每天挨个查看流量图耗时耗力。因此用python写了个可以自动检测rrd文件里的异常数值并发送报警邮件的脚本。
由于我们的rrd文件是以服务域名命名的,所以先在相应的API上获取服务域名,然后根据域名扫描rrd文件。我设的是扫描半小时的数值,每10分钟执行一次,大概有2000来个rrd文件,执行一次6、7秒左右。
代码如下:
#!/usr/bin/env python
#coding:utf-8
from datetime import datetime
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
import calendar
import email
import os
import smtplib
import subprocess
import sys
import time
import urllib2

from pyrrd.graph import DEF,CDEF,AREA
from pyrrd.graph import Graph
from pyrrd.graph import ColorAttributes
def graphrrd(files):
    """Render a traffic graph for one RRD file and write it to disk.

    The graph covers the 43200 seconds (12 hours) ending now and is written
    to the module-level ``p_w_picpath_dir`` path, replacing any previous
    image at that path.

    Args:
        files: Path of the ``.rrd`` file; it must contain the data sources
            ``RX`` and ``TX``.
    """
    now_utc = calendar.timegm(datetime.utcnow().utctimetuple())

    def1 = DEF(rrdfile=files, vname='back', dsName='RX')
    def2 = DEF(rrdfile=files, vname='CDN', dsName='TX')
    # NOTE(review): 0.026 presumably approximates 8/300 (bytes per 5-minute
    # step -> bits per second) -- confirm against how the RRDs are fed.
    cdef1 = CDEF(vname='back_flow', rpn='%s,0.026,*' % def1.vname)
    cdef2 = CDEF(vname='CDN_flow', rpn='%s,0.026,*' % def2.vname)
    area1 = AREA(defObj=cdef1, color='#002A97FF', legend='back_flow')
    area2 = AREA(defObj=cdef2, color='#00CF00FF', legend='CDN_flow')

    ca = ColorAttributes()
    ca.back = '#333333'
    ca.canvas = '#333333'
    ca.shadea = '#000000'
    ca.shadeb = '#111111'
    ca.mgrid = '#CCCCCC'
    ca.axis = '#FFFFFF'
    ca.frame = '#AAAAAA'
    ca.font = '#FFFFFF'
    ca.arrow = '#FFFFFF'

    graphfile = p_w_picpath_dir
    # The title is the file name without directory and extension.  This
    # replaces the former ``files[23:-4]`` slice, which silently produced a
    # wrong title whenever the RRD directory path was not 22 characters long.
    title_url = os.path.splitext(os.path.basename(files))[0]

    g = Graph(graphfile, start=now_utc - 43200, end=now_utc,
              vertical_label='flow', title=title_url)
    # area2 is added before area1, so area1 is drawn on top of it.
    g.data.extend([def1, def2, cdef1, cdef2, area2, area1])
    g.write()
def connect():
    """Open an SMTP session and authenticate it.

    Uses the module-level ``smtpserver``, ``smtpuser`` and ``smtppass``
    settings.

    Returns:
        The logged-in ``smtplib.SMTP`` object.
    """
    session = smtplib.SMTP(smtpserver)
    session.ehlo()
    session.login(smtpuser, smtppass)
    return session
def sendmessage(server, to, subj, content):
    """Send one HTML alert mail with the rendered graph attached inline.

    The image is read from the module-level ``p_w_picpath_dir`` path and is
    attached with the Content-ID ``<p_w_picpath1>``, which the HTML body can
    reference as ``cid:p_w_picpath1``.

    Args:
        server: Logged-in SMTP session used to send the mail.
        to: Recipient address.
        subj: Subject line.
        content: HTML body, sent as UTF-8.

    Send failures are reported on stdout and not re-raised, so one failing
    recipient does not abort the caller.
    """
    msg = MIMEMultipart('related')
    msg['Subject'] = subj
    msg['From'] = smtpuser
    msg['To'] = to
    # email.utils.formatdate replaces the legacy ``email.Utils`` alias.
    msg['Date'] = formatdate()
    msg.attach(MIMEText(content, "html", "utf-8"))

    # ``with`` closes the image file even if reading or MIME construction
    # raises.
    with open(p_w_picpath_dir, 'rb') as fp:
        msgImage = MIMEImage(fp.read())
    msgImage.add_header('Content-ID', '<p_w_picpath1>')
    msg.attach(msgImage)

    try:
        server.sendmail(smtpuser, to, msg.as_string())
    except Exception as ex:
        # Deliberately best-effort: report and carry on.
        print('%s: %s' % (type(ex).__name__, ex))
        print('Error - send failed')
def aver(rrd_file, n=6):
    """Average the most recent samples of an RRD file.

    Runs ``rrdtool fetch <rrd_file> AVERAGE -s -1d``, keeps the last ``n``
    output lines, drops lines containing ``nan`` or ``RX`` (the header), and
    stores every well-formed row in the module-level ``dict_data`` as
    ``{timestamp: [value1, value2, value3, ...]}`` (values kept as strings).

    Args:
        rrd_file: Path of the RRD file to read.
        n: Number of trailing output lines to consider.

    Returns:
        ``[avg1, avg2, avg3]`` for the first three columns, or ``[]`` when
        there are too few usable rows or the second column's average is
        below 3500000000.
    """
    global dict_data

    # Argument list instead of a shell pipeline: the file name is built from
    # data downloaded over the network and must never be parsed by a shell.
    try:
        proc = subprocess.Popen(
            ['rrdtool', 'fetch', rrd_file, 'AVERAGE', '-s', '-1d'],
            stdout=subprocess.PIPE, universal_newlines=True)
        output = proc.communicate()[0]
    except OSError:
        # rrdtool could not be started; handled below as "no records".
        output = ''

    lines = output.splitlines()
    tail = lines[-n:] if n > 0 else []
    data = [line for line in tail if 'nan' not in line and 'RX' not in line]

    if len(data) < n // 2:
        log("[ERRORS: %s] has not enough record ! please check it!!\n" % rrd_file)
        return []

    sums = [0.0, 0.0, 0.0]
    samples = 0
    for line in data:
        if not line.strip():
            continue
        stamp, _, rest = line.partition(':')
        values = rest.split()
        try:
            parsed = [float(v) for v in values[:3]]
        except ValueError:
            parsed = []
        if len(parsed) < 3:
            # Malformed rows are logged and kept out of dict_data, so later
            # float() / index accesses on its entries cannot fail.
            log('%s %s\n' % (rrd_file, values))
            continue
        dict_data[stamp.strip()] = values
        for idx, number in enumerate(parsed):
            sums[idx] += number
        samples += 1

    if not samples:
        log("[ERRORS: %s] has not enough record ! please check it!!\n" % rrd_file)
        return []

    # Divide by the number of rows actually summed, not by len(data).
    averages = [total / samples for total in sums]
    # NOTE(review): the message says 200M, but with the *8/300000000
    # conversion used for the alert mail this threshold is roughly 93M --
    # confirm which one is intended.
    if averages[1] < 3500000000:
        log('WARNING: %s was less then 200M\n' % rrd_file)
        return []
    return averages
def check(average, threshold=1.6):
    """Return the timestamps whose second column exceeds the average.

    Reads the module-level ``dict_data`` mapping
    ``{timestamp: [value1, value2, ...]}``.

    Args:
        average: Reference value for the second column.
        threshold: A sample is flagged when its second value is greater than
            ``threshold * average``.

    Returns:
        Sorted list of the flagged ``dict_data`` keys.
    """
    # Multiplying instead of dividing avoids ZeroDivisionError when the
    # average is 0.
    limit = threshold * average
    return sorted(
        key for key in dict_data if float(dict_data[key][1]) > limit
    )
def update(rrd_file, t, aver1, aver2, aver3):
    """Record one anomalous sample in the error log and the pending mail body.

    Appends an HTML fragment describing the sample to the module-level
    ``text`` and writes one line through ``write_error``.

    Args:
        rrd_file: Path of the RRD file the sample belongs to.
        t: Key of the sample in the module-level ``dict_data``; it is also
            passed to ``date`` as seconds since 1970-01-01 UTC.
        aver1, aver2, aver3: Accepted for interface compatibility; unused.
    """
    global text

    sample = dict_data[t]
    # Domain = file name without directory and extension.  This replaces the
    # former ``rrd_file[23:-4]`` slice, which depended on the exact length
    # of the RRD directory path.
    domain = os.path.splitext(os.path.basename(rrd_file))[0]
    # Human-readable form of the timestamp, as printed by the system `date`.
    errors_time = os.popen('date -d "1970-01-01 UTC %s seconds"' % t).readline().strip()

    back_m = float(sample[0]) * 8 / 300000000
    cdn_m = int(float(sample[1]) * 8 / 300000000)
    content = '<br/><br/>%s 异常信息:<br/> 域名: %s <br/> 时间: %s<br/> 流量值: 回源带宽: %.2fM , cdn带宽 : %dM <br/> <br/>rrd 异常信息:<br/> 路径: %s<br/> UTC 时间: %s<br/> 异常值: [%s], [%s], [%s]<br/><br/><img src="cid:p_w_picpath1">' % (
        domain, domain, errors_time, back_m, cdn_m,
        rrd_file, t, sample[0], sample[1], sample[2])

    write_error('[ %s ]: at[ %s(%s) ],the value was [%s] [%s] [%s] \n' % (
        rrd_file, errors_time, t, sample[0], sample[1], sample[2]))
    text = text + content
def log(log_write):
    """Append ``log_write`` to ``rrd_alt1.log`` in the ``rrd_bak`` directory.

    Args:
        log_write: Text to append; no newline is added.
    """
    # ``with`` closes the file even if the write raises.
    with open('%s/rrd_alt1.log' % rrd_bak, 'a') as f:
        f.write(log_write)
def write_error(log_write):
    """Append ``log_write`` to ``rrd_error1.log`` in the ``rrd_bak`` directory.

    Args:
        log_write: Text to append; no newline is added.
    """
    # The original ended with ``f.close`` (no call), so the file was never
    # explicitly closed; ``with`` guarantees it is.
    with open('%s/rrd_error1.log' % rrd_bak, 'a') as f:
        f.write(log_write)
def run_script(rrd_file):
    """Check one RRD file and mail an alert if anomalous samples are found.

    Steps: compute the recent averages with ``aver``; flag samples with
    ``check``; for each flagged sample call ``update``; render the graph
    with ``graphrrd``; then send the accumulated module-level ``text`` to
    every address in ``to_all``.

    Args:
        rrd_file: Path of the RRD file to inspect.
    """
    aver_rrd = aver(rrd_file)
    if not aver_rrd:
        return

    wrong_time = check(aver_rrd[1])
    if not wrong_time:
        log('[%s] no errors !\n' % (rrd_file))
        return

    for t in wrong_time:
        update(rrd_file, t, aver_rrd[0], aver_rrd[1], aver_rrd[2])
    graphrrd(rrd_file)

    if not text:
        return
    for to in to_all:
        # One connection per recipient, as before, but now always closed;
        # previously every session was left open.
        server = connect()
        try:
            sendmessage(server, to, subj, text)
        finally:
            try:
                server.quit()
            except smtplib.SMTPException as ex:
                # A failed QUIT must not abort the scan of remaining files.
                log('[ERRORS: %s] closing SMTP connection failed: %s\n' % (rrd_file, ex))
        log('sendmail to %s\n' % to)
if __name__ == '__main__':
    # Settings read as module-level globals by the functions above.
    p_w_picpath_time = time.strftime("%d-%H-%M")
    rrd_dir = '/data/rrd/db/1/billing'
    rrd_bak = '/data/rrd/db/1/billing/bak'
    smtpserver = 'xxx'
    p_w_picpath_dir = '%s/rrdgraph_%s.png' % (rrd_bak, p_w_picpath_time)
    smtpuser = 'xxx'
    smtppass = 'yyy'
    to_all = ['xxx', 'yyy']
    subj = 'check the flow of CDN!!!!'

    # Single pass: the former ``while True: ... break`` ran exactly once.
    local_time = time.strftime("%m-%d %H:%M:%S")
    # Each line of the response names one RRD file (without the extension).
    response = urllib2.urlopen('xxx')
    try:
        url = response.readlines()
    finally:
        # Close the HTTP response; it was previously left open.
        response.close()
    url_list = ["%s/%s.rrd" % (rrd_dir, u.strip()) for u in url]

    log("-"*60+"\n")
    log("the script run time at %s \n" % local_time)
    # Reverse order matches the original, which popped from the list's end.
    for rrd_file in reversed(url_list):
        # Per-file state shared with aver/check/update/run_script.
        text = ''
        dict_data = {}
        if os.path.exists(rrd_file):
            run_script(rrd_file)
    log("-"*60+"\n")
邮件截图
转载于:https://blog.51cto.com/songzhe/1298826