1.复制check_ganglia.py到/usr/lib64/nagios/plugins
check_ganglia.py(自行修改的,官方的有BUG)
#!/usr/bin/env python
import sys
import getopt
import socket
import xml.parsers.expat
class GParser:
    """Pull one metric value for one host out of a Ganglia XML dump.

    The parser looks for ``HOST`` elements whose ``NAME`` attribute equals
    ``host`` and, inside such a host, for ``METRIC`` elements whose ``NAME``
    attribute equals ``metric``; the metric's ``VAL`` attribute is the result.
    """

    def __init__(self, host, metric):
        """Remember which host and metric to look for."""
        self.inhost = 0       # 1 while the current HOST element is the wanted one
        self.inmetric = 0     # not read by this class; kept for compatibility
        self.value = None     # raw VAL attribute of the matching metric, if any
        self.host = host
        self.metric = metric

    def parse(self, file):
        """Parse the XML text ``file`` and return the metric as a float.

        Raises ``Exception('Host/value not found')`` when no matching
        host/metric pair exists; expat errors and ``ValueError`` (non-numeric
        ``VAL``) propagate unchanged.
        """
        # Reset per-run state so a reused instance cannot return a stale value.
        self.inhost = 0
        self.value = None
        p = xml.parsers.expat.ParserCreate()
        # Bind the handler to this instance; the previous code relied on a
        # module-level variable that happened to be called ``parser``.
        p.StartElementHandler = self.start_element
        p.Parse(file)
        if self.value is None:
            raise Exception('Host/value not found')
        return float(self.value)

    def start_element(self, name, attrs):
        """Expat start-element callback: track the wanted host, grab the metric."""
        if name == "HOST":
            # Clear the flag for every other host; otherwise a same-named
            # metric of a later host would overwrite the wanted value.
            self.inhost = 1 if attrs["NAME"] == self.host else 0
        elif self.inhost == 1 and name == "METRIC":
            if attrs["NAME"] == self.metric:
                self.value = attrs["VAL"]
def usage():
    """Print the command-line synopsis and exit with status 3 (Nagios UNKNOWN)."""
    # print(<single string>) produces the same output on Python 2 and 3.
    print(
        "Usage: check_ganglia "
        "-h|--host= -m|--metric= -w|--warning= "
        "-c|--critical= [-s|--server=] [-p|--port=] "
    )
    sys.exit(3)
def check_thresholds(value, warning, critical):
    """Classify ``value`` against the warning/critical thresholds.

    When ``critical > warning`` higher values are worse and a threshold is
    hit when ``value >= threshold``.  Otherwise (including equal thresholds)
    lower values are worse and a threshold is hit when ``value <= threshold``.

    Returns an ``(exit_code, label)`` tuple: ``(2, "CRITICAL")``,
    ``(1, "WARNING")`` or ``(0, "OK")``.
    """
    if critical > warning:
        is_critical = value >= critical
        is_warning = value >= warning
    else:
        is_critical = value <= critical
        is_warning = value <= warning
    if is_critical:
        return 2, "CRITICAL"
    if is_warning:
        return 1, "WARNING"
    return 0, "OK"


def fetch_ganglia_xml(server, port):
    """Read everything ``server:port`` sends until EOF; newlines become spaces."""
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((server, port))
        stream = s.makefile("r")
        try:
            # A single read() replaces the former quadratic "+=" line loop.
            return stream.read().replace("\n", " ")
        finally:
            stream.close()
    finally:
        # Release the socket even when connecting or reading fails.
        s.close()


if __name__ == "__main__":
    ganglia_host = '127.0.0.1'
    ganglia_port = 8649
    host = None
    metric = None
    warning = None
    critical = None

    try:
        options, args = getopt.getopt(
            sys.argv[1:],
            "h:m:w:c:s:p:",
            ["host=", "metric=", "warning=", "critical=", "server=", "port="],
        )
        for o, a in options:
            if o in ("-h", "--host"):
                host = a
            elif o in ("-m", "--metric"):
                metric = a
            elif o in ("-w", "--warning"):
                warning = float(a)
            elif o in ("-c", "--critical"):
                critical = float(a)
            elif o in ("-p", "--port"):
                ganglia_port = int(a)
            elif o in ("-s", "--server"):
                ganglia_host = a
    except (getopt.GetoptError, ValueError) as err:
        # ValueError: a non-numeric threshold/port used to escape as a
        # traceback with exit status 1, which Nagios reads as WARNING.
        print("check_gmond: %s" % err)
        usage()
        sys.exit(3)

    if critical is None or warning is None or metric is None or host is None:
        usage()
        sys.exit(3)

    try:
        # ``parser`` deliberately stays a module-level name, as before.
        parser = GParser(host, metric)
        value = parser.parse(fetch_ganglia_xml(ganglia_host, ganglia_port))
    except Exception as err:
        # Any failure to obtain the value is reported as UNKNOWN (status 3).
        print("CHECKGANGLIA UNKNOWN: Error while getting value \"%s\"" % (err,))
        sys.exit(3)

    exit_code, label = check_thresholds(value, warning, critical)
    print("CHECKGANGLIA %s: %s is %.2f" % (label, metric, value))
    sys.exit(exit_code)
2.创建/etc/nagios/objects/ganglia-services.cfg
# Host entry for the machine whose flume metrics are checked.
# Inline comments use ';' because Nagios only treats '#' as a comment when it
# starts the line; a mid-line '#' would become part of the directive value.
define host {
    use         linux-server
    host_name   1.1.1.1    ; free-form name; this note uses the IP of the machine running flume
    address     1.1.1.1    ; IP address of the monitored machine (the one running flume)
}
# Host group referenced by the ganglia-service definition via hostgroup_name.
define hostgroup {
hostgroup_name ganglia-servers
alias nagios server
members * ; wildcard; presumably matches every defined host (confirm against the Nagios docs)
}
# Service group that the ganglia-service definition joins via service_groups.
define servicegroup {
servicegroup_name ganglia-metrics
alias Ganglia Metrics
}
# Command wrapper around the check_ganglia.py plugin.
#   $ARG1$ = metric name, $ARG2$ = warning threshold, $ARG3$ = critical threshold
# The -h value must match the host name as Ganglia reports it: run the script
# by hand on the command line to see whether the IP or the host name works.
# (Kept as full-line comments so no comment text ends up inside command_line.)
define command {
    command_name    check_ganglia
    command_line    /usr/lib64/nagios/plugins/check_ganglia.py -h mg -m $ARG1$ -w $ARG2$ -c $ARG3$
}
# Template shared by all Ganglia-backed service checks.
define service {
    use                     generic-service
    name                    ganglia-service
    hostgroup_name          ganglia-servers
    service_groups          ganglia-metrics
    notifications_enabled   0
    register                0    ; template only: it has no service_description/check_command, and 'register' is not inherited
}
# Check flume.CHANNEL.memoryChannel.EventPutSuccessCount. To monitor another
# metric, copy this block and change only service_description and check_command.
# Inline comments use ';' because Nagios only treats '#' as a comment when it
# starts the line; a mid-line '#' would become part of the directive value.
define service{
    max_check_attempts      5
    normal_check_interval   3
    retry_check_interval    2
    check_period            24x7
    notification_interval   60
    notification_period     24x7
    notification_options    w,u,c,r
    contact_groups          admins
    use                     ganglia-service
    service_description     FLUME发送event数量    ; label shown in the Nagios web page
    check_command           check_ganglia!flume.CHANNEL.memoryChannel.EventPutSuccessCount!10!50    ; metric name copied from the Ganglia graph title; 10 = warning, 50 = critical
}
3.修改contacts.cfg
vi /etc/nagios/objects/contacts.cfg
# Contact notified by e-mail (see the notify-*-by-email commands below).
define contact{
contact_name nagiosadmin ; Short name of user
use generic-contact ; Inherit default values from generic-contact template (defined above)
alias Nagios Admin ; Full name of user
service_notification_period workhours ;
host_notification_period workhours ;
service_notification_options w,u,c,r ;
host_notification_options d,u,r ;
service_notification_commands notify-service-by-email ;
host_notification_commands notify-host-by-email ;
email 12345@qq.com; after copying this block, only the recipient address needs to be changed
}
# Contact group named by contact_groups in the flume service definition.
define contactgroup{
contactgroup_name admins
alias bfire
members nagiosadmin
}
4.修改nagios.cfg
vi /etc/nagios/nagios.cfg
加入cfg_file=/etc/nagios/objects/ganglia-services.cfg
5.重启nagios和apache
service nagios restart
service httpd restart
6.网页设置(http://ip/ganglia)
7.查看nagios日志
more /var/log/nagios/nagios.log
日志中出现 SERVICE NOTIFICATION 表示 Nagios 已触发邮件通知;邮件是否真正送达,还需查看邮件系统日志(如 /var/log/maillog)。
8.邮件配置
yum remove sendmail
service postfix restart
## 发送测试邮件
echo "how are you today" | mail -s "test" 12345@qq.com