zabbix自定义监控
1. 自定义监控进程
找到 zabbix_agentd.conf 所在的目录,编辑该文件:取消 UnsafeUserParameters 一行行首的注释,并将 0 修改为 1。
[root@client etc]# pwd
/usr/local/etc
[root@client etc]# vim zabbix_agentd.conf
UnsafeUserParameters=1
在文件的最后一行添加下面的内容:这里指定脚本所在的位置以及传给脚本的参数,$1 是传给脚本的第一个参数(因为在实际的生产环境中不止有一个服务需要监控)。脚本路径的位置也可以直接写命令,但由于要实现的功能一条命令无法完成,所以这里使用脚本。
UserParameter=check_process[*],/scripts/check_process.sh $1
然后再重启zabbix代理服务
[root@client etc]# pkill zabbix_agentd
[root@client etc]# zabbix_agentd
2. 编写脚本,脚本需放在统一的位置
[root@client ~]# mkdir /scripts
[root@client ~]# cd /scripts/
[root@client scripts]# ps -ef | grep httpd
root 130028 1 3 18:25 ? 00:00:00 /usr/sbin/httpd -DFOREGROUND
apache 130110 130028 0 18:25 ? 00:00:00 /usr/sbin/httpd -DFOREGROUND
apache 130111 130028 8 18:25 ? 00:00:00 /usr/sbin/httpd -DFOREGROUND
apache 130114 130028 7 18:25 ? 00:00:00 /usr/sbin/httpd -DFOREGROUND
apache 130115 130028 8 18:25 ? 00:00:00 /usr/sbin/httpd -DFOREGROUND
root 130537 19998 0 18:25 pts/0 00:00:00 grep --color=auto httpd
[root@client scripts]# vim check_process.sh
#!/bin/bash
# check_process.sh - report whether a process is running (Zabbix UserParameter).
# Usage:  check_process.sh PATTERN
# Output: prints 0 when at least one `ps -ef` line matches PATTERN (normal),
#         prints 1 when none does (problem).

check_process() {
  local pattern=$1
  local matches

  if [ -z "$pattern" ]; then
    # An empty pattern would match every process and always report "running".
    printf 'usage: %s PATTERN\n' "$0" >&2
    return 2
  fi

  # Drop the grep helpers and this script's own command line first: both carry
  # PATTERN as an argument and would otherwise count as a running process.
  matches=$(ps -ef | grep -F -v -e grep -e "$0" | grep -c -- "$pattern")

  if [ "$matches" -eq 0 ]; then
    echo '1'
  else
    echo '0'
  fi
}

check_process "$@"
3. 执行脚本进行测试
[root@client scripts]# ./check_process.sh httpd
0
[root@client scripts]# systemctl status httpd
● httpd.service - The Apache HTTP Server
Loaded: loaded (/usr/lib/systemd/system/httpd.service; disabled; vendor prese>
Drop-In: /usr/lib/systemd/system/httpd.service.d
└─php-fpm.conf
Active: active (running) since Fri 2021-10-08 18:25:18 CST; 1min 23s ago
[root@client scripts]# ./check_process.sh httpd
1
[root@client scripts]# systemctl status httpd.service
● httpd.service - The Apache HTTP Server
Loaded: loaded (/usr/lib/systemd/system/httpd.service; disabled; vendor prese>
Drop-In: /usr/lib/systemd/system/httpd.service.d
└─php-fpm.conf
Active: inactive (dead)
Docs: man:httpd.service(8)
[root@client scripts]# systemctl stop httpd.service
[root@server ~]# zabbix_get -s 192.168.182.139 -k check_process[httpd]
1
[root@client scripts]# systemctl start httpd.service
[root@server ~]# zabbix_get -s 192.168.182.139 -k check_process[httpd]
0
4. 添加监控项
点击配置,在配置里面找到主机,然后点击。
点击item
创建item
这里的key值就是你配置文件中的key值
在监控里面的最新数据里面找到在item里面填写的名字,然后查看最新500条。因为httpd为开启状态,所以值为0。
5. 添加触发器
5.1 添加具体表达式
// 这里的名字最好见名知意
这里看到httpd说明报警成功
然后我们可以去客户端将httpd服务关闭,再查看最新的数据发现,变成了1。
[root@client etc]# systemctl stop httpd.service
[root@client etc]# ss -anlt
State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 0 128 0.0.0.0:22 0.0.0.0:*
LISTEN 0 128 0.0.0.0:10050 0.0.0.0:*
LISTEN 0 128 [::]:22 [::]:*
6. 自定义监控日志
- 我们下面使用python脚本来监控日志文件,监控的关键字为Error
- 第一个参数为日志文件名(文件的相对路径和绝对路径都可)
- 第二个参数为seek position文件的路径(可选,如果不设置默认的路径为/tmp/logseek文件。相对路径和绝对路径都可以)
- 第三个参数为搜索关键字,默认为Error
执行这个脚本有一个前提条件就是需要主机上有python3,如果没有就安装一个python
[root@client etc]# yum -y install python3
[root@client scripts]# chmod +x log.py
[root@client scripts]# cat log.py
#!/usr/bin/env python3
import sys
import re
def prePos(seekfile):
    """Return the log offset saved by the previous run.

    seekfile: path of a file whose first line holds the offset as a decimal
        integer.

    Returns the saved offset, or 0 when the file cannot be opened or its first
    line is not a non-negative integer.
    """
    try:
        with open(seekfile) as cf:
            saved = int(cf.readline().strip())
    except (IOError, ValueError):
        # Missing/unreadable seek file or garbage content: start from the top.
        return 0
    # A negative offset cannot be passed to seek(); treat it as "no offset".
    return max(saved, 0)
def lastPos(filename):
    """Return the end-of-file offset of filename, or 0 if the file is empty."""
    with open(filename) as handle:
        # An empty first read means there is nothing in the file at all.
        if not handle.readline():
            return 0
        handle.seek(0, 2)
        return handle.tell()
def getSeekFile():
    """Return the seek-file path from the second command-line argument.

    Falls back to '/tmp/logseek' when that argument was not given.
    """
    if len(sys.argv) > 2:
        return sys.argv[2]
    return '/tmp/logseek'
def getKey():
    """Return the search keyword from the third command-line argument.

    Falls back to 'Error' when that argument was not given.
    """
    if len(sys.argv) > 3:
        return str(sys.argv[3])
    return 'Error'
def getResult(filename,seekfile,tagkey):
destPos = prePos(seekfile)
curPos = lastPos(filename)
if curPos < destPos:
curpos = 0
try:
f = open(filename)
except IOError:
print('Could not open file: %s' % filename)
except FileNotFoundError:
print('Could not open file: %s' % filename)
else:
f.seek(destPos)
while curPos != 0 and f.tell() < curPos:
rresult = f.readline().strip()
global result
if re.search(tagkey, rresult):
result = 1
break
else:
result = 0
[root@client scripts]# cat log.py
#!/usr/bin/env python3
import sys
import re
def prePos(seekfile):
    """Return the log offset saved by the previous run.

    seekfile: path of a file whose first line holds the offset as a decimal
        integer.

    Returns the saved offset, or 0 when the file cannot be opened or its first
    line is not a non-negative integer.
    """
    try:
        with open(seekfile) as cf:
            saved = int(cf.readline().strip())
    except (IOError, ValueError):
        # Missing/unreadable seek file or garbage content: start from the top.
        return 0
    # A negative offset cannot be passed to seek(); treat it as "no offset".
    return max(saved, 0)
def lastPos(filename):
    """Return the end-of-file offset of filename, or 0 if the file is empty."""
    with open(filename) as handle:
        # An empty first read means there is nothing in the file at all.
        if not handle.readline():
            return 0
        handle.seek(0, 2)
        return handle.tell()
def getSeekFile():
    """Return the seek-file path from the second command-line argument.

    Falls back to '/tmp/logseek' when that argument was not given.
    """
    if len(sys.argv) > 2:
        return sys.argv[2]
    return '/tmp/logseek'
def getKey():
    """Return the search keyword from the third command-line argument.

    Falls back to 'Error' when that argument was not given.
    """
    if len(sys.argv) > 3:
        return str(sys.argv[3])
    return 'Error'
def getResult(filename, seekfile, tagkey):
    """Scan the part of a log file written since the previous run for a keyword.

    filename: path of the log file to scan.
    seekfile: path of the file holding the offset reached by the previous run;
        it is rewritten with the current end-of-file offset after a scan.
    tagkey: regular expression handed to re.search for every new line.

    Returns 1 if any new line matches tagkey, otherwise 0. If the log file
    cannot be opened, prints a message and returns 0 without rewriting
    seekfile.
    """
    found = 0
    destPos = prePos(seekfile)
    try:
        curPos = lastPos(filename)
        f = open(filename)
    except IOError:
        print('Could not open file: %s' % filename)
        return found
    with f:
        if curPos < destPos:
            # The log is shorter than the saved offset (rotated or truncated),
            # so the offset is stale: rescan from the beginning.
            destPos = 0
        f.seek(destPos)
        while curPos != 0 and f.tell() < curPos:
            line = f.readline()
            if not line:
                # EOF before curPos: the file shrank while being scanned.
                break
            if re.search(tagkey, line.strip()):
                found = 1
                break
    # Remember where this run ended so the next run only sees newer lines.
    with open(seekfile, 'w') as sf:
        sf.write(str(curPos))
    return found
if __name__ == "__main__":
    # Usage: log.py LOGFILE [SEEKFILE] [KEYWORD]
    # Prints 1 when KEYWORD appears in the newly written part of LOGFILE, else 0.
    if len(sys.argv) < 2:
        # Without a log file there is nothing to scan; fail with a usage hint
        # rather than an IndexError traceback.
        sys.stderr.write('Usage: %s LOGFILE [SEEKFILE] [KEYWORD]\n' % sys.argv[0])
        sys.exit(2)
    # Module-level defaults read by the helper functions through `global`.
    result = 0
    curpos = 0
    tagkey = getKey()
    seekfile = getSeekFile()
    result = getResult(sys.argv[1], seekfile, tagkey)
    print(result)
6.1 传递参数测试脚本
[root@client scripts]# ./log.py /var/log/httpd/error_log
0
[root@client scripts]# echo "Error" >> /var/log/httpd/error_log
[root@client scripts]# ./log.py /var/log/httpd/error_log
1
[root@client tmp]# rm -f logseek
6.2 修改客户端的配置文件
在最后一行添加下面的内容,并重启服务
[root@client scripts]# vim /usr/local/etc/zabbix_agentd.conf
UserParameter=check_log[*],/scripts/log.py $1 $2 $3
[root@client scripts]# pkill zabbix_agentd
[root@client scripts]# zabbix_agentd
[root@client scripts]# chmod 755 /var/log/httpd/
// 返回到服务端看能否获取到数据
[root@server ~]# zabbix_get -s 192.168.182.139 -k check_log[/var/log/httpd/error_log]
0
[root@client scripts]# echo "Error" >> /var/log/httpd/error_log
[root@server ~]# zabbix_get -s 192.168.182.139 -k check_log[/var/log/httpd/error_log]
1
6.3 在web界面配置监控项和触发器
| 主机 | IP |
|---|---|
| client(主) | 192.168.182.139 |
| slave(从) | 192.168.182.141 |
7. 自定义监控mysql主从
7.1 写脚本,脚本需要在从库上监控
[root@slave scripts]# cat check_mysql.sh
#!/bin/bash
# check_mysql.sh - report MySQL replication thread health (Zabbix UserParameter).
# Output: prints 0 when both Slave_IO_Running and Slave_SQL_Running are "Yes",
#         prints 1 otherwise (including when mysql cannot be queried).
#
# NOTE(review): the password is hard-coded and passed on the command line,
# where other local users can see it in `ps`; prefer a client option file.
pass=1

# Reads `show slave status \G` output on stdin and prints 0 or 1 as above.
slave_threads_status() {
  local running
  # Each healthy thread contributes exactly one matching line.
  running=$(grep -cE 'Slave_IO_Running: Yes|Slave_SQL_Running: Yes')
  if [ "$running" -eq 2 ]; then
    echo '0'
  else
    echo '1'
  fi
}

# Errors are discarded on purpose: a failed query yields no lines, hence 1.
mysql -uroot -p"$pass" -e "show slave status \G;" 2>/dev/null | slave_threads_status
7.2 修改客户端的配置文件也就是从库的配置
[root@slave scripts]# vim /usr/local/etc/zabbix_agentd.conf
# Uncomment this line and change 0 to 1
UnsafeUserParameters=1
UserParameter=check_mysql,/scripts/check_mysql.sh
7.3 手动触发并验证
[root@slave scripts]# ./check_mysql.sh
0
mysql> show slave status\G
*************************** 1. row ***************************
Slave_IO_State: Waiting for master to send event
Master_Host: 192.168.182.139
Master_User: test
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: mysql_bin.000005
Read_Master_Log_Pos: 154
Relay_Log_File: mysql_relay.000018
Relay_Log_Pos: 367
Relay_Master_Log_File: mysql_bin.000005
Slave_IO_Running: Yes
Slave_SQL_Running: Yes
mysql> stop slave;
Query OK, 0 rows affected (0.00 sec)
mysql> show slave status\G
*************************** 1. row ***************************
Slave_IO_State:
Master_Host: 192.168.182.139
Master_User: test
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: mysql_bin.000005
Read_Master_Log_Pos: 154
Relay_Log_File: mysql_relay.000018
Relay_Log_Pos: 367
Relay_Master_Log_File: mysql_bin.000005
Slave_IO_Running: No
Slave_SQL_Running: No
[root@slave scripts]# ./check_mysql.sh
1
[root@server ~]# zabbix_get -s 192.168.182.141 -k check_mysql
1
7.4 web界面配置报警
- 创建主机
7.5 添加完成之后点击确定
7.5.1 添加触发器
添加完成之后点击插入
最后点击添加
8. 手动触发并查看
mysql> show slave status\G
*************************** 1. row ***************************
Slave_IO_State:
Master_Host: 192.168.182.139
Master_User: test
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: mysql_bin.000005
Read_Master_Log_Pos: 154
Relay_Log_File: mysql_relay.000018
Relay_Log_Pos: 367
Relay_Master_Log_File: mysql_bin.000005
Slave_IO_Running: No
Slave_SQL_Running: No
9. MySQL主从延迟
[root@slave scripts]# cat check_replication_delay.sh
#!/bin/bash
# check_replication_delay.sh - report MySQL replication lag (Zabbix UserParameter).
# Output: prints the Seconds_Behind_Master value, or 0 when it is NULL, missing
#         or not a number, so the item always receives a numeric value.
#
# NOTE(review): the password is hard-coded and passed on the command line,
# where other local users can see it in `ps`; prefer a client option file.
pass=1

# Reads `show slave status\G` output on stdin and prints the delay as above.
replication_delay() {
  local delay
  delay=$(awk '/Seconds_Behind_Master/ {print $2; exit}')
  case "$delay" in
    '' | *[!0-9]*)
      # NULL (replication stopped), empty (query failed) or any other
      # non-numeric text is reported as 0.
      echo '0'
      ;;
    *)
      echo "$delay"
      ;;
  esac
}

mysql -uroot -p"$pass" -e 'show slave status\G' 2>/dev/null | replication_delay
9.1 修改配置文件
[root@slave scripts]# tail -1 /usr/local/etc/zabbix_agentd.conf
UserParameter=check_replication_delay,/scripts/check_replication_delay.sh
测试
Seconds_Behind_Master: NULL
[root@server ~]# zabbix_get -s 192.168.182.141 -k check_replication_delay
0
9.2 创建主机
点击添加
9.3 添加监控项
9.4 点击添加
这里为0是为了看到效果,在实际的生产环境中要根据实际情况来设置延迟的阈值。