当你在某台机器上运行着某个 daemon 时,假设有一天它由于异常崩溃或终止,你也许不会及时知道;如果你维护着多个类似的程序,而且它们还分布在不同的机器上,情况可能会变得更糟。
又或者,你希望通过一封邮件得知某个 program 是否执行完成,而不是每隔一段时间去 ps 一下,等等……
那么下面这个小工具都可以帮你完成。
功能:
监控指定的进程:被监控的进程一旦退出(例如异常终止),就发邮件通知。邮件中包含一些较为重要的信息,如机器 ip、命令行;另外还可以考虑显示程序执行时所在的目录。
命令:
$ sh detector_pid.sh -h
usage:
sh detector_pid.sh <start|stop|status|stopall> [pid]
参数说明:
start pid 启动对某个进程pid的监控
stop pid 停止某个监控程序
status 显示监控列表,包含3列,分别是监控进程pid,被监控进程pid,被监控进程command
stopall 停止监控列表里的所有监控
#!/bin/bash
# author: lijingcheng3359@gmail.com
# Print the two-line usage synopsis of this script on stdout.
help() {
  printf '%s\n' 'usage:' 'sh detector_pid.sh <start|stop|status|stopall> [pid]'
}
# Ensure the record file .detected exists in the current directory.
# When it has to be created, a short notice is printed on stdout first.
init() {
  # Nothing to do when the record file is already there.
  [ -f '.detected' ] && return 0
  echo "touch .detected for record!"
  touch '.detected'
}
# Start watching a process.
# Arguments: $1 - pid of the process to watch (required; prints usage and
#                 exits 1 when empty).
# Outputs:   a status line on stdout; on success appends one record
#            "<detector pid>TAB<watched pid> <command line>" to .detected.
# The detector is this same script re-run in "daemon" mode in the background.
start() {
  if [ "$1" = "" ]; then
    help
    exit 1
  fi
  if ps -p "$1" >/dev/null; then
    echo "start watch process $1!"
    # Re-run this very script ($0) under bash: the script uses bash-only
    # syntax, and "$0" keeps working when it is not named detector_pid.sh.
    nohup bash "$0" daemon "$1" &
    local watcher=$!
    # "args=" prints the command line without a header, so no grep is needed
    # to strip it. The quoted substitution plus printf records the command
    # line verbatim: no glob expansion, no whitespace collapsing, no
    # backslash interpretation.
    printf '%s\t%s %s\n' "$watcher" "$1" "$(ps -p "$1" -o args=)" >> .detected
  else
    echo "process $1 is not exist!"
  fi
}
# Stop one detector.
# Arguments: $1 - pid of the detector (first column of .detected); prints
#                 usage and exits 1 when empty.
# Outputs:   status lines on stdout; on success the detector is sent TERM and
#            its record is removed from .detected.
stop() {
  if [ "$1" = "" ]; then
    help
    exit 1
  fi
  echo "stop detector $1!"
  local known=no
  case "$1" in
    *[!0-9]*)
      # A non-numeric argument can never be a recorded detector pid.
      ;;
    *)
      # Compare the whole first field as a string. A prefix match would let
      # "stop 12" hit the record of detector 123 and kill an unrelated pid 12.
      if awk -v p="$1" '($1 "") == (p "") { hit = 1 } END { exit !hit }' .detected; then
        known=yes
      fi
      ;;
  esac
  if [ "$known" = yes ]; then
    kill "$1"
    # $1 is known to be digits only here, so it is safe inside the regex.
    sed -i -E "/^$1([[:space:]]|\$)/d" .detected
    echo "kill done!"
  else
    echo "detector $1 is not exist!"
  fi
}
# Print the watch list: a header line followed by the records in .detected
# (detector pid, watched pid, watched command line).
# Side effect: blank lines are removed from .detected first.
status() {
  sed -i '/^$/d' .detected
  printf 'DPID\tPID COMMAND\n'
  # cat prints the records verbatim; "echo -e" would have interpreted
  # backslash sequences that are part of a recorded command line.
  cat .detected
}
# Stop every detector listed in .detected: announce and signal the pid in the
# first column of each record, then reset the record file.
stopall() {
  local watcher _rest
  # The "|| [ -n ... ]" part keeps a final line without trailing newline.
  while read -r watcher _rest || [ -n "$watcher" ]; do
    # Blank lines carry no pid.
    [ -n "$watcher" ] || continue
    echo "kill $watcher"
    kill $watcher
  done < .detected
  echo "" > .detected
}
# Detector loop: poll process $1 every 10 seconds; once it is gone, send a
# notification mail, drop its record from .detected and exit.
# Arguments: $1 - pid of the watched process (digits only; otherwise prints
#                 usage and exits 1).
# Globals:   DETECTOR_MAIL_TO (read, optional) - mail recipient, defaults to
#            jingcheng.lijc.
# Requires send_mail.py and .detected in the current directory.
# Always leaves through "exit 1".
daemon() {
  case "$1" in
    '' | *[!0-9]*)
      help
      exit 1
      ;;
  esac
  local detail ip user
  while true; do
    sleep 10
    # Only the exit status matters; discarding the output keeps nohup.out
    # from growing by one ps listing per poll.
    if ! ps -p "$1" >/dev/null 2>&1; then
      # Records look like "<detector pid>TAB<watched pid> <command line>".
      # Match on the second whitespace-separated field (a literal " pid "
      # search misses the TAB) and report the record minus the detector pid.
      detail=$(awk -v p="$1" \
        '($2 "") == (p "") { sub(/^[^[:space:]]+[[:space:]]+/, ""); print; exit }' \
        .detected)
      # Handles both ifconfig layouts: "inet addr:1.2.3.4" and "inet 1.2.3.4".
      ip=$(ifconfig eth0 2>/dev/null \
        | awk '$1 == "inet" { sub(/^addr:/, "", $2); print $2; exit }')
      # No eth0 (or no ifconfig): fall back to the host name.
      [ -n "$ip" ] || ip=$(hostname)
      user=$(whoami)
      python send_mail.py -s detector_pid \
        -l "process $1 is not exist!<br />location:$user@$ip<br />detail:$detail" \
        -t "${DETECTOR_MAIL_TO:-jingcheng.lijc}"
      # $1 is digits only (checked above), so it is safe inside the regex.
      sed -i -E "/^[^[:space:]]+[[:space:]]+$1([[:space:]]|\$)/d" .detected
      exit 1
    fi
  done
}
#start!
# Entry point: show usage when called without arguments or with -h, make sure
# the record file exists, then dispatch on the sub-command in $1 ($2 is the
# optional pid). "daemon" is the internal mode that start() launches.
if [ "$#" -eq 0 ] || [ "$1" = "-h" ]; then
  help
  exit 1
fi
init
# No "break" after the arms: there is no enclosing loop, so bash would only
# print a "break: only meaningful in a loop" error.
case "$1" in
  start) start "$2" ;;
  stop) stop "$2" ;;
  status) status ;;
  stopall) stopall ;;
  daemon) daemon "$2" ;;
  *)
    echo "Input Error!. -h for help"
    exit 1
    ;;
esac
补充说明:
1、程序中有部分tab由于显示原因变成了空格。
2、email部分是个脚本,需要自己准备。