#!/bin/sh
# Shared settings for the job-monitoring loop at the bottom of this script.

# Load the login environment when it is available.
# NOTE(review): presumably this is what puts dsjob/dssh on PATH -- confirm.
if [ -r "$HOME/.profile" ]; then
  . "$HOME/.profile"
fi

# Work from the script's own directory: the scratch files used below
# (jobinfo.log, cron.lst, cron.bak, ...) are all relative paths.
curpath=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "$curpath" || exit 1

export LANG=C                  # English month abbreviations from date(1)
tmpfile=jobinfo.log            # receives the output of `dsjob -jobinfo`
lproject=ETL_NEW               # DataStage project
cweek=$(date +%w)              # day of week, 0 (Sunday) .. 6
# %e pads single-digit days with a blank; strip it so that a plain string
# comparison against a crontab day-of-month field can succeed.
cday=$(date +%e | tr -d ' ')
#gday=`expr substr $cday, 1 2` # disabled: appended a "," to single-digit days for grep
Pday=$(date +%Y%b%d)           # e.g. 2010Mar22
today=$(date +%Y%m%d)          # e.g. 20100322
logfile=d${today}.log
printMsg="${today} Job: "      # prefix of every alert message
fatalNum=0                     # failures collected since the last alert
filelog=$curpath/dssh.log      # receives the output of dssh
# Send an alert as a text message (SMS).
#   $1 - message text
# Delivery is site-specific and is not implemented here: replace the no-op
# below with the command that actually sends the message.
sendMessage ()
{
  # A function body needs at least one command -- a comment on its own is a
  # syntax error -- so keep this placeholder until a real sender is added.
  :
}
# Look up the category (directory) a DataStage job is filed under.
#   $1 - job name
# Globals read:
#   filelog       file that receives the dssh output
#   lproject      project to log to (falls back to ETL_NEW when unset)
#   DSSH_WORKDIR  optional; directory dssh is started from
#                 (default /home/dsadm)
# Globals written: Jobname, catname
# Output: the second whitespace-separated field of every line of the dssh
#         output that contains "Category", joined by single blanks, as one
#         line on stdout.
# Returns: 1 when the working directory cannot be entered, 0 otherwise.
# Note: the cd affects the calling shell unless the function is run in a
#       command substitution.
getJobCategory ()
{
  Jobname=$1
  cd "${DSSH_WORKDIR:-/home/dsadm}" || return 1

  # dssh takes its commands on stdin; its report is captured in $filelog.
  # The here-document is unquoted on purpose so that the project and job
  # name are substituted.
  # NOTE(review): the mixed-case "qUIT" is kept verbatim -- confirm that
  # dssh accepts it.
  dssh <<EOF >"$filelog"
LOGTO ${lproject:-ETL_NEW}
select * from DS_JOBS WHERE NAME ='$Jobname';
qUIT
EOF

  catname=$(awk '
    /Category/ && NF >= 2 { out = out sep $2; sep = " " }
    END { print out }' "$filelog")
  # printf instead of an unquoted echo: no globbing, no backslash mangling.
  printf '%s\n' "$catname"
}
# Succeeds when the number $1 is selected by the crontab field $2.
# $2 may be "*" or a comma-separated list of numbers and low-high ranges;
# step values ("*/2", "1-5/2") are not understood and never match.
cronFieldMatches ()
{
  [ "$2" = "*" ] && return 0
  printf '%s\n' "$2" | awk -F, -v want="$1" '
    {
      for (i = 1; i <= NF; i++) {
        n = split($i, r, "-")
        if (n == 1 && r[1] ~ /^[0-9]+$/ && r[1] + 0 == want + 0)
          hit = 1
        if (n == 2 && r[1] ~ /^[0-9]+$/ && r[2] ~ /^[0-9]+$/ &&
            want + 0 >= r[1] + 0 && want + 0 <= r[2] + 0)
          hit = 1
      }
    }
    END { exit !hit }'
}

# Send the failures collected in printMsg as one alert and start a new batch.
flushAlert ()
{
  printMsg="${printMsg} 出现错误,请及时处理."
  #Send Message to Mobile
  sendMessage "${printMsg}"
  printMsg="${today} Job: "
  fatalNum=0
}

#Program Begin
# Build the watch list from the crontab entries that mention the project.
# One line per job: <hour+minute> <day-of-month> <month> <day-of-week> <job>
# (the job name is taken from the 9th crontab field).
# Commented-out crontab lines are skipped, and purely numeric times are
# written with two-digit minutes so that 8:05 becomes 805 rather than 85.
# The sed step rewrites a fully enumerated day-of-week or day-of-month list
# as "*".
crontab -l | awk -v proj="$lproject" '
  $1 ~ /^#/ { next }
  index($0, proj) {
    if ($1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/)
      when = sprintf("%d%02d", $2, $1)
    else
      when = $2 $1
    print when, $3, $4, $5, $9
  }' |
  sed -e 's/1,2,3,4,5,6,0/*/g' \
      -e 's/1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31/*/g' >cron.lst

# Also watch jobs that are not started from crontab; cron_hand.lst uses the
# same line format as cron.lst.
if [ -f cron_hand.lst ]; then
  cat cron_hand.lst >>cron.lst
fi

# TODO: on the 1st of each month, compress the previous month's log files.

# Every 5 minutes, check each job that was due today:
#   - not started today, or reported FAILED -> add to the alert, stop watching
#   - finished (process id 0)               -> stop watching
#   - still running, or not yet due         -> keep watching
# Alerts are only sent after 08:00, in batches of 5 failures plus whatever
# is left at the end of a pass.  The loop ends after 22:00.
while true
do
  ctime=$(date +%H%M)
  rm -f cron.bak
  if [ "$ctime" -gt 2200 ]; then      # end of the monitoring day
    break
  fi
  # Start from an empty list so the mv below also works when no job is
  # kept; otherwise the previous list would be processed all over again.
  : >cron.bak

  # The list is read on fd 3 so that commands run inside the loop cannot
  # swallow it through stdin.
  while read -r line <&3 || [ -n "$line" ]
  do
    # Skip blank lines instead of abandoning the rest of the list.
    [ -n "$line" ] || continue

    # Split with read rather than by word expansion: the fields contain "*".
    read -r dtime dday _dmonth dweek jobname _rest <<EOF
$line
EOF

    # Entries not scheduled for today are dropped from the watch list.
    # NOTE(review): day-of-week AND day-of-month must both match here,
    # whereas cron runs a job when either restricted field matches --
    # confirm this is intended.
    if ! cronFieldMatches "$cweek" "$dweek"; then
      # crontab also accepts 7 for Sunday
      { [ "$cweek" = "0" ] && cronFieldMatches 7 "$dweek"; } || continue
    fi
    cronFieldMatches "$cday" "$dday" || continue

    # A schedule time that is not a plain number can never be "due".
    due=0
    case "$dtime" in
      '' | *[!0-9]*) ;;
      *)
        if [ "$ctime" -gt "$dtime" ]; then
          due=1
        fi
        ;;
    esac

    if [ "$due" -eq 1 ]; then
      dsjob -jobinfo "$lproject" "$jobname" >"$tmpfile"
      jobstatus=$(grep "FAILED" "$tmpfile")
      # Text after the first ":" on the "Process" line, blanks removed.
      jobpid=$(awk -F ":" '/Process/ { gsub(/[ \t]/, "", $2); print $2; exit }' "$tmpfile")
      # Year, month name and zero-padded day of the "Job Start" line, in the
      # same layout as Pday.  Assumes a line such as
      # "Job Start Time : Mon Mar 22 10:00:00 2010" -- TODO confirm.
      logdate=$(awk '/Job Start/ { d = "0" $7; print $9 $6 substr(d, length(d) - 1) }' "$tmpfile")

      if [ "$Pday" != "$logdate" ] || [ -n "$jobstatus" ]; then
        # Not started today, or failed: report it and stop watching it.
        jobcate=$(getJobCategory "$jobname")
        printMsg="${printMsg}${jobcate}.${jobname};"
        fatalNum=$((fatalNum + 1))
        if [ "$fatalNum" -ge 5 ] && [ "$ctime" -gt 800 ]; then
          flushAlert
        fi
        continue
      fi

      if [ "$jobpid" = "0" ]; then      # process has ended
        continue
      fi
    fi

    # Still running, or not due yet: check again on the next pass.
    printf '%s\n' "$line" >>cron.bak
  done 3<cron.lst

  if [ "$fatalNum" -gt 0 ] && [ "$ctime" -gt 800 ]; then
    flushAlert
  fi
  mv cron.bak cron.lst
  sleep 300
done
#Program END