#!/bin/bash
#########################################################################
# File Name: monitor.sh
# Author: wugj
# mail: wugj@biomarker.com
# Created Time: 2015-11-16 (Monday) 15:14:19
#########################################################################
# Monitors system load and memory usage and sends a warning e-mail when a
# limit is exceeded.
#
# The interpreter line must be the very first line of the file to have any
# effect, so it now precedes the header comment.

# Host name with every literal ".local" removed. Parameter expansion is used
# because the dot in the former sed expression 's/.local//g' matched any
# character (so "mylocalhost" was turned into "mhost").
hostname=$(hostname)
hostname=${hostname//.local/}

# Alternative that lists every interface (kept for reference):
#ip=`ifconfig |awk '{print $1,$2}'|egrep -e 'inet addr' -e 'Link'|egrep -v 'lo|127.0.0.1'|cut -f 2 -d ":"|sed 's/Link/:/g'`

# IPv4 address of eth0, parsed from the "inet addr:" line of ifconfig.
ip=$(ifconfig eth0 | grep "inet addr" | cut -f 2 -d ":" | cut -f 1 -d " ")
echo "ip:$ip"

# Number of CPUs (one "model name" line per logical CPU in /proc/cpuinfo).
cpu_num=$(grep -c 'model name' /proc/cpuinfo)
echo "cpu_num:$cpu_num"

# Directory that receives the node status files and the kill log.
stat_path=/share/nas1/wugj/script/shell/log
echo "persent static path:$stat_path"

# Date stamp (yymmdd), taken once so that every name derived from it agrees
# even when the script runs across midnight.
cur_time=$(date +%y%m%d)

# Per-host, per-day node status file.
stat_file="${hostname}${cur_time}.xls"

# Monthly log of killed processes; the column header is written when the
# file does not exist yet (the redirection itself creates the file).
err_log="$(date +%y%m).log"
if [[ ! -f "$stat_path/$err_log" ]]; then
  echo "user pid host date command" >"$stat_path/$err_log"
fi

echo "$stat_file"

# Warning threshold for the per-core load average.
load_warn=0.70

# Extract this machine's static variables

#######################################
# Checks the 15-minute load average per CPU core and prints a warning when
# it is too high.
# Globals:
#   cpu_num    (read)    number of CPUs
#   load_warn  (read)    per-core warning threshold
#   hostname   (read)    host name used in the messages
#   load_15, average_load, average_int (written, left global as before)
# Arguments:
#   $1 - optional 15-minute load average; defaults to the last field of the
#        `uptime` output
# Outputs:
#   The intermediate values and, when a limit is exceeded, one warning line
#   on stdout.
# Returns:
#   0 normally, 1 when cpu_num is not a positive integer.
#######################################
watc_cpu_test() {
  # Guard against a division by zero / garbage core count.
  if [[ ! "$cpu_num" =~ ^[1-9][0-9]*$ ]]; then
    echo "watc_cpu_test: invalid cpu_num '$cpu_num'" >&2
    return 1
  fi

  # 15-minute system load.
  load_15=${1:-$(uptime | awk '{print $NF}')}
  # Some locales print a decimal comma; awk needs a decimal point.
  load_15=${load_15/,/.}
  echo "load_15: $load_15"

  # 15-minute load per core, always with a leading digit and 3 decimals.
  average_load=$(awk -v load="$load_15" -v cores="$cpu_num" \
    'BEGIN { printf "%.3f", load / cores }')
  echo "$average_load"

  # Integer part of the per-core load.
  average_int=${average_load%%.*}
  echo "average_int:$average_int"

  if (( average_int > 0 )); then
    # Per-core load is at least 1.0: warn immediately.
    # ${hostname} needs braces: "$hostname15" would expand the (unset)
    # variable "hostname15" and silently drop the host name.
    echo "${hostname}15分钟的系统平均负载为${average_load},超过警戒值1.0,请立即处理!!!"
  elif awk -v avg="$average_load" -v warn="$load_warn" \
    'BEGIN { exit !(avg + 0 > warn + 0) }'; then
    # Below 1.0 but above the warning threshold. The comparison is done
    # numerically in awk; `expr a \> b` compares decimals as strings.
    echo "${hostname}15分钟的系统平均负载达到 ${average_load},超过警戒值${load_warn},请及时处理。"
  fi
}

#######################################
# Runs a command (or shell function) with a 5 second time limit.
# The command is started in the background together with a watchdog that
# sends SIGHUP to it once the limit expires. Only the background job itself
# is signalled, not processes it may have started.
# Note: this function has the same name as the coreutils `timeout` program
# and therefore takes precedence over it inside this script.
# Arguments:
#   $@ - command and its arguments, passed through unchanged
# Returns:
#   The exit status of the command (129 when it was stopped by SIGHUP);
#   0 when no command is given.
#######################################
function timeout() {
  local waitsec=5
  local pid watchdog status

  (( $# > 0 )) || return 0

  # "$@" keeps every argument intact; the former ($*) re-split them.
  "$@" &
  pid=$!

  # Watchdog. Its output is discarded so that it never keeps a pipe of the
  # caller open, and it cleans up its own sleep when it is cancelled.
  (
    trap 'kill "$sleeper" 2>/dev/null; exit 0' TERM
    sleep "$waitsec" &
    sleeper=$!
    wait "$sleeper" && kill -HUP "$pid"
  ) >/dev/null 2>&1 &
  watchdog=$!

  wait "$pid" 2>/dev/null
  status=$?

  # Always cancel the watchdog, whatever the command's exit status was.
  # Leaving it running after a failed command would later send SIGHUP to
  # a PID that may already belong to another process.
  kill "$watchdog" 2>/dev/null
  wait "$watchdog" 2>/dev/null

  return "$status"
}

# Memory usage limit in percent.
mem_quota=90

#######################################
# Computes the memory usage in percent and reacts when it exceeds mem_quota.
# Usage is (MemTotal - MemFree - Buffers - Cached) / MemTotal.
# Globals:
#   mem_quota  (read)
#   memtotal, memfree, buffers, cached, mem_usage, mem_message (written)
# Arguments:
#   $1 - optional path of the meminfo file, default /proc/meminfo
# Returns:
#   0 when usage is within the quota,
#   1 when it is above the quota (after calling kill_proc),
#   2 when MemTotal could not be read.
#######################################
function watch_mem() {
  local meminfo=${1:-/proc/meminfo}

  mem_message=""

  # One pass over the file; the values are printed as they appear in it.
  read -r memtotal memfree buffers cached <<<"$(
    awk '
      BEGIN { total = free = buffers = cached = 0 }
      $1 == "MemTotal:" { total = $2 }
      $1 == "MemFree:"  { free = $2 }
      $1 == "Buffers:"  { buffers = $2 }
      $1 == "Cached:"   { cached = $2 }
      END { print total, free, buffers, cached }
    ' "$meminfo"
  )"

  if (( ${memtotal:-0} <= 0 )); then
    echo "watch_mem: cannot read MemTotal from $meminfo" >&2
    return 2
  fi

  # A single integer division; three separately truncated divisions
  # overstated the usage by up to 3 percentage points.
  mem_usage=$(( (memtotal - memfree - buffers - cached) * 100 / memtotal ))

  if (( mem_usage > mem_quota )); then
    mem_message="WARN! The Memory usage is over than $mem_usage%"
    kill_proc
    return 1
  fi
  return 0
}

#######################################
# Prints two counters taken from the first line of the stat file:
# "used" (fields 2-4) and "unused" (fields 5-8), separated by a space.
# Arguments:
#   $1 - optional path of the stat file, default /proc/stat
# Outputs:
#   "<used> <unused>" on stdout; nothing when the file is empty.
#######################################
function get_cpu_info() {
  local stat_source=${1:-/proc/stat}

  # %.0f keeps every digit of large counters; a plain `print` may fall
  # back to a 6-digit format for big values in some awk implementations.
  awk 'NR == 1 {
         printf "%.0f %.0f\n", $2 + $3 + $4, $5 + $6 + $7 + $8
         exit
       }' "$stat_source"
}

#######################################
# Appends the five `ps aux` lines with the highest value in column 3 to the
# node status file.
# Globals:
#   stat_path, stat_file (read)
#######################################
function proc_cpu_ps5() {
  # -k3,3nr sorts numerically (descending) on column 3 only. In the former
  # "-nk3r" the key carried its own "r" flag, so GNU sort did not apply the
  # global -n to it and the lines were ordered as text.
  ps aux | sort -k3,3nr | head -n 5 >>"$stat_path/$stat_file"
}

#######################################
# Appends the column header of `top` and its five lines with the highest
# value in column 9 to the node status file, then mails that file to the
# user named in column 2 of the first of those lines.
# Globals:
#   stat_path, stat_file (read)
# Returns:
#   Non-zero when top fails or no process line is found; otherwise the
#   status of mail.
#######################################
function proc_cpu_top5() {
  local report="$stat_path/$stat_file"
  local snapshot top_procs usr

  # One snapshot for everything, so header, list and recipient agree.
  snapshot=$(top -n 1 -b) || return

  # Line 7 of the batch output is the column header.
  printf '%s\n' "$snapshot" | sed -n '7p' >>"$report"

  # Drop summary lines, blank lines and the header; sort on column 9.
  top_procs=$(printf '%s\n' "$snapshot" \
    | grep -v -E '^[[:alpha:]]|^$|COMMAND' \
    | sort -k9,9nr \
    | head -n 5)
  if [[ -z "$top_procs" ]]; then
    echo "proc_cpu_top5: no process lines in top output" >&2
    return 1
  fi
  printf '%s\n' "$top_procs" >>"$report"

  # Owner of the first listed process. (The former line had its backtick
  # in the wrong place, so this variable was never assigned.)
  usr=$(printf '%s\n' "$top_procs" | awk 'NR == 1 { print $2 }')

  # The report is read from $stat_path (it was misspelled "$stat_paht").
  mail -s "cpu load high" -c admin.list@biomarker.com.cn \
    "${usr}@biomarker.com.cn" <"$report"
}

#######################################
# Kills the process with the largest value in the RES column of `top`,
# records it in the kill log and mails the owner and the admin list.
# Only lines whose first column is a numeric PID are considered, so the
# summary lines of top can never be selected. Processes owned by root are
# skipped, as in the earlier (commented-out) version of this function.
# RES values ending in m/g/t are scaled by 1024 per step; values without a
# suffix are taken as they are.
# Globals:
#   stat_path, err_log, hostname (read)
# Outputs:
#   "killed process <pid>" on stdout; one line appended to the kill log in
#   the column order of its header (user, pid, host, date, command).
# Returns:
#   1 when no candidate exists or the kill fails; otherwise the status of
#   mail.
#######################################
function kill_proc() {
  local victim user res pid com

  # Prints "user<TAB>res<TAB>pid<TAB>command" for the biggest candidate.
  victim=$(
    top -n 1 -b -c | awk '
      $1 ~ /^[0-9]+$/ && $2 != "root" {
        res = $6
        if (res ~ /[mM]$/)      size = res * 1024
        else if (res ~ /[gG]$/) size = res * 1024 * 1024
        else if (res ~ /[tT]$/) size = res * 1024 * 1024 * 1024
        else                    size = res * 1
        if (best == "" || size > max) {
          max = size
          best = sprintf("%s\t%.0f\t%s\t%s", $2, size, $1, $12)
        }
      }
      END { if (best != "") print best }
    '
  )

  if [[ -z "$victim" ]]; then
    echo "kill_proc: no candidate process found" >&2
    return 1
  fi
  IFS=$'\t' read -r user res pid com <<<"$victim"

  # Log and notify only when the process was really killed.
  if ! kill -9 "$pid"; then
    echo "kill_proc: failed to kill process $pid" >&2
    return 1
  fi
  echo "killed process $pid"

  printf '%s\t%s\t%s\t%s\t%s\n' \
    "$user" "$pid" "$hostname" "$(date)" "$com" >>"$stat_path/$err_log"

  echo "Dear $user , you process $pid has been killed of $hostname at $(date)" \
    | mail -s "killed  pid warn" admin.list@biomarker.com.cn \
      "${user}@biomarker.com.cn"
}

# CPU usage limit in percent.
cpu_quota=80

#######################################
# Measures the CPU usage between two get_cpu_info samples, appends it to the
# node status file and, when it exceeds cpu_quota, appends a warning and
# runs proc_cpu_top5 under the timeout wrapper.
# Globals:
#   cpu_quota, stat_path, stat_file (read)
#   cpu_usage, cpu_message (written)
# Arguments:
#   $1 - optional seconds between the two samples, default 10
# Returns:
#   0 when usage is within the quota, otherwise the status of the
#   `timeout proc_cpu_top5` call.
#######################################
function watch_cpu() {
  local interval=${1:-10}
  local time_point_1 time_point_2

  time_point_1=$(get_cpu_info)
  sleep "$interval"
  time_point_2=$(get_cpu_info)

  # used = delta of "used"; total = delta of "used + unused".
  # A zero delta is reported as 0 instead of dividing by zero.
  cpu_usage=$(printf '%s %s\n' "$time_point_1" "$time_point_2" | awk '{
    used = $3 - $1
    total = $3 + $4 - $1 - $2
    if (total > 0) print used * 100 / total
    else print 0
  }')
  echo "cpu_usage: $cpu_usage" >>"$stat_path/$stat_file"

  # Numeric comparison in awk: inside [[ ]] the ">" operator compares
  # strings, so "100" was not greater than "80" while "9.5" was.
  if awk -v usage="$cpu_usage" -v quota="$cpu_quota" \
    'BEGIN { exit !(usage + 0 > quota + 0) }'; then
    cpu_message="WARN! The CPU Usage is over than $cpu_quota%"
    echo "cpu_message: $cpu_message" >>"$stat_path/$stat_file"
    #timeout proc_cpu_ps5
    timeout proc_cpu_top5
    #kill_proc
  fi
}

# Test

# timeout proc_cpu_ps5 

# timeout proc_cpu_top5

#######################################
# (Re)writes the node status file with the host summary: ip, host, cpu_num,
# mem_usage and mem_message, one "name: value" line each.
# Globals:
#   stat_path, stat_file, ip, hostname, cpu_num, mem_usage, mem_message
#   (all read)
# Outputs:
#   "make new file successful" on stdout when the file had to be created.
# Returns:
#   Non-zero when the file cannot be written.
#######################################
function update_file() {
  local report="$stat_path/$stat_file"

  # Report the creation only when it actually worked.
  if [[ ! -f "$report" ]] && touch "$report"; then
    echo make new file successful
  fi

  # printf writes the values literally. They used to be spliced into the
  # text of an awk program, so a quote or backslash in a value changed the
  # program instead of being printed.
  {
    printf 'ip: %s \n' "$ip"
    printf 'host: %s\n' "$hostname"
    printf 'cpu_num: %s\n' "$cpu_num"
    printf 'mem_usage: %s%%\n' "$mem_usage"
    printf 'mem_message: %s\n' "$mem_message"
  } >"$report"
}

# Main sequence.
# 1. watch_mem   - computes mem_usage / mem_message and calls kill_proc
#                  itself when memory usage is above mem_quota.
# 2. update_file - writes the host summary (uses the values from step 1).
# 3. watch_cpu   - appends the CPU usage and, if needed, the top consumers.
#
# The former trailing, unconditional kill_proc call was removed: it killed
# the largest process on every run, even when no limit was exceeded.
watch_mem
update_file
watch_cpu