效果展示
===================== 2022/05/20-09:12:11+0000 =====================
===================== check system =====================
[INFO] [2022/05/20-09:12:11+0000] Hostname: test-master-01
[INFO] [2022/05/20-09:12:11+0000] Ipaddress:
[INFO] [2022/05/20-09:12:11+0000] Os-release: CentOS Linux 7 (Core) GNU/Linux
[INFO] [2022/05/20-09:12:11+0000] Kernel: Linux 3.10.0-1127.19.1.el7.x86_64 x86_64 GNU/Linux
[INFO] [2022/05/20-09:12:11+0000] Up Days: 143 days
[INFO] [2022/05/20-09:12:11+0000] Os Language: en_US.UTF-8
===================== check cpu =====================
[INFO] [2022/05/20-09:12:11+0000] CPU Model: AMD EPYC 7571
[INFO] [2022/05/20-09:12:11+0000] Physical CPUS: 1
[INFO] [2022/05/20-09:12:11+0000] Processor CPUS: 4
[INFO] [2022/05/20-09:12:11+0000] CPU Cores: 2
[INFO] [2022/05/20-09:12:11+0000] Load Average: 0.55 , 0.44 , 0.51
[INFO] [2022/05/20-09:12:11+0000] CPU Utilization: 15.66%
===================== check memory =====================
[INFO] [2022/05/20-09:12:11+0000] Mem Total: 15.14GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Used: 12.07GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Available: 3.07GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Utilization: 79.73%
===================== check disk =====================
[INFO] [2022/05/20-09:12:11+0000] Disk Info:
[INFO] [2022/05/20-09:12:11+0000] /dev/nvme0n1p1 xfs 100G 55G 46G 55% /
[INFO] [2022/05/20-09:12:11+0000] Disk Inode Info:
[INFO] [2022/05/20-09:12:11+0000] /dev/nvme0n1p1 xfs 50M 722K 50M 2% /
===================== check kubernetes =====================
[INFO] [2022/05/20-09:12:11+0000] Apiserver Cert Not After: Mar 16 07:45:06 2023 GMT
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-01 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-02 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-03 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-01 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-02 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-03 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-04 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-05 is Ready
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-01 552m 13% 12590Mi 81%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-02 399m 9% 9644Mi 62%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-03 534m 13% 10336Mi 67%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-01 679m 16% 21175Mi 67%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-02 591m 14% 21119Mi 66%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-03 674m 16% 23677Mi 75%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-04 564m 14% 23123Mi 73%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-05 558m 13% 22760Mi 72%
目录结构
├── config
│ └── conf.sh
└── inspection.sh
config/conf.sh
#!/usr/bin/env bash
# Settings read by inspection.sh, which sources this file at start-up.

# Mount points to inspect, one per line.
disk_lists="
/
/data
"

# Warning thresholds: an integer followed by '%'.
cpu_limit="85%"
mem_limit="85%"
disk_limit="75%"
disk_inode_limit="85%"

# API server certificate to inspect, and the number of remaining days at or
# below which an expiry warning is written.
api_cert_file="/etc/kubernetes/pki/apiserver.crt"
cert_expires="30"

# kubeconfig handed to kubectl through --kubeconfig.
kube_config="/root/.kube/config"
inspection.sh
#!/usr/bin/env bash
# Global settings shared by every function in this script.

# Directory that contains this script. Every expansion is quoted so that a
# path with spaces works, and '&&' keeps a failed cd from silently yielding
# the caller's working directory.
base_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
conf_file="${base_dir}/config/conf.sh"

# One INFO log and one WARN log per calendar day.
log_dir="${base_dir}/logs"
log_file="${log_dir}/$(date +%Y-%m-%d)-INFO.log"
warn_log="${log_dir}/$(date +%Y-%m-%d)-WARN.log"

# Timestamp captured once at start-up, so every line of a run carries the
# same time.
time_style="$(date +%Y/%m/%d-%T%z)"

# df invocation used by the disk checks; the listed filesystem types are
# excluded. Kept as a string because callers expand it unquoted.
df_cmd="df -Th -x devtmpfs -x tmpfs -x debugfs -x aufs -x overlay -x fuse.glusterfs"

# Logs older than tar_time days are archived into "<date tar_time days ago>.tgz".
tar_time=7
tar_dir="$(date +%Y-%m-%d -d "${tar_time} days ago")"
tar_name="${tar_dir}.tgz"
#######################################
# Load the configuration file and fill in defaults for anything it leaves
# unset or empty.
# Globals:   conf_file (read); disk_lists, cpu_limit, mem_limit, disk_limit,
#            disk_inode_limit, api_cert_file, cert_expires, kube_config,
#            kube_cmd (written)
# Outputs:   an error message on stderr when the file is missing
# Returns:   exits the script with status 1 when the file is missing
#######################################
function check_config () {
  if [[ ! -f "${conf_file}" ]]; then
    echo "${conf_file} is not found, please check it !" >&2
    # A missing configuration is a failure; report it through the exit status.
    exit 1
  fi

  # shellcheck source=/dev/null
  source "${conf_file}"

  disk_lists=${disk_lists:-'/'}
  cpu_limit=${cpu_limit:-'85%'}
  mem_limit=${mem_limit:-'85%'}
  disk_limit=${disk_limit:-'75%'}
  disk_inode_limit=${disk_inode_limit:-'85%'}
  api_cert_file=${api_cert_file:-'/etc/kubernetes/pki/apiserver.crt'}
  cert_expires=${cert_expires:-'30'}
  kube_config=${kube_config:-'/root/.kube/config'}
  # Kept as a plain string because callers expand it unquoted to run kubectl.
  kube_cmd="kubectl --kubeconfig ${kube_config}"
}
#######################################
# Abort unless the script is running with uid 0.
# Outputs:   a red error message on stderr when not running as root
# Returns:   exits the script with status 1 when not running as root
#######################################
function check_user () {
  local current_uid
  current_uid="$(id -u)"
  # Compare the numeric uid, not the account name: uid 0 is what grants root.
  if [[ "${current_uid}" != "0" ]]; then
    printf '\e[1;31mPlease use the root to execute this shell !\e[0m\n' >&2
    exit 1
  fi
}
#######################################
# Print a blue, timestamped [INFO] line on the terminal.
# Globals:   time_style (read)
# Arguments: $* - message text
# Outputs:   the formatted line on stdout
#######################################
function print_terminal () {
  # Pass the message as an argument, never as part of the format string, so
  # a '%' in the text is printed literally.
  printf '\e[1;34m[INFO] [%s] %s\e[0m\n' "${time_style}" "${*}"
}
#######################################
# Append a section banner to the INFO log. When the log already exists, a
# separator line holding a single space is written first.
# Globals:   log_file (read)
# Arguments: $* - banner text
#######################################
function print_info_title () {
  # The redirect target is quoted so a log path with spaces works.
  if [[ -f "${log_file}" ]]; then
    printf ' \n' >> "${log_file}"
  fi
  printf '===================== %s =====================\n' "${*}" >> "${log_file}"
}
#######################################
# Append a section banner to the WARN log. When the log already exists, a
# separator line holding a single space is written first.
# Globals:   warn_log (read)
# Arguments: $* - banner text
#######################################
function print_warn_title () {
  # The redirect target is quoted so a log path with spaces works.
  if [[ -f "${warn_log}" ]]; then
    printf ' \n' >> "${warn_log}"
  fi
  printf '===================== %s =====================\n' "${*}" >> "${warn_log}"
}
#######################################
# Write a section banner to the WARN log unless that exact banner line is
# already in it.
# Globals:   warn_log (read)
# Arguments: $* - banner text
#######################################
function check_warn_title () {
  local banner="===================== ${*} ====================="
  # Match the whole banner line as a fixed string. A loose regex search for
  # the bare title could be satisfied by ordinary message text and would
  # then suppress the banner. A missing log file counts as "not present".
  if ! grep -qFx -- "${banner}" "${warn_log}" 2>/dev/null; then
    print_warn_title "${*}"
  fi
}
#######################################
# Append a timestamped [INFO] line to the INFO log.
# Globals:   log_file, time_style (read)
# Arguments: $* - message text
#######################################
function print_info () {
  # Quoted redirect target: an unquoted one fails on paths with spaces.
  printf '%s\n' "[INFO] [${time_style}] ${*}" >> "${log_file}"
}
#######################################
# Append a timestamped [WARN] line to the WARN log.
# Globals:   warn_log, time_style (read)
# Arguments: $* - message text
#######################################
function print_warn () {
  # Quoted redirect target: an unquoted one fails on paths with spaces.
  printf '%s\n' "[WARN] [${time_style}] ${*}" >> "${warn_log}"
}
#######################################
# Prepare the log directory for a new run: create it if needed, move aside
# any log files left by an earlier run today, and start both logs with a
# timestamp banner.
# Globals:   log_dir, log_file, warn_log, time_style (read)
# Returns:   1 when the log directory cannot be created
#######################################
function check_log_dir () {
  local suffix
  mkdir -p -- "${log_dir}" || return 1

  # Earlier logs are kept under "<name>-<HH:MM:SS+zone>" rather than being
  # appended to.
  suffix="$(date +%T%z)"
  if [[ -f "${log_file}" ]]; then
    mv -- "${log_file}" "${log_file}-${suffix}"
  fi
  if [[ -f "${warn_log}" ]]; then
    mv -- "${warn_log}" "${warn_log}-${suffix}"
  fi

  print_info_title "${time_style}"
  print_warn_title "${time_style}"
}
#######################################
# Rotate old logs: pack log files older than tar_time days into
# "${log_dir}/${tar_name}", then delete archives older than tar_time days.
# Globals:   log_dir, tar_time, tar_dir, tar_name (read)
# Outputs:   a progress line on the terminal via print_terminal
#######################################
function check_tar () {
  local staging_dir="${log_dir}/${tar_dir}"
  local archive="${log_dir}/${tar_name}"
  local old_log_count

  # Patterns are quoted so the shell cannot expand them against the current
  # directory. -maxdepth 1 keeps the staging directory out of the search.
  old_log_count="$(find "${log_dir}" -maxdepth 1 -type f \
    -mtime +"${tar_time}" -name '*.log*' | wc -l)"

  if (( old_log_count > 0 )); then
    mkdir -p -- "${staging_dir}"
    # Keep an archive of the same name from an earlier run instead of
    # overwriting it.
    if [[ -f "${archive}" ]]; then
      mv -- "${archive}" "${archive}-$(date +%T%z)"
    fi
    find "${log_dir}" -maxdepth 1 -type f -mtime +"${tar_time}" \
      -name '*.log*' -exec mv -- {} "${staging_dir}/" \; &> /dev/null
    # tar -C avoids changing the script's working directory. The staged
    # logs are removed only once the archive has been written.
    if tar -czf "${archive}" -C "${log_dir}" "${tar_dir}"; then
      rm -rf -- "${log_dir:?}/${tar_dir:?}"
    fi
  fi

  # Archives are written as *.tgz (renamed ones carry a time suffix);
  # *.tar.gz is still matched in case such files exist.
  find "${log_dir}" -maxdepth 1 -type f -mtime +"${tar_time}" \
    \( -name '*.tgz*' -o -name '*.tar.gz' \) -exec rm -f -- {} +

  print_terminal "check logs done"
}
#######################################
# Record basic host facts in the INFO log and enforce two host settings.
# NOTE: this check changes the system. It runs `swapoff -a` when swap is
# active, and stops/disables firewalld when that service is active/enabled.
# Globals:   get_hostname (written), LANG (read)
# Outputs:   INFO/WARN log lines and a progress line on the terminal
#######################################
function check_system () {
  print_info_title 'check system'

  # get_hostname is deliberately not local: the original code exposed it as
  # a global and something else may read it.
  get_hostname="$(cat /etc/hostname)"
  print_info "Hostname: ${get_hostname}"

  # `hostname -i` depends on the host name being resolvable and can come
  # back empty; fall back to the configured addresses in that case.
  local get_host_ip
  get_host_ip="$(hostname -i 2> /dev/null)"
  if [[ -z "${get_host_ip}" ]]; then
    get_host_ip="$(hostname -I 2> /dev/null)"
    get_host_ip="${get_host_ip% }"
  fi
  print_info "Ipaddress: ${get_host_ip}"

  local get_os_release
  get_os_release="$(awk -F '"' '/PRETTY_NAME/ {print $2}' /etc/os-release)"
  print_info "Os-release: ${get_os_release} $(uname -o)"

  local get_kernel
  get_kernel="$(uname -srmo)"
  print_info "Kernel: ${get_kernel}"

  # Whole seconds since boot; default to 0 so the arithmetic cannot fail on
  # an empty value.
  local get_up_secs get_days
  get_up_secs="$(awk -F '.' '{print $1}' /proc/uptime)"
  get_days=$(( ${get_up_secs:-0} / 86400 ))
  print_info "Up Days: ${get_days} days"

  print_info "Os Language: ${LANG:-}"

  # Count the /proc/swaps lines that do not contain "size" (any case), which
  # drops the header line and leaves the active swap areas.
  local swap_count
  swap_count="$(grep -civ size /proc/swaps 2> /dev/null)"
  if [[ "${swap_count:-0}" == "0" ]]; then
    print_info "Swap Status: off"
  else
    check_warn_title 'check system'
    swapoff -a
    # Logged as a warning so the banner written just above is followed by
    # its message in the WARN log, as in the firewalld branches.
    print_warn "Swap Status: manual off"
  fi

  local firewalld_status firewalld_enable
  firewalld_status="$(systemctl is-active firewalld 2> /dev/null)"
  firewalld_enable="$(systemctl is-enabled firewalld 2> /dev/null)"

  if [[ "${firewalld_status}" == "inactive" ]]; then
    print_info "Firewalld Status: dead"
  else
    check_warn_title 'check system'
    systemctl stop firewalld
    print_warn "Firewalld Status: manual dead"
  fi

  case "${firewalld_enable}" in
    disabled)
      print_info "Firewalld Enabled: disabled"
      ;;
    '' | not-found)
      # No unit file to disable: is-enabled printed nothing or "not-found".
      print_info "Firewalld Enabled: not installed"
      ;;
    *)
      check_warn_title 'check system'
      systemctl disable firewalld
      print_warn "Firewalld Enabled: manual disabled"
      ;;
  esac

  print_terminal "check system done"
}
#######################################
# Log CPU inventory, load average and CPU utilisation; when utilisation
# reaches cpu_limit, write a warning followed by the ten busiest processes.
# Utilisation is derived from fields 2, 4 and 5 of the aggregate "cpu" line
# in /proc/stat (user, system and idle per proc(5)). Those counters
# accumulate from boot, so the figure is a since-boot average.
# Globals:   cpu_limit, warn_log (read)
# Outputs:   INFO/WARN log lines and a progress line on the terminal
#######################################
function check_cpu () {
  print_info_title "check cpu"

  local physical_cpus process_cpus core_cpus cpu_model
  physical_cpus="$(grep '^physical id' /proc/cpuinfo | sort -u | wc -l)"
  process_cpus="$(grep -c '^processor' /proc/cpuinfo)"
  core_cpus="$(awk '/^cpu cores/ {cores=$NF} END {print cores}' /proc/cpuinfo)"
  cpu_model="$(awk -F ': ' '/^model name/ {print $2}' /proc/cpuinfo | sort -u)"
  print_info "CPU Model: ${cpu_model}"
  print_info "Physical CPUS: ${physical_cpus}"
  print_info "Processor CPUS: ${process_cpus}"
  print_info "CPU Cores: ${core_cpus}"

  # The first three fields of /proc/loadavg, read in one pass.
  local one_min five_min fif_min
  read -r one_min five_min fif_min _ < /proc/loadavg
  print_info "Load Average: ${one_min} , ${five_min} , ${fif_min}"

  # '%%' is the portable way to print a literal percent sign from awk.
  local cpu_util
  cpu_util="$(awk '/^cpu / {util=($2+$4)*100/($2+$4+$5); printf("%.2f%%", util)}' /proc/stat)"
  print_info "CPU Utilization: ${cpu_util}"

  # Compare the integer part of the utilisation with the limit minus its '%'.
  if [[ "${cpu_util%%.*}" -ge "${cpu_limit%\%}" ]]; then
    local top_cpu_use
    # 11 lines = the ps header plus ten processes.
    top_cpu_use="$(ps -eo user,pid,pcpu,args --sort=-pcpu | head -n 11)"
    check_warn_title 'check cpu'
    print_warn "CPU utilization is ${cpu_util} , it's greater equal ${cpu_limit}, should be check !"
    print_warn "Top 10 CPU Use: "
    printf '%s\n' "${top_cpu_use}" >> "${warn_log}"
  fi

  print_terminal "check cpu done"
}
#######################################
# Log memory totals from /proc/meminfo; when utilisation reaches mem_limit,
# write a warning followed by the ten processes using the most memory.
# "Used" is computed as MemTotal minus MemAvailable.
# Globals:   mem_limit, warn_log (read)
# Outputs:   INFO/WARN log lines and a progress line on the terminal
#######################################
function check_mem () {
  print_info_title "check memory"

  # One awk pass prints "total used available percent"; values are in GiB.
  # '%%' prints a literal percent sign.
  local mem_total mem_used mem_available mem_util
  read -r mem_total mem_used mem_available mem_util < <(
    awk '/MemTotal:/ {total=$2/1024/1024; next}
         /MemAvailable:/ {
           available=$2/1024/1024; use=total-available
           printf("%.2fGiB %.2fGiB %.2fGiB %.2f%%\n", total, use, available, (use/total)*100)
         }' /proc/meminfo
  )
  print_info "Mem Total: ${mem_total}"
  print_info "Mem Used: ${mem_used}"
  print_info "Mem Available: ${mem_available}"
  print_info "Mem Utilization: ${mem_util}"

  # Compare the integer part of the utilisation with the limit minus its '%'.
  if [[ "${mem_util%%.*}" -ge "${mem_limit%\%}" ]]; then
    local top_mem_use
    # 11 lines = the ps header plus ten processes. ps runs only when the
    # listing is actually needed.
    top_mem_use="$(ps -eo user,pid,pmem,args --sort=-pmem | head -n 11)"
    check_warn_title 'check memory'
    print_warn "Mem utilization is ${mem_util}, it's greater equal ${mem_limit}, should be check !"
    print_warn "Top 10 Mem Use: "
    printf '%s\n' "${top_mem_use}" >> "${warn_log}"
  fi

  print_terminal "check memory done"
}
#######################################
# Helper for check_disk: log the df row of each mount point and warn when
# its usage column reaches the limit.
# Arguments: $1 - limit, e.g. '75%'
#            $2 - wording for the warning ("utilization", ...)
#            $3 - df output to search
#            $4... - mount points
#######################################
function _check_disk_rows () {
  local limit=$1 noun=$2 table=$3
  shift 3
  local mount_point row used_pct
  for mount_point in "$@"; do
    # Compare the last column exactly, so '/data' cannot pick up '/mnt/data'.
    row="$(awk -v mp="${mount_point}" '$NF == mp {print; exit}' <<< "${table}")"
    # A mount point that df does not list is skipped; the rest are still checked.
    [[ -n "${row}" ]] || continue
    print_info "${row}"
    used_pct="$(awk '{print $6}' <<< "${row}")"
    # Skip the comparison when the column is not a number (df may print '-').
    [[ "${used_pct%\%}" =~ ^[0-9]+$ ]] || continue
    if [[ "${used_pct%\%}" -ge "${limit%\%}" ]]; then
      check_warn_title 'check disk'
      print_warn "Disk ${mount_point} ${noun} is ${used_pct}, it's greater equal ${limit}, should be check !"
    fi
  done
}

#######################################
# Log block and inode usage for every mount point in disk_lists and warn
# when disk_limit or disk_inode_limit is reached.
# Globals:   disk_lists, disk_limit, disk_inode_limit, df_cmd (read)
# Outputs:   INFO/WARN log lines and a progress line on the terminal
#######################################
function check_disk () {
  print_info_title "check disk"

  # Split the whitespace/newline separated list without glob expansion.
  local -a mount_points=()
  read -r -d '' -a mount_points <<< "${disk_lists}"

  # df_cmd holds a command line in a string, so it is expanded unquoted on
  # purpose. df runs once per table instead of once per mount point.
  print_info "Disk Info: "
  # shellcheck disable=SC2086
  _check_disk_rows "${disk_limit}" "utilization" "$(${df_cmd})" "${mount_points[@]}"

  print_info '---'
  print_info "Disk Inode Info: "
  # shellcheck disable=SC2086
  _check_disk_rows "${disk_inode_limit}" "inode utilization" "$(${df_cmd} -i)" "${mount_points[@]}"

  print_terminal "check disk done"
}
#######################################
# Kubernetes checks: API server certificate expiry, node readiness and
# per-node CPU/memory usage as reported by `kubectl top node`.
# The node checks run only when the kubeconfig file exists.
# Globals:   api_cert_file, cert_expires, kube_config, kube_cmd,
#            cpu_limit, mem_limit (read)
# Outputs:   INFO/WARN log lines and a progress line on the terminal
#######################################
function check_kubernetes () {
  print_info_title "check kubernetes"

  if [[ -f "${api_cert_file}" ]]; then
    local cert_info cert_time_stamp cert_not_after
    # -enddate prints "notAfter=<date>"; strip the prefix to get the date.
    cert_info="$(openssl x509 -in "${api_cert_file}" -noout -enddate 2> /dev/null)"
    cert_info="${cert_info#notAfter=}"
    if [[ -n "${cert_info}" ]] \
      && cert_time_stamp="$(date -d "${cert_info}" +%s 2> /dev/null)"; then
      cert_not_after=$(( (cert_time_stamp - $(date +%s)) / 86400 ))
      print_info "Apiserver Cert Not After: ${cert_info}"
      if [[ "${cert_not_after}" -le "${cert_expires}" ]]; then
        check_warn_title 'check kubernetes'
        print_warn "The apiserver cert will expire in ${cert_expires} days, please renewal !"
      fi
    else
      # Without a parsable date no days-left figure can be computed, so say
      # so instead of reporting a made-up expiry.
      check_warn_title 'check kubernetes'
      print_warn "Cannot read the expiry date of ${api_cert_file}, please check !"
    fi
  fi

  if [[ -f "${kube_config}" ]]; then
    # kube_cmd holds a command line in a string, so it is expanded unquoted
    # on purpose. Each listing is fetched once and matched by exact name;
    # a per-node regex lookup would let "node-1" also match "node-10".
    local node_name node_status
    # shellcheck disable=SC2086
    while read -r node_name node_status _; do
      [[ -n "${node_name}" ]] || continue
      if [[ "${node_status}" == "Ready" ]]; then
        print_info "Node Status: ${node_name} is Ready"
      else
        check_warn_title 'check kubernetes'
        print_warn "Node: ${node_name} is NotReady , please check !"
      fi
    done < <(${kube_cmd} get node --no-headers=true)

    local top_output top_line node_cpu_usage node_mem_usage
    # A failing `kubectl top node` skips this section.
    # shellcheck disable=SC2086
    if top_output="$(${kube_cmd} top node 2> /dev/null)"; then
      while IFS= read -r top_line; do
        # Columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
        read -r node_name _ node_cpu_usage _ node_mem_usage _ <<< "${top_line}"
        [[ -n "${node_name}" && "${node_name}" != "NAME" ]] || continue
        print_info "Top Nodes: ${top_line}"
        # Compare only numeric percentages (a node may report '<unknown>').
        if [[ "${node_cpu_usage%\%}" =~ ^[0-9]+$ ]] \
          && [[ "${node_cpu_usage%\%}" -ge "${cpu_limit%\%}" ]]; then
          check_warn_title 'check kubernetes'
          print_warn "${node_name} top node check: cpu usage is ${node_cpu_usage}, it's greater equal ${cpu_limit}, should be check !"
        fi
        if [[ "${node_mem_usage%\%}" =~ ^[0-9]+$ ]] \
          && [[ "${node_mem_usage%\%}" -ge "${mem_limit%\%}" ]]; then
          check_warn_title 'check kubernetes'
          print_warn "${node_name} top node check: memory usage is ${node_mem_usage}, it's greater equal ${mem_limit}, should be check !"
        fi
      done <<< "${top_output}"
    fi
  else
    print_info "This node's role is the work for kubernetes cluster"
  fi

  print_terminal "check kubernetes done"
}
# Entry point: run every inspection stage once, in a fixed order.
function main () {
  # Preconditions: configuration, privileges, log files.
  check_config
  check_user
  check_log_dir
  check_tar

  # Host checks.
  check_system
  check_cpu
  check_mem
  check_disk

  # Cluster checks.
  check_kubernetes
}

main "$@"