#!/bin/bash
#######################################
# Abort the script when a component's health probe has failed.
# Arguments:
#   $1 - human-readable component name, used in the failure message
#   $2 - exit status of the probe command (0 means healthy)
# Outputs:
#   On failure, writes a timestamped message to stdout.
# Returns:
#   0 when the status is 0; otherwise it does not return and exits the
#   shell with status 1.
#######################################
check_status() {
  # Function-scoped so that calls do not overwrite same-named globals.
  local component="$1"
  local status_code="$2"

  if [ "$status_code" -ne 0 ]; then
    echo "[$(date)] $component is not working properly. Exit code: $status_code"
    exit 1
  fi
}
# Check the API server.
echo "Checking Kubernetes API server..."
# Ask the API server for its own readiness instead of listing
# componentstatuses: that resource is deprecated (since Kubernetes v1.19),
# does not describe the API server itself, and "status" is not one of its
# documented field selectors.
kubectl get --raw='/readyz' > /dev/null 2>&1
check_status "Kubernetes API server" "$?"
# Check the etcd cluster.
echo "Checking etcd cluster..."
# Connection settings can be supplied through the environment. The defaults
# are the template placeholders and must be replaced with real values before
# this probe can succeed.
etcd_endpoints="${ETCD_ENDPOINTS:-https://[ETCD_ENDPOINTS]}"
etcd_cacert="${ETCD_CACERT:-/path/to/etcd/ca.crt}"
etcd_cert="${ETCD_CERT:-/path/to/etcd/client.crt}"
etcd_key="${ETCD_KEY:-/path/to/etcd/client.key}"
# The v3 etcdctl names its per-command timeout --command-timeout; --timeout
# is the v2 spelling and is rejected as an unknown flag under ETCDCTL_API=3.
ETCDCTL_API=3 etcdctl \
  --endpoints="$etcd_endpoints" \
  --cacert="$etcd_cacert" \
  --cert="$etcd_cert" \
  --key="$etcd_key" \
  endpoint health --command-timeout=3s > /dev/null 2>&1
check_status "etcd cluster" "$?"
# Check the controller manager.
echo "Checking Kubernetes controller manager..."
# kubectl exits 0 even when the selector matches no pods, so capture the names
# of matching pods in phase Running and treat an empty list as a failure.
# NOTE(review): kubeadm labels control-plane static pods with
# "component=kube-controller-manager" - confirm this selector fits the cluster.
controller_manager_pods="$(kubectl get pods --namespace=kube-system --selector="k8s-app=kube-controller-manager" --field-selector=status.phase=Running --output=name 2> /dev/null)"
controller_manager_rc=$?
if [ "$controller_manager_rc" -eq 0 ] && [ -z "$controller_manager_pods" ]; then
  controller_manager_rc=1
fi
check_status "Kubernetes controller manager" "$controller_manager_rc"
# Check the scheduler.
echo "Checking Kubernetes scheduler..."
# kubectl exits 0 even when the selector matches no pods, so capture the names
# of matching pods in phase Running and treat an empty list as a failure.
# NOTE(review): kubeadm labels control-plane static pods with
# "component=kube-scheduler" - confirm this selector fits the cluster.
scheduler_pods="$(kubectl get pods --namespace=kube-system --selector="k8s-app=kube-scheduler" --field-selector=status.phase=Running --output=name 2> /dev/null)"
scheduler_rc=$?
if [ "$scheduler_rc" -eq 0 ] && [ -z "$scheduler_pods" ]; then
  scheduler_rc=1
fi
check_status "Kubernetes scheduler" "$scheduler_rc"
# Check the DNS service.
echo "Checking Kubernetes DNS service..."
# kubectl exits 0 even when no Service carries the label, so require at least
# one matching Service name in the output.
dns_services="$(kubectl get services --namespace=kube-system --selector="k8s-app=kube-dns" --output=name 2> /dev/null)"
dns_rc=$?
if [ "$dns_rc" -eq 0 ] && [ -z "$dns_services" ]; then
  dns_rc=1
fi
check_status "Kubernetes DNS service" "$dns_rc"
# Check the system version.
echo "Checking system version..."
# Test grep's own exit status: it is non-zero when /etc/os-release is missing
# or has no PRETTY_NAME entry. A cat | grep | cut pipeline would report only
# cut's status, which is 0 even when the earlier stages fail.
grep -q PRETTY_NAME /etc/os-release > /dev/null 2>&1
check_status "system version" "$?"
# Check network connectivity.
echo "Checking network connection..."
# The probe host can be overridden with PING_TARGET for environments where
# the default host is unreachable by policy; the default is unchanged.
ping_target="${PING_TARGET:-google.com}"
ping -c 1 "$ping_target" > /dev/null 2>&1
check_status "network connection" "$?"
# Reached only when every probe above passed (check_status exits on failure).
printf '%s\n' 'All components are healthy, and the system is connected to the internet.'