前提
- Prometheus监控Kubernetes中的各个Pod:
  为了实现对Kubernetes集群中各个Pod的监控,我们可以配置Prometheus来收集和存储这些指标数据。Prometheus提供了丰富的功能,可以直接与Kubernetes集成,并通过Kubernetes的服务发现机制自动发现并监控集群中的Pod。通过配置Prometheus的 `prometheus.yml` 文件,我们可以指定要监控的Kubernetes服务或Pod,以及要收集的指标类型。这样,Prometheus就可以定期拉取这些Pod的指标数据,并存储在其时间序列数据库中,供后续的监控和查询使用。
- Python解释器安装:
  在执行Python脚本之前,首先需要确保系统中已经安装了Python解释器。Python是一种流行的编程语言,有着丰富的第三方库和生态系统,非常适合用于编写各种类型的自动化脚本。通常情况下,Python解释器可以通过操作系统的包管理工具进行安装,或者直接从Python官方网站下载安装包进行安装。安装完成后,即可使用Python命令行或脚本来执行相应的任务。
思路
用Python编写一个脚本来监控指定服务的各项指标(这些指标由Prometheus采集)。当指标超过预设的阈值时,脚本将调用Nacos接口下线该服务实例;在服务负载下降后,脚本将再次调用Nacos接口将该实例重新上线。为了避免多个实例同时下线导致业务中断,脚本会引入适当的延迟,以确保服务平稳下线。这种自动化的流程可以帮助确保服务的稳定性和可用性,同时减少人工干预的需求,提高运维效率。
逻辑
整体步骤如下;最后还会判断是否存在已下线的实例,如果有就通过飞书告警:
- 从prometheus获取pod的CPU状态
- 根据获取到的pod名称到k8s中获取IP地址
- 停止15s 再次执行前两步
- 15s后Pod的CPU仍然过高,则调用Nacos将其下线
- 向Nacos查询被监控服务的实例信息,检查已下线的实例是否不在本次要下线的列表中
- 已下线的实例如果不在该列表中,则重新上线
- 飞书发信息
完整代码
from kubernetes import client, config
import requests,json
import time
import urllib3
urllib3.disable_warnings()
# 1. 连接k8s获取pod的IP
def k8s(host, token, namespace, pod_name):
    """Look up the IP address of a pod through the Kubernetes API.

    Builds a client configuration that targets ``host``, authenticates with
    the bearer ``token`` and has TLS certificate verification switched off,
    installs it as the client library's default configuration, and then
    reads the pod ``pod_name`` in ``namespace``.

    Returns the ``status.pod_ip`` field of the pod object that was read.
    """
    configuration = client.Configuration()
    configuration.host = host
    configuration.verify_ssl = False
    configuration.debug = False
    configuration.api_key = {'authorization': "Bearer " + token}
    client.Configuration.set_default(configuration)

    core_v1 = client.CoreV1Api()
    return core_v1.read_namespaced_pod(pod_name, namespace).status.pod_ip
# 2. 连接prometheus,获取k8s的pod的cpu的大小
def get_cpu_usage(url, formula, timeout=10):
    """Run an instant query against Prometheus and return one value per pod.

    Args:
        url: Base URL of the Prometheus server (no trailing slash), e.g.
            ``http://127.0.0.1:9090``.
        formula: PromQL expression to evaluate. Every series it returns
            must carry a ``pod`` label.
        timeout: Seconds to wait for Prometheus before the request is
            abandoned.

    Returns:
        dict mapping each pod name to its sample value, rounded to the
        nearest integer.

    Raises:
        requests.HTTPError: If Prometheus replies with an error status.
    """
    # Pass the expression through ``params`` so requests percent-encodes it.
    # Pasting raw PromQL into the URL breaks as soon as the expression
    # contains characters such as '+', '&' or '#'.
    response = requests.get(
        f"{url}/api/v1/query",
        params={"query": formula},
        timeout=timeout,
    )
    response.raise_for_status()
    series_list = response.json()["data"]["result"]
    # Index 1 of each "value" pair is the sample itself; it is parsed with
    # float() before rounding.
    return {
        series["metric"]["pod"]: round(float(series["value"][1]))
        for series in series_list
    }
# 3. 连接nacos,可以让服务实现上线下线操作
def nacos_up_down(url, query_params):
    """Issue a PUT request to ``url`` with ``query_params`` as query string.

    The same call is used to take an instance online or offline; which of
    the two happens depends entirely on the parameters the caller supplies.

    Raises ``requests.HTTPError`` when the reply has an error status.
    """
    reply = requests.put(url, params=query_params)
    reply.raise_for_status()
# 4.飞书发送告警
def send_feishu_message(url, message):
    """POST ``message`` as a text message to the webhook at ``url``.

    The payload is a JSON document with ``msg_type`` set to ``text`` and
    the message stored under ``content.text``.

    Returns the body of the HTTP reply as text. The status code is not
    checked.
    """
    payload = json.dumps({"msg_type": "text", "content": {"text": message}})
    reply = requests.post(
        url,
        headers={'Content-Type': 'application/json;charset=utf-8'},
        data=payload,
    )
    return reply.text
# 4. 睡眠15秒,二次判断监控信息
def monitor_cpu(api_server, expr, Judge, recheck_delay=15):
    """Return the watched pods that match ``expr`` on two consecutive checks.

    The query is evaluated once, the pods belonging to a watched deployment
    are remembered, and after ``recheck_delay`` seconds the query is
    evaluated again. Only pods present in both results are reported, so a
    short spike does not take an instance offline.

    Args:
        api_server: Base URL of the Prometheus server.
        expr: PromQL expression passed to ``get_cpu_usage``; the pods it
            returns are treated as overloaded.
        Judge: List of single-entry dicts of the form
            ``{deployment_name: [nacos_service_name, nacos_port]}``.
        recheck_delay: Seconds to wait between the two checks.

    Returns:
        list of ``[nacos_service_name, nacos_port, pod_name]`` entries, in
        the order the pods appear in the second query result.
    """
    first_sample = get_cpu_usage(api_server, expr)
    suspects = []
    for pod in first_sample:
        for judge_data in Judge:
            for k8s_deploy, nacos_data in judge_data.items():
                # A pod is attributed to a deployment when the deployment
                # name occurs anywhere in the pod name.
                if k8s_deploy in pod:
                    suspects.append([nacos_data[0], nacos_data[1], pod])

    if not suspects:
        # Nothing to confirm: skip the wait and the second query.
        return []

    time.sleep(recheck_delay)
    second_sample = get_cpu_usage(api_server, expr)
    return [
        suspect
        for podname in second_sample
        for suspect in suspects
        if podname == suspect[2]
    ]
# 5.nacos操作所需信息
def nacos_data_list(K8s_host, K8s_token, K8s_ns, mon_CPU):
    """Replace the pod name of every flagged entry with the pod's IP.

    Args:
        K8s_host: URL of the Kubernetes API server.
        K8s_token: Bearer token used to authenticate against it.
        K8s_ns: Namespace the pods live in.
        mon_CPU: Iterable of ``[nacos_service_name, nacos_port, pod_name]``
            entries, as produced by ``monitor_cpu``.

    Returns:
        list of ``[nacos_service_name, nacos_port, pod_ip]`` entries. Pods
        for which no IP is reported are left out, because the result is
        used to address instances by IP.
    """
    data_list = []
    for entry in mon_CPU:
        service_name, port, pod_name = entry[:3]
        pod_ip = k8s(
            host=K8s_host,
            token=K8s_token,
            namespace=K8s_ns,
            pod_name=pod_name,
        )
        if not pod_ip:
            # No address assigned: nothing that could be switched by IP.
            continue
        data_list.append([service_name, port, pod_ip])
    return data_list
# 6.nacos下线操作
def nacos_down(NA_U, NA_I, NA_IF):
    """Disable every listed instance through the Nacos instance endpoint.

    Args:
        NA_U: URL of the Nacos instance endpoint.
        NA_I: Template of request parameters (group, namespace, cluster,
            credentials, ...). It is copied for every request and never
            modified.
        NA_IF: Iterable of ``[service_name, port, ip]`` entries naming the
            instances to disable.

    Raises:
        requests.HTTPError: Propagated from ``nacos_up_down`` when Nacos
            rejects a request.
    """
    # NOTE(review): instances are disabled back-to-back, with no pause and
    # no check that at least one instance of a service stays enabled.
    for nacos_info in NA_IF:
        params = {
            **NA_I,
            'enabled': 'false',
            'serviceName': nacos_info[0],
            'ip': nacos_info[2],
            'port': nacos_info[1],
        }
        nacos_up_down(NA_U, params)
# 7.nacos信息查询
def nacos_list(NA_U, NA_I, NA_IF):
    """Fetch the instance listing of every watched Nacos service.

    Args:
        NA_U: URL of the Nacos endpoint that lists instances.
        NA_I: Template of query parameters (paging, group, namespace,
            credentials, ...). It is copied for every request and never
            modified.
        NA_IF: List of dicts of the form
            ``{deployment_name: [nacos_service_name, nacos_port]}``; only
            the service name is used here.

    Returns:
        dict mapping each Nacos service name to the decoded JSON reply for
        that service.

    Raises:
        requests.HTTPError: If Nacos replies with an error status. Failing
            here keeps an error payload from being handed on as if it were
            an instance listing.
    """
    nacos_dict = {}
    for nacos_server in NA_IF:
        for nacos_data in nacos_server.values():
            service_name = nacos_data[0]
            response = requests.get(
                NA_U,
                params={**NA_I, 'serviceName': service_name},
            )
            response.raise_for_status()
            nacos_dict[service_name] = response.json()
    return nacos_dict
# 8.nacos上线操作
def nacos_up(NA_IQ, NA_IF, NA_I, NA_U):
    """Re-enable disabled instances that are not currently overloaded.

    Args:
        NA_IQ: dict mapping service name to a listing whose ``list`` entry
            holds instance dicts with ``enabled``, ``ip``, ``port`` and
            ``serviceName`` keys (the result of ``nacos_list``).
        NA_IF: Iterable of ``[service_name, port, ip]`` entries for the
            instances that were just taken offline; their IPs are skipped.
        NA_I: Template of request parameters. It is copied for every
            request and never modified.
        NA_U: URL of the Nacos instance endpoint.

    Raises:
        requests.HTTPError: Propagated from ``nacos_up_down`` when Nacos
            rejects a request.
    """
    # NOTE(review): every disabled instance whose IP is not in NA_IF is
    # re-enabled, whatever the reason it was disabled for.
    overloaded_ips = {nacos_info[2] for nacos_info in NA_IF}
    for listing in NA_IQ.values():
        for instance in listing['list']:
            if instance.get("enabled") is not False:
                continue
            if instance.get("ip") in overloaded_ips:
                continue
            # Listed names look like "<group>@@<service>"; keep only the
            # service part and tolerate a name without the group prefix.
            service_name = instance.get("serviceName").split('@@', 1)[-1]
            params = {
                **NA_I,
                'enabled': 'true',
                'serviceName': service_name,
                'ip': instance.get("ip"),
                'port': instance.get("port"),
            }
            nacos_up_down(NA_U, params)
# 9.nacos查询信息,飞书告警
def nacos_feishu(NA_IQ, WU):
    """Send one Feishu alert per service that has disabled instances.

    Args:
        NA_IQ: dict mapping service name to a listing with a ``count``
            entry and a ``list`` of instance dicts carrying ``enabled``
            and ``serviceName`` keys (the result of ``nacos_list``).
        WU: Feishu webhook URL the alert is posted to.

    Services whose instances are all enabled, or whose instance list is
    empty, produce no message.
    """
    for listing in NA_IQ.values():
        count = listing['count']
        instances = listing["list"]
        up = sum(1 for instance in instances if instance.get("enabled") is True)
        down = len(instances) - up
        if down == 0:
            continue

        # Build the message only when it will be sent. ``down > 0`` means
        # the list is non-empty, so indexing the last instance is safe.
        # Listed names look like "<group>@@<service>".
        full_name = instances[-1].get("serviceName")
        NS, separator, server_name = full_name.partition('@@')
        if not separator:
            NS, server_name = '', full_name
        message = f"\n环境:{NS}\n服务名:{server_name}\n总实例:{count}\n上线:{up}\n下线:{down}"
        send_feishu_message(WU, message)
if __name__ == "__main__":
    # ---- Shared settings ----
    namespace = "test"
    # CPU usage threshold, in percent of the pod's CPU quota (see query).
    cpu_threshold = 85
    # Watched workloads, one dict per deployment:
    #   {k8s deployment name: [nacos service name, nacos service port]}
    Judge = [
        {"aaaaaaaa-test-server": ['a-test', '8080']},
        {'bbbbbbbb-test-server': ['b-test', '8181']},
    ]

    # ---- Kubernetes ----
    host = 'https://127.0.0.1:6443'
    token = 'xxxxxxxxxx'

    # ---- Prometheus ----
    ProURL = 'http://127.0.0.1:9090'
    # Per-pod CPU usage divided by the pod's CPU quota, filtered to the
    # pods above the threshold.
    Proformula = 'sum(irate(container_cpu_usage_seconds_total{namespace=~"%s",image!=""}[5m])*100)by(namespace,pod)/sum(container_spec_cpu_quota{namespace=~"%s",image!=""}/container_spec_cpu_period{namespace=~"%s",image!=""})by(namespace,pod) > %s' % (namespace, namespace, namespace, cpu_threshold)

    # ---- Nacos ----
    name_namspace = 'test'
    nacos_user = "nacos"
    nacos_pass = "nacos"
    nacos_url = "http://nacos.test.com.cn:8848/nacos/v1/ns/instance"
    nacos_list_url = "http://nacos.test.com.cn:8848/nacos/v1/ns/catalog/instances"
    # Parameter template for switching an instance on or off.
    nacos_info = {
        "serviceName": "",
        "groupName": name_namspace,
        "namespaceId": name_namspace,
        "ip": "",
        "port": "",
        "clusterName": "DEFAULT",
        # "true" brings the instance online, "false" takes it offline.
        "enabled": "false",
        "username": nacos_user,
        "password": nacos_pass,
    }
    # Parameter template for listing the instances of a service.
    # NOTE: only the first page of 10 instances is requested.
    nacos_list_info = {
        "pageSize": "10",
        "pageNo": "1",
        "serviceName": "",
        "groupName": name_namspace,
        "namespaceId": name_namspace,
        "clusterName": "DEFAULT",
        "username": nacos_user,
        "password": nacos_pass,
    }

    # ---- Feishu bot ----
    webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx"

    # ---- Workflow ----
    # Argument naming used by the helpers:
    #   NA_IQ = instance listings queried from Nacos
    #   NA_IF = instances to operate on ([service, port, ip])
    #   NA_I  = request parameter template
    #   NA_U  = Nacos endpoint URL

    # 1. Find the pods that stay above the threshold and resolve their IPs.
    mon_CPU = monitor_cpu(ProURL, Proformula, Judge)
    nacos_information = nacos_data_list(
        K8s_host=host,
        K8s_token=token,
        K8s_ns=namespace,
        mon_CPU=mon_CPU,
    )
    time.sleep(1)

    # 2. Take those instances offline.
    nacos_down(NA_U=nacos_url, NA_I=nacos_info, NA_IF=nacos_information)

    # 3. Bring back the disabled instances that are not in the list above.
    nacos_inquire = nacos_list(NA_U=nacos_list_url, NA_I=nacos_list_info, NA_IF=Judge)
    nacos_up(
        NA_U=nacos_url,
        NA_I=nacos_info,
        NA_IF=nacos_information,
        NA_IQ=nacos_inquire,
    )
    time.sleep(1)

    # 4. Query again and alert on every service that still has instances
    #    offline.
    nacos_inquire_two = nacos_list(NA_U=nacos_list_url, NA_I=nacos_list_info, NA_IF=Judge)
    nacos_feishu(NA_IQ=nacos_inquire_two, WU=webhook_url)