在工作中需要时刻关注服务进程的健康状态或者metrics进程的健康状态,可以监控其服务的端口,下面编写一个监控kafka服务进程和其metrics进程的端口的Python脚本,直接上代码:
import socket
from prometheus_client import start_http_server,Summary
from prometheus_client import start_http_server, Gauge
from prometheus_client.core import CollectorRegistry
from prometheus_client.core import Gauge
import random
import time
# Lookup table: monitored IP address -> host name used as a metric label.
ip_to_hostname = {
    "10.110.1.110": "hostname1",
    "10.110.1.111": "hostname2",
    "10.110.1.112": "hostname3",
}
def check_port(server, port, timeout=1):
    """Return True if a TCP connection to ``server:port`` can be opened.

    Args:
        server: Host name or IPv4 address to connect to.
        port: TCP port number to probe.
        timeout: Seconds to wait for the connection before giving up
            (defaults to 1 second).

    Returns:
        True when the connection succeeds; False when it is refused, times
        out, or the address/port cannot be used at all.
    """
    try:
        # The context manager closes the socket on every path, so a failed
        # connect() releases its descriptor immediately instead of leaving
        # it to the garbage collector.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            sock.connect((server, port))
        return True
    except (OSError, OverflowError, TypeError, ValueError):
        # Network failures (refused, timeout, DNS) and malformed
        # address/port values are all reported as "not reachable".
        return False
def check_servers_ports(servers, ports):
    """Probe every port on every server and collect the results.

    Args:
        servers: Iterable of server addresses to probe.
        ports: Iterable of port numbers to probe on each server.

    Returns:
        A dict keyed by server address. Each value is a dict holding the
        server's host name under the key ``'hostname'`` plus one
        ``port -> bool`` entry per probed port.
    """
    report = {}
    for address in servers:
        # Fall back to "Unknown" when the address has no known host name.
        entry = {'hostname': ip_to_hostname.get(address, "Unknown")}
        entry.update((number, check_port(address, number)) for number in ports)
        report[address] = entry
    return report
# Gauge with one time series per (server, port, hostname) label set; the
# status function below sets it to 1 when the port accepts a connection
# and 0 otherwise.
gauge_metric = Gauge(
    'kafka_server_and_monitor_port',
    'Current status of kafka server and monitor',
    ['server', 'port', 'hostname'],
)
def get_kafka_server_and_monitor_status(servers=None, ports=None):
    """Probe the configured servers/ports and publish the results to the gauge.

    Args:
        servers: Addresses to probe. Defaults to every key of
            ``ip_to_hostname``.
        ports: Ports to probe on each server. Defaults to
            ``[9092, 9100, 9990, 9308]`` (presumably the Kafka broker port
            plus its exporter ports -- confirm against the deployment).

    Returns:
        None. Each (server, port, hostname) series of ``gauge_metric`` is
        set to 1 when the port accepted a connection, otherwise 0.
    """
    if servers is None:
        servers = list(ip_to_hostname)
    if ports is None:
        ports = [9092, 9100, 9990, 9308]

    port_status = check_servers_ports(servers, ports)

    # Diagnostic output on stdout only; the scrape payload itself is
    # produced by the HTTP server started in the script entry point.
    print("# HELP kafka_server_and_monitor_port Current status of kafka server and monitor.")
    print("# TYPE kafka_server_and_monitor_port Gauge")

    for server, results in port_status.items():
        for port, is_open in results.items():
            if port == 'hostname':
                # The host name shares the dict with the port results;
                # it is a label value, not a probe result.
                continue
            gauge_metric.labels(
                server=server,
                port=str(port),
                hostname=results['hostname'],
            ).set(1 if is_open else 0)
if __name__ == "__main__":
    # Expose the gauge through a dedicated registry on HTTP port 1333.
    registry = CollectorRegistry(auto_describe=False)
    registry.register(gauge_metric)
    start_http_server(1333, registry=registry)

    # Re-probe all ports forever, pausing 60 seconds between rounds.
    while True:
        get_kafka_server_and_monitor_status()
        time.sleep(60)
使用nohup在后台启动进程:
nohup python3.6 kafka_port_monitor.py >/dev/null 2>&1 &
最后查看监控指标是否正常输出
curl http://localhost:1333/metrics