自动重启实现原理
- 官方的clickhouse-watchdog经常失效,不能自动重启
- 通过python脚本定时执行service clickhouse-server status命令检查clickhouse的状态。
- 当检查到进程死亡时,执行service clickhouse-server restart重新启动,并在30秒后再次检查状态确认是否重启成功
python脚本代码
- Java程序员写的,代码有点烂,不过能用
- webhook_url配置飞书的机器人实现告警。
import requests
import subprocess
import sys
import time
# Environment label embedded in every alert text; checkParam overrides it
# with the first command-line argument when one is supplied.
env = "线上环境"
# Feishu bot webhook URL that alerts are POSTed to (token is masked here).
webhook_url = 'https://open.feishu.cn/open-apis/bot/v2/hook/********************'
# Rate-limit switch read by send_alert: when True the alert is only printed,
# not sent. NOTE(review): nothing in the visible code ever sets it to True.
limit = False
def send_alert(message):
    """Build a Feishu text alert for *message* and send it unless rate-limited.

    The alert text contains the module-level ``env`` label, a fixed severity,
    the current local time and *message*. When the module-level ``limit`` flag
    is true the alert is not sent and only a "rate-limited" line is printed.

    :param message: alert body to embed in the text message
    :return: None
    """
    from datetime import datetime

    formatted_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    message_content = {
        "content": {
            "text": f"告警环境:{env}\n告警级别:严重\n告警时间:{formatted_time}\n告警内容:{message}"
        },
        "msg_type": "text"
    }

    if limit:
        print(f"消息发送失败,被限流,{message_content}")
        return

    response = send_message_to_feishu(webhook_url, message_content)
    # send_message_to_feishu returns None (and prints the reason) when the
    # request fails, so only report success when a response came back.
    if response is not None:
        print(f"消息发送成功,{message_content}")
def send_message_to_feishu(webhook_url, message_content, timeout=10):
    """
    Send a message to a Feishu bot.

    :param webhook_url: webhook URL of the Feishu bot
    :param message_content: message payload to send (dict, serialized as JSON)
    :param timeout: seconds to wait for the HTTP request before giving up;
        without it a stalled connection would block the caller indefinitely
    :return: the response object, or None if the request failed
    """
    headers = {'Content-Type': 'application/json'}
    try:
        response = requests.post(
            webhook_url,
            headers=headers,
            json=message_content,
            timeout=timeout,
        )
        # Turn HTTP 4xx/5xx into an exception so it is reported below.
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"消息发送失败: {e}")
        return None
    return response
def check_clickhouse_status():
    """Return True when ``service clickhouse-server status`` reports "running".

    The command's stdout is captured as text and searched for the substring
    ``running``; stderr and the exit code are not inspected.
    """
    status_cmd = ['service', 'clickhouse-server', 'status']
    completed = subprocess.run(
        status_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # NOTE(review): plain substring match - output such as "not running"
    # would also count as running; confirm against the real status output.
    return 'running' in completed.stdout
def restart_clickhouse_server():
    """Run ``service clickhouse-server restart`` and alert on the outcome.

    On success, a "restart result" alert is sent, followed by the command's
    stdout. If the command exits non-zero, the error and its stderr are
    printed and sent as alerts instead.
    """
    restart_cmd = ['service', 'clickhouse-server', 'restart']
    try:
        completed = subprocess.run(
            restart_cmd,
            check=True,  # non-zero exit raises CalledProcessError
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    except subprocess.CalledProcessError as err:
        print(f"Error restarting ClickHouse server: {err}")
        print(f"Error output:\n{err.stderr}")
        send_alert("clickhouse重启报错")
        send_alert(err.stderr)
    else:
        send_alert("clickhouse重启结果")
        send_alert(completed.stdout)
def monitor_clickhouse():
    """Poll ClickHouse every 5 seconds and restart it when it is down.

    When the status check fails, an alert is sent, a restart is issued and
    the status is checked again after 30 seconds. A second alert reports
    whether the restart worked. After a failed restart the loop waits a
    further 60 seconds before the next attempt.

    Any unexpected exception sends a final alert (including the error) and
    ends the loop, so the function returns only when monitoring has stopped.
    """
    while True:
        try:
            healthy = check_clickhouse_status()
            if not healthy:
                send_alert("clickhouse进程死亡,将自动重启clickhouse")
                restart_clickhouse_server()
                # Give the server time to come up before re-checking.
                time.sleep(30)
                healthy = check_clickhouse_status()
                if healthy:
                    send_alert("clickhouse重启成功,请及时关注数据是否正常")
                else:
                    send_alert("clickhouse重启失败,将于1分钟后重试")
                    time.sleep(60)
        except Exception as e:
            # Surface the cause; previously the exception was discarded.
            print(f"ClickHouse monitor error: {e!r}")
            send_alert(f"clickhouse监控报错,监控即将停止: {e}")
            break
        # Only claim health when the latest status check actually passed.
        if healthy:
            print("ClickHouse server is health")
        time.sleep(5)
def checkParam(argv):
    """Override the global ``env`` label from the command line, then print it.

    :param argv: argument list (as in ``sys.argv``); a non-empty ``argv[1]``
        replaces the module-level ``env`` value
    """
    global env
    has_env_arg = len(argv) > 1 and argv[1] != ''
    if has_env_arg:
        env = argv[1]
    print(f"环境为: {env}")
if __name__ == "__main__":
    # Script entry point: pick up the optional environment label from the
    # command line, then run the monitoring loop until it stops itself.
    checkParam(sys.argv)
    monitor_clickhouse()