线上django服务偶尔会因为机器访问mysql过多,造成too many connections 问题,导致服务挂掉。
之前调大了最大连接数,有点治标不治本。
所以今天抽空写个监控mysql连接数的服务,如果连接数超过某个阈值,就杀掉一部分连接。
核心代码。
20191928更新:
之前直接time.sleep有点low,这回换了个Python定时任务的第三方框架——schedule,还挺好用的
参考:
https://schedule.readthedocs.io/en/stable/
https://schedule.readthedocs.io/en/stable/faq.html#what-if-my-task-throws-an-exception
更新后的核心代码如下:
# -*- coding: utf-8 -*-
import getpass
import time
import functools
import MySQLdb
import schedule
from config import *
from log_setup import set_log
if getpass.getuser() == 'wangjinyu':
from loc_config import *
logger = set_log(__name__)
def catch_exceptions(cancel_on_failure=False):
def catch_exceptions_decorator(job_func):
@functools.wraps(job_func)
def wrapper(*args, **kwargs):
try:
return job_func(*args, **kwargs)
except:
import traceback
logger.info(traceback.format_exc())
if cancel_on_failure:
return schedule.CancelJob
return wrapper
return catch_exceptions_decorator
@catch_exceptions()
def handle_process_num():
conn = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PASSWORD)
cur = conn.cursor()
cur.execute('show full processlist')
process_list = list(cur.fetchall())
cur_process_num = len(process_list)
cur.close()
if cur_process_num > MAX_CON_NUM:
logger.info('current process num: %d' % cur_process_num)
logger.info('start kill process')
try:
kill_process(process_list, cur_process_num)
except Exception:
logger.exception('process error ocurr')
def kill_process(process_list, cur_process_num):
killed_num = 0
# order by connected time
process_list.sort(key=lambda x: x[5], reverse=True)
conn = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PASSWORD)
cur = conn.cursor()
# try to kill 1/3 of processes
for i in range(cur_process_num // 2):
command = process_list[i][4]
time = int(process_list[i][5])
# if command is Sleep and Sleep time over 600s then kill process
if command == 'Sleep' and time > MAX_SLEEP_TIME:
try:
cur.execute('kill %s' % process_list[i][0])
except Exception:
logger.exception('kill process %d fail' % process_list[i][0])
continue
killed_num += 1
cur.execute('show full processlist')
post_process_num = len(cur.fetchall())
cur.close()
logger.info('this time kill %d process' % killed_num)
logger.info('remain process num:%d' % post_process_num)
if __name__ == '__main__':
# run job every 30 minutes
schedule.every(INTERVAL_TIME).seconds.do(handle_process_num)
logger.info('mysql monitor service startup success')
while True:
schedule.run_pending()
time.sleep(1)
# -*- coding: utf-8 -*-
import time
import MySQLdb
import getpass
from config import *
from log_setup import set_log
logger = set_log(__name__)
def hanlde_process_num():
conn = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PASSWORD)
cur = conn.cursor()
cur.execute('show full processlist')
process_list = list(cur.fetchall())
cur_process_num = len(process_list)
cur.close()
if cur_process_num > MAX_CON_NUM:
logger.info('current process num: %d' % cur_process_num)
logger.info('start kill process')
try:
kill_process(process_list)
except Exception:
logger.exception('process error ocurr')
def kill_process(process_list):
killed_num = 0
#order by connected time
process_list.sort(key=lambda x:x[5],reverse=True)
conn = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PASSWORD)
cur = conn.cursor()
#try to kill 1/3 of processes
for i in range(MAX_CON_NUM // 3):
Command = process_list[i][4]
Time = int(process_list[i][5])
#if command is Sleep and Sleep time over 600s then kill process
if Command == 'Sleep' and Time > MAX_SLEEP_TIME:
try:
cur.execute('kill %s' % process_list[i][0])
except Exception:
logger.exception('kill process %d fail'%process_list[i][0])
continue
killed_num += 1
cur.execute('show full processlist')
post_process_num = len(cur.fetchall())
cur.close()
logger.info('this time kill %d process' % killed_num)
logger.info('remain process num:%d' % post_process_num)
def sever():
logger.info('mysql monitor startup success')
while True:
try:
hanlde_process_num()
except Exception:
logger.exception("hanlde process num error")
time.sleep(10)
continue
#check mysql connect num every an hour
time.sleep(3600)
if __name__ == '__main__':
sever()