这是学习笔记的第 2128 篇文章
今天写了下Consul健康检查的脚本内容,之前更新过一版,可以参见:
我是在上一个版本上面做的更新,对于健康检查来说,我们改进的思路是希望检查的过程是稳定可控的,换句话说,要判断一个数据库是主库还是从库,这个逻辑不是很难写,难就难在这个过程中出现一些异常的时候,检查的逻辑是否健壮,比如网络出现抖动,可能检查的结果就错误了,对于数据库服务来说,基于Consul的域名服务应该是稳定的,在出现故障的时候才应该做出改变,所以对此我们设计了基于ACL的检查和本地自检,两者可以做下互补。
基于Consul的健康检查,我们预期实现两大类功能:第一类是业务数据读写,第二类是读写分离。按照这两个类别,我们进一步划分为三个子类,对应的检查选项(code)分别是 write、mixed_read、read_only(下文脚本中 mixed_read 对应的选项值写作 read)。
第一个子类是业务读写(write),即主库的正常数据写入和读取
第二个子类是读写分离中的混合读(mixed_read),在主库和从库间做查询的负载均衡
第三个子类是读写分离中的从库只读(read_only),在从库侧做查询的负载均衡
所以业务需求是持续变化的,而我们要做的就是根据数据库角色(主库,从库),根据业务的选项(write,mixed_read,read_only)来进行域名服务的状态配置。
这个逻辑用如下的图形来表示:
返回为0代表成功,返回为2代表失败
红色的部分是异常的部分,比如主库不能设置为只读,从库不能设置为可写
如果仔细看这个流程,其实会发现实际的逻辑检查不是很复杂,略微复杂的是从库端的检查。判断是否开启读取的域名时,一个关键的检查就是从库延迟:如果从库延迟过大,这个时候开启读写分离是有问题的。所以我们可以设定一个阈值(比如在1s-10s之间取值)来容忍一定的延时,超出阈值则读服务不可用;如果有多个从库,就可以实现平滑的负载均衡。
而整个流程的检查中,核心的一个逻辑就是判断当前数据库是主库还是从库。
要判断一个数据库是主库还是从库,看起来很简单,但是实际上要让整个流程足够稳定,经得起考验,我们就得设定一定的规范和流程检验。
整个检查逻辑中主从库的检查是按照如下的流程图来设计的:
很多条件都实现了多重条件检查和基于规范的检查。
整个逻辑的部分使用了如下的Shell脚本来完成,感兴趣的可以看一下,后续会做一些微调。
#!/bin/bash
# Consul health-check script for a MySQL instance. The functions below rely on
# bash-only syntax (`function`, `[[ ]]`, `==` inside `[ ]`), hence the explicit
# interpreter line.
#
# Load the login environment before anything else. The per-user profile is
# read first and the system-wide one second, exactly as before; a profile that
# is missing or unreadable is now skipped instead of raising a shell error.
if [ -r ~/.bash_profile ]; then
  source ~/.bash_profile
fi
if [ -r /etc/profile ]; then
  source /etc/profile
fi
#######################################
# Initialise the globals shared by all checks: tool paths, role/option
# constants, MySQL connection settings and the local address of this host.
# Globals read:    dbport (set by the option parser before this is called)
# Globals written: BIN, UBIN, service_level, toppid, cat, wc, curl, echo,
#                  getopt, base64, mysql, db_role_*, check_option_*,
#                  slave_secs_behind_threashold, username, password, dbip,
#                  dbname, netdevs, localmysqlip, localmysqlport, retcode
# Arguments: none
#######################################
function initEnv() {
  BIN=/bin
  UBIN=/usr/bin
  service_level=1
  toppid=$$

  # Resolve helper tools with the shell builtin rather than a hard-coded
  # /usr/bin/which, so a missing `which` binary cannot blank every variable.
  cat=$(command -v cat)
  wc=$(command -v wc)
  curl=$(command -v curl)
  echo=$(command -v echo)
  getopt=$(command -v getopt)
  base64=$(command -v base64)
  mysql=$(command -v mysql)

  ## added
  db_role_master='Master'
  db_role_slave='Slave'
  db_role_Error='Error'
  slave_secs_behind_threashold=10
  check_option_write='write'
  check_option_read='read'
  check_option_read_only='read_only'

  # NOTE(review): credentials are hard-coded here; consider reading them from
  # a protected file or the environment.
  username=dba_admin
  password=xxxxx
  dbip="127.0.0.1"
  dbname="infra"

  # Interfaces that have received traffic (second column > 0), header lines
  # skipped, anything containing "lo" filtered out. The name is everything
  # before the first ':'; split() is used because awk strings are 1-indexed
  # and the former substr($1, 0, ...) loses the last character of the name on
  # awks that follow POSIX (e.g. gawk).
  netdevs=$(${UBIN}/awk 'NR > 2 && $2 > 0 { split($1, name, ":"); print name[1] }' /proc/net/dev \
    | grep -v "lo")

  # Keep the IPv4 address of the last interface that has one (same selection
  # rule as before). Both ifconfig layouts are accepted:
  #   "inet addr:10.0.0.1 ..."  and  "inet 10.0.0.1 ...".
  local netdev_ip
  for netdev in ${netdevs}; do
    netdev_ip=$(/sbin/ifconfig "${netdev}" \
      | ${UBIN}/awk '$1 == "inet" { ip = $2; sub(/^addr:/, "", ip); print ip; exit }')
    if [ -z "${netdev_ip}" ]; then
      continue
    fi
    localmysqlip=${netdev_ip}
    localmysqlport=${dbport}
  done

  # Default result is "failed" until a check proves otherwise.
  retcode=2
}
#initEnv
#######################################
# Print the command-line help to stdout.
# Globals read: cat (optional; path set by initEnv)
# Arguments: none
#######################################
function usage() {
  # usage can be called before initEnv has populated ${cat}; fall back to the
  # plain command so the help text is still printed in that case.
  "${cat:-cat}" <<EOF
Usage: $0 [options]
-h, --help Show this help message.
-a, --appname Give app code
-o, --checkoption Check mode: write, read or read_only
-p, --dbport MySQL port to check
Example:
sh $0 --appname app33 --checkoption write --dbport 3306
EOF
}
#######################################
# Probe the MySQL server with "select 1".
# The client settings are written as a [client] option file on stdin and read
# by mysql through --defaults-file=/dev/stdin.
# Globals read:    username, password, dbip, dbport, mysql
# Globals written: TIMEOUT
# Outputs: whatever the mysql client prints for "select 1" (-N: no header)
#######################################
varifyMysqlConnection() {
  TIMEOUT=5
  printf '[client]\nuser=%s\npassword=%s\nhost=%s\nport=%s\n' \
    "${username}" "${password}" "${dbip}" "${dbport}" \
    | timeout $TIMEOUT ${mysql} --defaults-file=/dev/stdin --protocol=tcp -Ne "select 1"
}
#######################################
# Run one SQL statement against the configured MySQL instance, retrying the
# connection probe up to three times (one second apart).
# Globals read:    username, password, dbip, dbport, mysql
# Globals written: query, TIMEOUT, retcode (0 on success)
# Arguments: $1 - SQL text to execute
# Outputs:   query result on stdout; retry notices on stderr
# Returns:   0 on success, the mysql exit status if the query failed (after
#            calling checkReturn), 1 if the server never answered the probe
#######################################
function remoteMysqlExec() {
  query="$1"
  local attempt=1
  local max_attempts=3
  local rc

  while [ "${attempt}" -le "${max_attempts}" ]; do
    if [ "$(varifyMysqlConnection | wc -l)" -eq 1 ]; then
      TIMEOUT=5
      printf "%s\n" \
        "[client]" \
        "user=${username}" \
        "password=${password}" \
        "host=${dbip}" \
        "port=${dbport}" \
        | timeout "${TIMEOUT}" "${mysql}" --defaults-file=/dev/stdin --protocol=tcp -e "${query}"
      # Capture the status right away: any later command, including the `[`
      # test itself, overwrites $?.
      rc=$?
      if [ "${rc}" -ne 0 ]; then
        checkReturn "${rc}"
        # Do not spin on a failing query if checkReturn comes back.
        return "${rc}"
      fi
      retcode=0
      return 0
    fi
    attempt=$((attempt + 1))
    sleep 1
    # Callers count or grep this function's stdout, so the retry notice has to
    # stay out of it.
    echo "trying ......" >&2
  done
  return 1
}
#######################################
# Extract the value that follows a key in a flat JSON string. This is a
# text-level scan (fields split on ',', ':' and '}'), not a JSON parser.
# Arguments: $1 - JSON text
#            $2 - key name
#            $3 - which occurrence to return (optional, default 1)
# Outputs:   the value with double quotes and surrounding whitespace removed,
#            followed by a newline
#######################################
function getJsonValue()
{
  local json=$1
  local key=$2
  local num=${3:-1}
  local value

  # The key is handed to awk with -v instead of being spliced into the program
  # text, so a key containing whitespace or quotes cannot break the script.
  value=$(printf '%s\n' "${json}" \
    | awk -F'[,:}]' -v key="${key}" \
        '{ for (i = 1; i <= NF; i++) if ($i ~ (key "\"")) print $(i + 1) }' \
    | tr -d '"' \
    | sed -n "${num}p")

  # Trim surrounding whitespace explicitly, then print the value verbatim so
  # it is never glob-expanded.
  value=${value#"${value%%[![:space:]]*}"}
  value=${value%"${value##*[![:space:]]}"}
  printf '%s\n' "${value}"
}
#######################################
# Read the registered master from the local Consul agent's KV store.
# Globals read:    appname, base64 (optional; path set by initEnv)
# Globals written: consulinfo (base64 text of the KV "Value" field),
#                  consulmaster_info (its decoded form)
# Arguments: none
#######################################
function checkConsulMaster() {
  local kv_url="http://127.0.0.1:8500/v1/kv/hbidc/mysql/${appname}/master_info"
  local kv_json
  kv_json=$(curl -s "${kv_url}")
  # Pass the response as ONE argument; unquoted, any whitespace in the JSON
  # would shift the key name out of the second position.
  consulinfo=$(getJsonValue "${kv_json}" Value)
  consulmaster_info=$(printf '%s\n' "${consulinfo}" | "${base64:-base64}" -d)
}
#######################################
# Report whether the connection probe answered with exactly one line.
# Outputs: 'alive' on success, otherwise 'connect to MySQL failed...'
# Exits:   with status 2 when the probe fails
#######################################
check_mysql_alive() {
  if [ "$(varifyMysqlConnection | wc -l)" -eq 1 ]; then
    echo 'alive'
    return
  fi
  echo 'connect to MySQL failed...'
  exit 2
}
#######################################
# Tell whether "show slave hosts" produced any output.
# Globals written: slave_flag (number of output lines)
# Outputs: 'true' when at least one line came back, otherwise 'false'
#######################################
check_slave_hosts() {
  slave_flag=$(remoteMysqlExec "show slave hosts;" | wc -l)
  if (( slave_flag < 1 )); then
    echo 'false'
  else
    echo 'true'
  fi
}
#######################################
# Tell whether "show slave status" produced any output.
# Globals written: slave_status_flag (number of output lines)
# Outputs: 'true' when the statement returned something, otherwise 'false'
#######################################
check_slave_status() {
  slave_status_flag=$(remoteMysqlExec "show slave status;" | wc -l)
  if (( slave_status_flag <= 0 )); then
    echo 'false'
  else
    echo 'true'
  fi
}
#######################################
# Tell whether at least one session of the replication account 'dba_repl'
# appears in "show processlist".
# Globals written: repl_user_flag ('dba_repl' when found, otherwise empty)
# Outputs: 'true' or 'false'
#######################################
function check_slave_repl_user() {
  # Match on the User column only and collapse any number of hits into one
  # token: with several replicas connected the old multi-word result made the
  # `[` test fail with "too many arguments" and report 'false'.
  repl_user_flag=$(remoteMysqlExec "show processlist;" \
    | awk '$2 == "dba_repl" { found = 1 } END { if (found) print "dba_repl" }')
  if [ "${repl_user_flag}" == 'dba_repl' ]; then
    echo 'true'
  else
    echo 'false'
  fi
}
#######################################
# Report the server's read_only setting.
# Globals written: read_only_flag (value column of the read_only variable)
# Outputs: 'true' when read_only is ON, otherwise 'false' (also when the
#          query returned nothing)
#######################################
function check_read_only() {
  read_only_flag=$(remoteMysqlExec "show variables like 'read_only';" \
    | awk '$1 == "read_only" { print $2 }')
  # Quoted so an empty result is compared as an empty string instead of
  # raising a "unary operator expected" error.
  if [ "${read_only_flag}" == 'ON' ]; then
    echo 'true'
  else
    echo 'false'
  fi
}
#######################################
# Report whether the replication I/O thread is running.
# Globals read:    BIN (directory holding awk)
# Globals written: Slave_IO_Running (value from "show slave status\G")
# Outputs: 'true' when the value is exactly 'Yes', otherwise 'false'
#######################################
function check_slave_io_thread() {
  # Select the line whose label is exactly Slave_IO_Running and print what
  # follows the first ':' with leading blanks removed.
  Slave_IO_Running=$(remoteMysqlExec "show slave status\G" \
    | ${BIN}/awk -F: '$1 ~ /^[ \t]*Slave_IO_Running[ \t]*$/ { sub(/^[ \t]+/, "", $2); print $2 }')
  # Quoted so an empty or multi-line result compares as a plain string
  # instead of raising a shell error.
  if [ "${Slave_IO_Running}" == 'Yes' ]; then
    echo 'true'
  else
    echo 'false'
  fi
}
#######################################
# Report whether the replication SQL thread is running.
# Globals read:    BIN (directory holding awk)
# Globals written: Slave_SQL_Running (value from "show slave status\G")
# Outputs: 'true' when the value is exactly 'Yes', otherwise 'false'
#######################################
function check_slave_sql_thread() {
  # The label must match exactly, so Slave_SQL_Running_State is not picked up.
  Slave_SQL_Running=$(remoteMysqlExec "show slave status\G" \
    | ${BIN}/awk -F: '$1 ~ /^[ \t]*Slave_SQL_Running[ \t]*$/ { sub(/^[ \t]+/, "", $2); print $2 }')
  # Quoted so an empty or multi-line result compares as a plain string
  # instead of raising a shell error.
  if [ "${Slave_SQL_Running}" == 'Yes' ]; then
    echo 'true'
  else
    echo 'false'
  fi
}
#######################################
# Print the master host and port this replica is configured to follow.
# Globals read:    BIN (directory holding awk)
# Globals written: master_host, master_port
# Outputs: "<host> <port>" on one line
#######################################
function get_slave_master_info() {
  # Query once and parse both fields from the same output, so host and port
  # always belong to the same snapshot and the server is hit only once.
  local slave_status
  slave_status=$(remoteMysqlExec "show slave status\G")
  master_host=$(printf '%s\n' "${slave_status}" \
    | ${BIN}/awk -F: '$1 ~ /^[ \t]*Master_Host[ \t]*$/ { sub(/^[ \t]+/, "", $2); print $2 }')
  master_port=$(printf '%s\n' "${slave_status}" \
    | ${BIN}/awk -F: '$1 ~ /^[ \t]*Master_Port[ \t]*$/ { sub(/^[ \t]+/, "", $2); print $2 }')
  # Left unquoted on purpose: keeps the original output shape (an empty line
  # when nothing was found, a single space between the two values).
  echo ${master_host} ${master_port}
}
#######################################
# Print the Seconds_Behind_Master value from "show slave status\G".
# Globals read:    BIN (directory holding grep, awk and sed)
# Globals written: Seconds_Behind_Master
# Outputs: the value as reported by the server, one line
#######################################
get_slave_sbm() {
  Seconds_Behind_Master=$(
    remoteMysqlExec "show slave status\G" \
      | ${BIN}/grep -w "Seconds_Behind_Master" \
      | ${BIN}/awk -F: '{print $2}' \
      | ${BIN}/sed 's/^[ \t]*//g'
  )
  echo ${Seconds_Behind_Master}
}
#######################################
# Classify the local MySQL instance.
# Decision table (checks are evaluated lazily, in this order):
#   not alive                                           -> Error
#   has slave hosts, repl user connected, writable      -> Master
#   has slave hosts, anything else                      -> Error
#   has slave status, read-only, IO and SQL running     -> Slave
#   has slave status, anything else                     -> Error
#   no slave hosts, no slave status, read-only          -> Error
#   no slave hosts, no slave status, writable           -> Master
# Outputs: exactly one of 'Master', 'Slave', 'Error'
#######################################
function check_role() {
  # Always print a role: previously an unreachable server produced no output
  # at all (plus a "too many arguments" shell error).
  if [ "$(check_mysql_alive)" != 'alive' ]; then
    echo 'Error'
    return
  fi

  if [ "$(check_slave_hosts)" == 'true' ]; then
    if [ "$(check_slave_repl_user)" == 'true' ] \
      && [ "$(check_read_only)" == 'false' ]; then
      echo 'Master'
    else
      echo 'Error'
    fi
  elif [ "$(check_slave_status)" == 'true' ]; then
    if [ "$(check_read_only)" == 'true' ] \
      && [ "$(check_slave_io_thread)" == 'true' ] \
      && [ "$(check_slave_sql_thread)" == 'true' ]; then
      echo 'Slave'
    else
      echo 'Error'
    fi
  elif [ "$(check_read_only)" == 'true' ]; then
    # Neither a replication source nor a replica, yet read-only.
    echo 'Error'
  else
    echo 'Master'
  fi
}
#######################################
# Turn the detected role plus the requested check option into the final
# health-check verdict. This function always terminates the script.
# Globals read:    checkoption, db_role_master, db_role_slave,
#                  check_option_write, check_option_read,
#                  check_option_read_only, slave_secs_behind_threashold,
#                  time1 (optional log timestamp),
#                  consul_log_dir (optional, default /var/log/consul)
# Globals written: db_role
# Arguments: $1 - role as printed by check_role (Master / Slave / other)
# Exits:     0 when the instance may serve the requested option, 2 otherwise
#######################################
function additional_check() {
  db_role=$1
  local threshold=${slave_secs_behind_threashold:-10}
  # time1 is not assigned anywhere in this script; fall back to the current
  # time so log lines always carry a timestamp.
  local stamp=${time1:-$(date '+%Y-%m-%d %H:%M:%S')}
  local log_file
  log_file="${consul_log_dir:-/var/log/consul}/role_info_$(date '+%Y%m%d').log"
  local sbm_value

  case "${db_role}" in
    "${db_role_master:-Master}")
      case "${checkoption}" in
        "${check_option_write:-write}" | "${check_option_read:-read}")
          echo "${stamp} ===> this is a master server" >> "${log_file}"
          exit 0
          ;;
        "${check_option_read_only:-read_only}")
          echo "Config Error, Master Should use write,read option"
          exit 2
          ;;
        *)
          echo "No check option to be set"
          exit 2
          ;;
      esac
      ;;
    "${db_role_slave:-Slave}")
      case "${checkoption}" in
        "${check_option_write:-write}")
          echo "Config Error, Slave Should use read,read_only option"
          exit 2
          ;;
        "${check_option_read:-read}" | "${check_option_read_only:-read_only}")
          sbm_value=$(get_slave_sbm)
          # The lag can come back as NULL or empty. A non-numeric value must
          # fail the check; it used to make `-gt` error out and take the
          # healthy branch.
          if ! [[ "${sbm_value}" =~ ^[0-9]+$ ]]; then
            echo "Slave Error, Seconds_Behind_Master is not a number: '${sbm_value}'"
            echo "${stamp} ===> Slave Error, Seconds_Behind_Master is not a number: '${sbm_value}'" >> "${log_file}"
            exit 2
          fi
          if [ "${sbm_value}" -gt "${threshold}" ]; then
            echo "Slave Error, Slave delay seconds exceed healthycheck threashold: ${threshold} seconds"
            echo "${stamp} ===> Slave Error, Slave delay seconds exceed healthycheck threashold: ${threshold} seconds" >> "${log_file}"
            exit 2
          fi
          echo "${stamp} ===> this is a slave server" >> "${log_file}"
          exit 0
          ;;
        *)
          echo "No check option to be set"
          exit 2
          ;;
      esac
      ;;
    *)
      # Any other role (including 'Error' and an empty string) used to fall
      # through, letting the script finish with status 0.
      echo "Role check failed, db role is '${db_role}'"
      exit 2
      ;;
  esac
}
#######################################
# Abort the check when a command status is non-zero.
# Globals read:    checkoption
# Globals written: ret
# Arguments: $1 - exit status to inspect
# Outputs:   a one-line reason on stdout when the status is non-zero
# Exits:     2 when the status is non-zero; returns 0 otherwise
#######################################
checkReturn() {
  ret=$1
  # Nothing to do for a zero status.
  [ ${ret} -ne 0 ] || return 0

  case "${checkoption}" in
    read)
      echo "Consul agent or MySQL dead, please check"
      ;;
    write)
      echo "This is not master"
      ;;
    *)
      echo "Check failed and check option not read or write"
      ;;
  esac
  exit 2
}
# ---------------------------------------------------------------------------
# Entry point: parse options, initialise the environment, then decide the
# health-check status. Exit codes: 0 = passing, 1 = usage error, 2 = failing.
# ---------------------------------------------------------------------------

# ${echo} is only populated by initEnv, which runs after option parsing, so
# the plain builtin is used for every message printed before that point.
if [ $# -eq 0 ]; then
  echo "please input a appname"
  usage
  exit 1
fi

# Stop on a parse error instead of continuing with a partial argument list.
GETOPT_ARGS=$(getopt -o a:o:p: -al appname:,checkoption:,dbport: -- "$@") || {
  usage
  exit 1
}
eval set -- "$GETOPT_ARGS"
while [ -n "$1" ]; do
  case "$1" in
    -a | --appname)
      appname=$2
      shift 2
      ;;
    -o | --checkoption)
      checkoption=$2
      shift 2
      ;;
    -p | --dbport)
      dbport=$2
      shift 2
      ;;
    --)
      break
      ;;
    *)
      echo "Unknown option '$1'"
      exit 1
      ;;
  esac
done

# initEnv reads ${dbport}, so it has to run after the options are parsed.
initEnv
checkConsulMaster

# Timestamp used as the log-line prefix; a value already present in the
# environment is kept.
time1=${time1:-$(date '+%Y-%m-%d %H:%M:%S')}
localmysql_info="${localmysqlip}:${localmysqlport}"

# Capture the probe result once; on failure show its message and fail.
alive_state=$(check_mysql_alive)
if [ "${alive_state}" != 'alive' ]; then
  echo "${alive_state}"
  exit 2
fi

# The address stored in Consul's KV matches this host: treat it as the master.
if [ "${localmysql_info}" == "${consulmaster_info}" ]; then
  log_date=$(date "+%Y%m%d")
  echo "$time1 ===> this is a master server" >> /var/log/consul/role_info_${log_date}.log
  exit 0
fi

db_role=$(check_role)
additional_check "${db_role}"
# additional_check exits for every role/option pair it recognises. Getting
# here means the role could not be classified, which must not pass.
exit 2
脚本如果能够看到这里,说明你还是有一点耐心的,里面的设计有一些技巧和细节是很难表述完整的,希望大家在使用的时候也能够提出宝贵意见。
个人新书 《MySQL DBA工作笔记》
个人公众号:jianrong-notes