002、单机安装部署 + VIP切换
环境说明
操作系统版本:Red Hat Enterprise Linux Server release 7.2
orchestrator组件环境:
用途 | IP | PORT |
---|---|---|
orchestrator服务器 | 10.0.0.238 | 3000(默认) |
orchestrator元数据库(MYSQL) | 10.0.0.235 | 3311 |
orchestrator启动用户 | dbadmin(涉及vip切换免密配置) |
受管mysql环境说明
用途 | IP | PORT |
---|---|---|
mysql-master | 10.0.0.236 | 3315 |
mysql-slave | 10.0.0.237 | 3315 |
VIP | 10.0.0.163 |
1、orchestrator元数据库配置—(10.0.0.235:3311)
创建数据库和用户
# 建元数据库
create database orch_meta;
# 建用户
create user usr_orch_meta@'%' identified by '********';
grant all on orch_meta.* to usr_orch_meta@'%';
2、orchestrator软件安装—(10.0.0.238)
下载软件并解压安装
下载地址:https://github.com/openark/orchestrator/releases
当前最新版本:orchestrator-3.2.6-linux-amd64.tar.gz
拷贝到238服务器,解压并拷贝到指定目录
tar -xvzf orchestrator-3.2.6-linux-amd64.tar.gz
### 拷贝到根目录下
cp -r usr/ /
cp -r etc/ /
目录文件说明:
配置文件修改
cd /usr/local/orchestrator
# 从样例复制一个配置文件
cp orchestrator-sample.conf.json orchestrator.conf.json
# 修改配置文件
vim orchestrator.conf.json
主要修改的参数如下:
## 拓扑发现使用的用户密码,就是orchestrator管理mysql实例的用户密码
"MySQLTopologyUser": "orc_client_user",
"MySQLTopologyPassword": "orc_client_password",
## orchestrator元数据库的连接信息
"MySQLOrchestratorHost": "127.0.0.1",
"MySQLOrchestratorPort": 3306,
"MySQLOrchestratorDatabase": "orchestrator",
"MySQLOrchestratorUser": "orc_server_user",
"MySQLOrchestratorPassword": "orc_server_password",
## 启用web用户认证
"AuthenticationMethod": "",
"HTTPAuthUser": "",
"HTTPAuthPassword": "",
## 主机名解析方法设置
"HostnameResolveMethod": "default",
"MySQLHostnameResolveMethod": "@@hostname",
# 支持自动恢复的实例组的匹配规则
"RecoverMasterClusterFilters": [
"_master_pattern_"
],
"RecoverIntermediateMasterClusterFilters": [
"_intermediate_master_pattern_"
],
修改后的配置文件如下:
点击查看完整配置文件
{
"Debug": true,
"EnableSyslog": false,
"ListenAddress": ":3000",
"MySQLTopologyUser": "usr_orch_mng",
"MySQLTopologyPassword": "Cbd%661b",
"MySQLTopologyCredentialsConfigFile": "",
"MySQLTopologySSLPrivateKeyFile": "",
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "",
"MySQLTopologySSLSkipVerify": true,
"MySQLTopologyUseMutualTLS": false,
"MySQLOrchestratorHost": "10.0.0.235",
"MySQLOrchestratorPort": 3311,
"MySQLOrchestratorDatabase": "orch_meta",
"MySQLOrchestratorUser": "usr_orch_meta",
"MySQLOrchestratorPassword": "Abc%d98e",
"MySQLOrchestratorCredentialsConfigFile": "",
"MySQLOrchestratorSSLPrivateKeyFile": "",
"MySQLOrchestratorSSLCertFile": "",
"MySQLOrchestratorSSLCAFile": "",
"MySQLOrchestratorSSLSkipVerify": true,
"MySQLOrchestratorUseMutualTLS": false,
"MySQLConnectTimeoutSeconds": 1,
"DefaultInstancePort": 3306,
"DiscoverByShowSlaveHosts": true,
"InstancePollSeconds": 5,
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307"
],
"UnseenInstanceForgetHours": 240,
"SnapshotTopologiesIntervalHours": 0,
"InstanceBulkOperationsWaitTimeoutSeconds": 10,
"HostnameResolveMethod": "none",
"MySQLHostnameResolveMethod": "@@report_host",
"SkipBinlogServerUnresolveCheck": true,
"ExpiryHostnameResolvesMinutes": 60,
"RejectHostnameResolvePattern": "",
"ReasonableReplicationLagSeconds": 10,
"ProblemIgnoreHostnameFilters": [],
"VerifyReplicationFilters": false,
"ReasonableMaintenanceReplicationLagSeconds": 20,
"CandidateInstanceExpireMinutes": 60,
"AuditLogFile": "",
"AuditToSyslog": false,
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
"ReadOnly": false,
"AuthenticationMethod": "basic",
"HTTPAuthUser": "admin",
"HTTPAuthPassword": "123456",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": {
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",
"DetectClusterDomainQuery": "",
"DetectInstanceAliasQuery": "",
"DetectPromotionRuleQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
"DetectSemiSyncEnforcedQuery": "",
"ServeAgentsHttp": false,
"AgentsServerPort": ":3001",
"AgentsUseSSL": false,
"AgentsUseMutualTLS": false,
"AgentSSLSkipVerify": false,
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false,
"UseMutualTLS": false,
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "",
"StatusEndpoint": "/api/status",
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60,
"UnseenAgentForgetHours": 6,
"StaleSeedFailMinutes": 60,
"SeedAcceptableBytesDiff": 8192,
"PseudoGTIDPattern": "",
"PseudoGTIDPatternIsFixedSubstring": false,
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "",
"BinlogEventsChunkSize": 10000,
"SkipBinlogEventsContaining": [],
"ReduceReplicationAnalysisCount": true,
"FailureDetectionPeriodBlockMinutes": 60,
"FailMasterPromotionOnLagMinutes": 0,
"RecoveryPeriodBlockSeconds": 3600,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": [
"*"
],
"RecoverIntermediateMasterClusterFilters": [
"*"
],
"OnFailureDetectionProcesses": [
"echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [
"echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [
"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [
"echo 'failover found!!! postFailover scripts will exec'",
"bash /usr/local/orchestrator/orch_hook.sh"
],
"PostUnsuccessfulFailoverProcesses": [],
"PostMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
"PreventCrossDataCenterMasterFailover": false,
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0,
"PostponeReplicaRecoveryOnLagMinutes": 0,
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "",
"ConsulKVStoreProvider": "consul"
}
启动服务
su - dbadmin
# 进入工作目录(必须)
cd /usr/local/orchestrator
# 前台启动
./orchestrator --config=./orchestrator.conf.json http
# nohup后台运行
nohup ./orchestrator --config=./orchestrator.conf.json http &
3、受管MYSQL配置
mysql安装及主从配置
略
my.cnf添加配置—(10.0.0.236/237都需要配置)
# 实例所在服务器IP,以下以10.0.0.236:3315实例说明
report_host="10.0.0.236"
report_port= 3315
slave_net_timeout = 4
主从配置—(只在从库237上执行)
# 重点修改参数MASTER_CONNECT_RETRY=1/MASTER_RETRY_COUNT=86400
change master to
master_host='10.0.0.236',
master_port=3315,
master_user='repl',
master_password='xxxx',
master_auto_position=1,
MASTER_CONNECT_RETRY=1,
MASTER_RETRY_COUNT=86400;
受管用户创建----(只在主库236上执行,会自动同步到从库)
CREATE USER 'usr_orch_mng'@'%' IDENTIFIED BY 'xxxxx';
GRANT SUPER, PROCESS, REPLICATION SLAVE, RELOAD ON *.* TO 'usr_orch_mng'@'%';
GRANT SELECT ON mysql.slave_master_info TO 'usr_orch_mng'@'%';
4、WEB界面方式发现节点
浏览器访问orchestrator
http://10.0.0.238:3000,看到如下界面即表示服务启动正常。
实例发现
如下图,输入mysql实例的IP和Port即可。前提是在实例上已经建好了'usr_orch_mng'@'%'用户。
发现完成后,查看具体主从拓扑架构。
5、VIP切换配置
以下介绍使用脚本方式切换VIP。
参考:https://www.percona.com/blog/2016/11/03/orchestrator-moving-vips-during-failover/
本说明使用dbadmin用户。
orchestrator服务器和受管服务器创建dbadmin用户
# 样例命令
useradd dbadmin
passwd dbadmin
orchestrator服务器免密到受管MYSQL服务器配置
# orchestrator服务器---10.0.0.238
su - dbadmin
ssh-keygen -t rsa
# 一直回车,生成密钥
# 查看生成的公钥
cd ~/.ssh/
cat id_rsa.pub
# MYSQL受管服务器-----10.0.0.236/237
su - dbadmin
vim ~/.ssh/authorized_keys
# 将上面的公钥粘贴进去即可。
# 执行以下命令测试下------10.0.0.238
ssh dbadmin@10.0.0.236 date
# 配置dbadmin用户具有sudo执行ip切换命令权限------10.0.0.236/237
visudo
dbadmin ALL=(ALL) NOPASSWD: ALL
hook脚本配置
主要用到两个脚本:
脚本名 | 来源 | 说明 |
---|---|---|
orch_hook.py | 自己写的 | 使用python27环境,主要是网上的orch_hook.sh不能实现多套vip的自适配切换,无奈写了一个。 |
orch_vip.sh | percona文章中的github链接 | 通过远程执行ip addr del/add命令卸载/加载vip。修改了一点内容,把发邮件的部分删掉了。 |
拷贝到/usr/local/orchestrator目录下,并赋予执行权限。
orchestrator.conf.json配置文件修改
## 修改后如下
"PostFailoverProcesses": [
"echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log",
"python /usr/local/orchestrator/orch_hook.py {failedHost} {failedPort} {successorHost} {failureType} >> /tmp/recovery.log"
],
orch_hook.py脚本内容:
#!/usr/bin/python
# -*- coding:utf8 -*-
import sys
import os
'''
脚本所需位置参数说明:旧masterIP 旧masterPORT 新masterIP 切换类型
{failedHost} {failedPort} {successorHost} {failureType}
'''
print sys.argv
# 增加受管实例需要修改此部分vip信息
all_vip_info = (
('ip', 'port', 'vip', 'interface'),
('10.0.0.237', '3315', '10.0.0.163', 'eth0'),
('10.0.0.236', '3315', '10.0.0.163', 'eth0')
)
# 基础环境
command_orch_vip = '/usr/local/orchestrator/orch_vip.sh'
ssh_user = 'dbadmin'
log_file = '/tmp/recovery.log'
# 获取脚本参数
v_old_machine_ip = sys.argv[1]
v_old_machine_port = sys.argv[2]
v_new_machine_ip = sys.argv[3]
v_switch_type = sys.argv[4]
# 获取vip
v_vip = ''
v_network_interface = ''
for ip_info in all_vip_info:
if v_old_machine_ip == ip_info[0] and int(v_old_machine_port) == int(ip_info[1]):
v_vip = ip_info[2]
v_network_interface = ip_info[3]
break
# 参数检查
all_params = {
'old_machine_ip': v_old_machine_ip,
'old_machine_port': v_old_machine_port,
'new_machine_ip': v_new_machine_ip,
'switch_type': v_switch_type,
'vip': v_vip,
'network_interface': v_network_interface
}
for k, v in all_params.items():
if not v:
print 'err: not invalid param--{0}'.format(k)
exit(1)
# 切换命令
if v_switch_type == 'DeadMaster':
switch_command = '{0} -d 1 -n {1} -i {2} -I {3} -u {4} -o {5} >> {6}'.\
format(command_orch_vip, v_new_machine_ip, v_network_interface, v_vip, ssh_user, v_old_machine_ip, log_file)
print 'switch commond: {0}'.format(switch_command)
res = os.popen(switch_command)
print 'execute result: {0}'.format(res.readlines())
else:
print 'not supported switch type--{0}'.format(v_switch_type)
exit(1)
orch_vip.sh脚本内容:
#!/bin/bash
# orch_vip.sh - move a VIP between MySQL hosts during orchestrator failover.
# (the mail-notification part of the original Percona script was removed)
emailaddress="email@example.com"
sendmail=0
logfile='/tmp/recovery.log'
# Print command-line help to stdout.
function usage {
cat << EOF
usage: $0 [-h] [-d master is dead] [-o old master ] [-s ssh options] [-n new master] [-i interface] [-I] [-u SSH user]
OPTIONS:
   -h        Show this message
   -o string Old master hostname or IP address
   -d int    If master is dead should be 1 otherwise it is 0
   -s string SSH options
   -n string New master hostname or IP address
   -i string Interface example eth0:1
   -I string Virtual IP
   -u string SSH user
EOF
}
# Parse command-line flags into the globals consumed by the rest of the
# script (orig_master, isitdead, ssh_options, new_master, interface, vip,
# ssh_user). -h prints help and exits 0; unknown flags print help and exit 1.
while getopts ho:d:s:n:i:I:u: opt; do
  case "$opt" in
    o) orig_master="$OPTARG" ;;
    d) isitdead="$OPTARG" ;;
    s) ssh_options="$OPTARG" ;;
    n) new_master="$OPTARG" ;;
    i) interface="$OPTARG" ;;
    I) vip="$OPTARG" ;;
    u) ssh_user="$OPTARG" ;;
    h) usage; exit 0 ;;
    *) usage; exit 1 ;;
  esac
done

# Warn (but do not abort) when the script was invoked with no flags at all.
if [ "$OPTIND" -eq 1 ]; then
  echo "No options were passed"
  usage
fi
shift $(( OPTIND - 1 ))
# Resolve the external tools we need from $PATH.
# 'command -v' is the POSIX-specified replacement for the external 'which'.
ssh=$(command -v ssh)
arping=$(command -v arping)
ip2util=$(command -v ip)
# Remote command templates, executed on the managed hosts via sudo -n
# (non-interactive sudo: fails instead of prompting for a password).
# command for adding our vip
cmd_vip_add="sudo -n $ip2util address add ${vip} dev ${interface}"
# command for deleting our vip (the VIP is always managed as a /32 host address)
cmd_vip_del="sudo -n $ip2util address del ${vip}/32 dev ${interface}"
# command for discovering if our vip is enabled on the interface
cmd_vip_chk="sudo -n $ip2util address show dev ${interface} to ${vip%/*}/32"
# command for sending gratuitous arp to announce ip move
cmd_arp_fix="sudo -n $arping -c 1 -I ${interface} ${vip%/*} "
# command for sending gratuitous arp to announce ip move on current server
cmd_local_arp_fix="sudo -n $arping -c 1 -I ${interface} ${vip%/*} "
# Remove the VIP from the OLD master over ssh.
# The remote one-liner means: if the VIP is present, delete it and flush the
# route cache; otherwise (VIP already absent) re-check and still succeed.
# NOTE(review): rc is intentionally a global here - the caller echoes $rc
# on failure in the online-failover branch below.
vip_stop() {
rc=0
# ensure the vip is removed
$ssh ${ssh_options} -tt ${ssh_user}@${orig_master} \
"[ -n \"\$(${cmd_vip_chk})\" ] && ${cmd_vip_del} && sudo ${ip2util} route flush cache || [ -z \"\$(${cmd_vip_chk})\" ]"
rc=$?
return $rc
}
# Add the VIP on the NEW master over ssh, then announce the move.
# The remote one-liner means: if the VIP is absent, add it and send a
# gratuitous ARP; if it already exists, succeed without changing anything.
# A local gratuitous ARP is also sent from this server afterwards.
vip_start() {
rc=0
# ensure the vip is added
# this command should exit with failure if we are unable to add the vip
# if the vip already exists always exit 0 (whether or not we added it)
$ssh ${ssh_options} -tt ${ssh_user}@${new_master} \
"[ -z \"\$(${cmd_vip_chk})\" ] && ${cmd_vip_add} && ${cmd_arp_fix} || [ -n \"\$(${cmd_vip_chk})\" ]"
rc=$?
$cmd_local_arp_fix
return $rc
}
# Probe whether the VIP is currently reachable.
# Sends one gratuitous ARP for the VIP, then pings it once with a 1s timeout.
# Prints "status 0" and returns 0 when the VIP answers; prints "status 1"
# and returns 1 otherwise.
vip_status() {
$arping -c 1 -I ${interface} ${vip%/*}
if ! ping -c 1 -W 1 "$vip"; then
echo "status 1"
return 1
fi
echo "status 0"
return 0
}
# Main dispatch: -d selects planned failover (0) vs dead-master failover (1).
if [[ $isitdead == 0 ]]; then
# Planned (online) failover: the old master is alive, so the VIP must be
# removed from it before it can be brought up on the new master.
echo "Online failover"
if vip_stop; then
if vip_start; then
echo "$vip is moved to $new_master."
else
echo "Can't add $vip on $new_master!"
exit 1
fi
else
# $rc is the global set inside vip_stop (remote ssh exit status).
echo $rc
echo "Can't remove the $vip from orig_master!"
exit 1
fi
elif [[ $isitdead == 1 ]]; then
# Dead-master failover: only try to remove the VIP if it still answers;
# the old host may be completely down and unreachable over ssh.
echo "Master is dead, failover"
# make sure the vip is not available
if vip_status; then
if vip_stop; then
echo "$vip is removed from orig_master."
else
echo "Couldn't remove $vip from orig_master."
exit 1
fi
fi
if vip_start; then
echo "$vip is moved to $new_master."
else
echo "Can't add $vip on $new_master!"
exit 1
fi
else
# -d was neither 0 nor 1 (or not supplied).
echo "Wrong argument, the master is dead or live?"
fi
6、切换演练
主库shutdown或kill -9 宕掉场景
# 登陆236主库
shutdown
# 观察切换情况,及vip切换日志
手工切换
登陆web界面,拖拽237从库成为236的主库,观察切换情况及vip切换情况。