目前tcp长连接应用使用的方案是nginx(ngx_stream_core_module),静态配置;自带的健康检查只能基于端口,但应用有时程序错误或假死,端口是OK的,导致客户端不少报错,因此新的方案要求可以动态配置路由,由监控检查程序管理后端节点的动态配置。
http动态路由(也叫动态upstream)方案较多,有现成的插件,或者lua对这块也支持得较好;但tcp的方案相对少,充分研究了下,总结下以下两个靠谱的方案。
1. Nginx、openresty下的lua库、自己编写lua脚本
实现功能:
- Nginx启动时将server配置初始化至redis, hash结构:{ip1:0, ip2:0, ip3:0,……}(0:OK,1:不可用);
- 每个请求都从redis中获取server,轮询方式,redis支持连接池;
- 访问redis连接失败,或者访问的key中无可用服务器(hash为空,或value全部是1),则用本地静态配置;
- 某个server不能访问(ip或端口不通,或太繁忙无法访问),会try其他节点,try次数可配;
- 未实现和静态配置一样的端口健康检查自动踢出的机制(两个原因:1、会与外部独立的监控检查程序冲突;2、需要在nginx init阶段起定时任务,但init阶段不支持socket模块访问redis,没有找到合适的方案)
脚本说明:
高并发场景需要关注变量的使用(尽可能不使用全局变量)和逻辑的处理,尤其轮询、失败重试两个功能。
stream {
# Search path for Lua modules (e.g. resty.redis).
lua_package_path "/usr/local/lib/lua/?.lua;;";
# Shared dict holding the global connection counter "dhq_conn_cnt"
# (lua_shared_dict storage is shared across all worker processes).
lua_shared_dict dict_ups 4m;
# Shared dict holding per-connection retry counters, keyed "conn<N>".
lua_shared_dict dict_try 10m;
# Declared but not referenced in the Lua blocks below -- presumably reserved
# for locking the rotation state; TODO confirm before removing.
lua_shared_dict rotate_lock 100k;
# Variables written from Lua and consumed by proxy_pass / log_format.
lua_add_variable $dhq_proxypass;
lua_add_variable $backend_server;
lua_add_variable $try_cnt;
lua_add_variable $dhq_conn_cnt;
log_format main '$remote_addr [$time_local] $server_addr:$server_port $dhq_proxypass $dhq_conn_cnt $try_cnt $backend_server $status';
access_log logs/access.log main;
# Static fallback pool, used when redis is unreachable or lists no usable
# server (preread sets $dhq_proxypass to "mytcp_static" in those cases).
upstream mytcp_static {
server 10.40.20.201:22 max_fails=3 fail_timeout=5s;
server 10.40.20.202:22 max_fails=3 fail_timeout=5s;
server 10.40.20.203:22 max_fails=3 fail_timeout=5s;
server 10.40.20.204:22 max_fails=3 fail_timeout=5s;
}
upstream mytcp_lua {
    # Placeholder address only; the real peer is chosen at runtime in
    # balancer_by_lua_block below.
    server 1.1.1.1:1111;
    balancer_by_lua_block {
        -- Use the backend picked during preread; on connect failure,
        -- round-robin over the remaining healthy servers.
        local backend_port = 22
        local try_cnt = ngx.shared.dict_try:get("conn" .. ngx.var.dhq_conn_cnt)
        -- BUGFIX: get() returns nil when the key is missing or was evicted;
        -- comparing nil with a number raises a Lua error and aborts the
        -- connection. Treat a missing counter as the first attempt.
        if try_cnt == nil then
            try_cnt = 1
        end
        -- Hard cap on retries so a fully dead pool cannot loop forever.
        if try_cnt > 16 then
            return
        end
        local balancer = require "ngx.balancer"
        balancer.set_timeouts(3, 3, 3)
        balancer.set_more_tries(4)
        -- g_ups_cur_dhq / g_ups_dhq_active_table are worker-level globals
        -- written by preread_by_lua_block.
        -- NOTE(review): a concurrent connection in the same worker can
        -- overwrite them between the preread and balancer phases; consider
        -- per-connection state instead -- TODO confirm.
        if g_ups_cur_dhq then
            local state_name, status_code = balancer.get_last_failure()
            if state_name == nil then
                -- First attempt: use the server selected during preread.
                balancer.set_current_peer(g_ups_cur_dhq, backend_port)
                ngx.var.backend_server = g_ups_cur_dhq .. ":" .. backend_port
            else
                -- A previous attempt failed: rotate through the active list.
                local table_len = #g_ups_dhq_active_table
                -- Guard: (x - 1) % 0 is NaN and would index nil.
                if table_len == 0 then
                    ngx.log(ngx.ERR, "[error]: no server in upstream. ")
                    return
                end
                local ups_cur_dhq = g_ups_dhq_active_table[(try_cnt - 1) % table_len + 1]
                balancer.set_current_peer(ups_cur_dhq, backend_port)
                ngx.var.backend_server = ups_cur_dhq .. ":" .. backend_port
                try_cnt = try_cnt + 1
                ngx.shared.dict_try:set("conn" .. ngx.var.dhq_conn_cnt, try_cnt)
            end
            ngx.var.try_cnt = try_cnt
        else
            ngx.log(ngx.ERR, "[error]: no server in upstream. ")
            return
        end
    }
}
server {
    listen 12345;
    proxy_connect_timeout 3s;
    proxy_timeout 120s;
    proxy_next_upstream_tries 5;
    preread_by_lua_block {
        -- Seed list of backends, pushed to redis on the first connection so
        -- the external health-check program can flip per-server flags
        -- (value "0" = usable, "1" = down).
        -- BUGFIX: declared local -- it was an accidental global shared by
        -- every request in the worker.
        local ups_dhq_table = {}
        ups_dhq_table["10.40.20.201"] = 0
        ups_dhq_table["10.40.20.202"] = 0
        ups_dhq_table["10.40.20.203"] = 0
        -- BUGFIX: was 10.40.30.204, inconsistent with upstream mytcp_static;
        -- presumably a typo -- confirm against the real server list.
        ups_dhq_table["10.40.20.204"] = 0
        local ups_name = "mytcp"
        local redis_ups_key = "upstream_denghaoqi"
        ngx.var.dhq_proxypass = ups_name .. "_lua"
        -- Returns a pooled redis connection, or nil after switching the
        -- route to the static upstream and logging the failure.
        local function func_get_redis()
            local Redis = require "resty.redis"
            local redis = Redis:new()
            -- BUGFIX: the option name is "backlog"; the misspelled
            -- "blck_log" was silently ignored by resty.redis.
            local pool_options = { pool_size = 300, backlog = 20000 }
            redis:set_timeout(3000)
            local ok, err = redis:connect("10.40.16.45", 36379, pool_options)
            if not ok then
                ngx.var.dhq_proxypass = ups_name .. "_static"
                ngx.log(ngx.ERR, "connect to redis failed, ", err)
                return
            end
            return redis
        end
        -- Monotonic connection counter (shared dict, initialized to 0):
        -- drives the round-robin index and keys the retry counter used by
        -- balancer_by_lua_block.
        local dhq_conn_cnt, err = ngx.shared.dict_ups:incr("dhq_conn_cnt", 1, 0, 0)
        -- BUGFIX: incr() can fail (e.g. dict exhausted); assigning nil to
        -- ngx.var would raise, so fall back to the static pool instead.
        if not dhq_conn_cnt then
            ngx.var.dhq_proxypass = ups_name .. "_static"
            ngx.log(ngx.ERR, "incr dhq_conn_cnt failed, ", err)
            return
        end
        ngx.var.dhq_conn_cnt = dhq_conn_cnt
        -- Sync the seed list to redis exactly once, on the first connection
        -- after nginx starts.
        if dhq_conn_cnt == 1 then
            local redis = func_get_redis()
            if redis == nil then
                return
            end
            local ok, err = redis:del(redis_ups_key)
            if not ok then
                ngx.log(ngx.ERR, "del redis key failed, ", err)
            end
            ok, err = redis:hmset(redis_ups_key, ups_dhq_table)
            if not ok then
                ngx.log(ngx.ERR, "hmset redis key failed, ", err)
            end
            -- Return the connection to the pool rather than closing it.
            redis:set_keepalive(30000, 300)
        end
        -- Pick a server in rotation from the current redis state.
        local redis = func_get_redis()
        if redis == nil then
            return
        end
        -- BUGFIX: check hgetall's error return before array_to_hash; the
        -- original passed a possibly-nil reply straight into array_to_hash,
        -- which raises instead of falling back to the static pool.
        local res, err = redis:hgetall(redis_ups_key)
        if not res then
            ngx.var.dhq_proxypass = ups_name .. "_static"
            ngx.log(ngx.ERR, "get redis key failed. ")
            return
        end
        redis:set_keepalive(30000, 300)
        local ups_hash = redis:array_to_hash(res)
        -- Collect servers flagged "0" (usable).
        local ups_dhq_active_table = {}
        for key, value in pairs(ups_hash) do
            if value == "0" then
                table.insert(ups_dhq_active_table, key)
            end
        end
        -- Empty hash or all servers flagged down: use the static pool.
        if #ups_dhq_active_table == 0 then
            ngx.var.dhq_proxypass = ups_name .. "_static"
            ngx.log(ngx.ERR, "redis key has no valid server. ")
            return
        end
        -- Sort so every worker rotates over the same ordering.
        table.sort(ups_dhq_active_table)
        local ind = (dhq_conn_cnt - 1) % #ups_dhq_active_table + 1
        -- Worker-level globals consumed by balancer_by_lua_block (the only
        -- intentional globals here).
        -- NOTE(review): they can be clobbered by a concurrent connection
        -- before the balancer phase runs -- TODO confirm.
        g_ups_cur_dhq = ups_dhq_active_table[ind]
        g_ups_dhq_active_table = ups_dhq_active_table
        -- Seed the retry counter read by balancer_by_lua_block.
        ngx.shared.dict_try:set("conn" .. dhq_conn_cnt, 1)
    }
    proxy_pass $dhq_proxypass;
}
}
2. Haproxy、haproxy生态圈的dataplaneapi
说明:
Dataplaneapi实现了restful api,可通过友好的接口删除server,增加server,dataplaneapi与haproxy部署在一台服务器,是1对1 的关系,需要对每个haproxy节点的api操作;
手动编辑haproxy配置文件后,执行以下命令强刷dataplaneapi的缓存:kill -SIGUSR2 dataplaneapi进程,或重启dataplaneapi
Api使用说明:
增加或者删除server时,需要开启事务;
提交事务后,haproxy自动reload,配置文件自动修改
dataplaneapi服务启动
/root/dataplaneapi/dataplaneapi-master/build/dataplaneapi --host 10.40.20.203 --port 5555 -b /usr/local/haproxy/sbin/haproxy -c /usr/local/haproxy/conf/haproxy.cfg -d 5 -r "/usr/local/haproxy/haproxy_mgr.sh restart" -s "/usr/local/haproxy/haproxy_mgr.sh reload" -u api -t /tmp/haproxy
主要操作步骤:
查询信息,获取当前version
# curl -X GET -u admin:admin \
> -H "Content-Type: application/json" \
> "http://10.40.20.203:5555/v2/services/haproxy/configuration/servers?backend=test-proxy-srv"
{
"_version":1,
"data":[
{"address":"10.40.20.208","check":"enabled","name":"10.40.20.208","port":222},
{"address":"10.45.0.10","check":"enabled","name":"10.45.0.10","port":22,"weight":80},
{"address":"10.45.0.11","check":"enabled","name":"10.45.0.11","port":22,"weight":80}
]
}
开启事务,获取事务id
参数version根据上述步骤结果递增
# curl -X POST -u admin:admin \
> -H "Content-Type: application/json" \
> http://10.40.20.203:5555/v2/services/haproxy/transactions?version=2
{"_version":1,"id":"c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d","status":"in_progress"}
删除server
curl -X DELETE -u admin:admin \
-H "Content-Type: application/json" \
"http://10.40.20.203:5555/v2/services/haproxy/configuration/servers/10.40.20.207?backend=test-proxy-srv&transaction_id=c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d"
增加server
curl -X POST -u admin:admin \
-H "Content-Type: application/json" \
--data '{"address": "10.45.0.11", "check": "enabled", "max-connections": 500, "name": "10.45.0.11", "port": 22, "weight": 80}' \
"http://10.40.20.203:5555/v2/services/haproxy/configuration/servers?backend=test-proxy-srv&transaction_id=c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d"
提交事务
curl -X PUT -u admin:admin \
-H "Content-Type: application/json" \
http://10.40.20.203:5555/v2/services/haproxy/transactions/c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d
3. 方案比较
| | Nginx + lua | Haproxy + dataplaneapi |
|---|---|---|
| 稳定性 | 自己写的lua,经过多方面的功能测试和性能测试,稳定性有待线上验证 | 自带特性,稳定性更有保证 |
| 对监控程序要求 | 调整1个redis即可,对程序来说简单 | 需要了解api,需要对N个节点分别处理 |
| 对运维要求 | 引入复杂的lua编程(尤其是高并发情况下的故障或性能问题),后续要进一步优化 | 新引入haproxy,运维要进一步研究 |