【告警信息分析】
在 fd 超限后首先会在 RabbitMQ 日志中看到类似下面的信息
...
=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27563.2> (172.16.185.147:49571 -> 172.16.185.147:6672)
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm set.
********************************************************************
*** New connections will not be accepted until this alarm clears ***
********************************************************************
=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27566.2> (172.16.185.147:49572 -> 172.16.185.147:6672)
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27566.2> (172.16.185.147:49572 -> 172.16.185.147:6672):
connection_closed_abruptly
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27563.2> (172.16.185.147:49571 -> 172.16.185.147:6672):
connection_closed_abruptly
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm cleared
=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27569.2> (172.16.185.147:51510 -> 172.16.185.147:6672)
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm set.
********************************************************************
*** New connections will not be accepted until this alarm clears ***
********************************************************************
=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27569.2> (172.16.185.147:51510 -> 172.16.185.147:6672):
connection_closed_abruptly
...
从上面的告警信息可以看出处理逻辑如下:
1. accept 一条新的 AMQP connection
2. 触发 fd limit 告警
3. 此后再 accept 到的新 AMQP connection 会直接以 connection_closed_abruptly 关闭
4. 当有一条 AMQP connection 被关闭后,fd limit 告警解除
5. 重复步骤 1-4
【可用 socket 限制值计算】
在 RabbitMQ 服务刚启动时,会在相应日志中记录如下内容
=INFO REPORT==== 14-Jul-2015::04:28:19 ===
Limiting to approx 924 file handles (829 sockets)
该内容表明了当前 RabbitMQ 服务的 fd 使用限制值:总共约 924 个 file handle,其中最多 829 个可用于 socket;
在 file_handle_cache.erl 中
%% Start the file handle cache server and choose its alarm callbacks.
%% start_link/0 defaults to alarm_handler:set_alarm/1 and
%% alarm_handler:clear_alarm/1 as the raise/clear functions.
start_link() ->
start_link(fun alarm_handler:set_alarm/1, fun alarm_handler:clear_alarm/1).
%% start_link/2 registers the process locally as ?SERVER and passes
%% [AlarmSet, AlarmClear] as the start argument (matched by init/1).
start_link(AlarmSet, AlarmClear) ->
gen_server2:start_link({local, ?SERVER}, ?MODULE, [AlarmSet, AlarmClear],
[{timeout, infinity}]).
...
%% Compute the overall fd limit and the derived obtain limit, log both, create
%% the two private ETS tables, and return the initial #fhc_state{}.
%% AlarmSet / AlarmClear are kept in the state so they can be invoked later.
init([AlarmSet, AlarmClear]) ->
Limit = case application:get_env(file_handles_high_watermark) of %% per the article author, this env value is not set by default
{ok, Watermark} when (is_integer(Watermark) andalso
Watermark > 0) ->
Watermark;
_ ->
case ulimit() of %% no usable watermark configured: fall back to the limit reported by ulimit/0
unknown -> ?FILE_HANDLES_LIMIT_OTHER; %% default value, 1024 per the article author (macro definition not shown here)
Lim -> lists:max([2, Lim - ?RESERVED_FOR_OTHERS]) %% hold back ?RESERVED_FOR_OTHERS fds (100 per the author) for other uses, but never go below 2
end
end,
ObtainLimit = obtain_limit(Limit),
%% This is the startup log line quoted in the article ("Limiting to approx ...").
error_logger:info_msg("Limiting to approx ~p file handles (~p sockets)~n",
[Limit, ObtainLimit]),
Clients = ets:new(?CLIENT_ETS_TABLE, [set, private, {keypos, #cstate.pid}]),
Elders = ets:new(?ELDERS_ETS_TABLE, [set, private]),
{ok, #fhc_state { elders = Elders,
limit = Limit, %% overall fd limit
open_count = 0,
open_pending = pending_new(),
obtain_limit = ObtainLimit, %% limit on obtained fds; this is the value logged above as "sockets"
obtain_count_file = 0,
obtain_pending_file = pending_new(),
obtain_count_socket = 0,
obtain_pending_socket = pending_new(),
clients = Clients,
timer_ref = undefined,
alarm_set = AlarmSet,
alarm_clear = AlarmClear }}.
...
%% Derive the obtain limit from the overall limit: 90% of LIMIT, minus 2,
%% truncated to an integer.
-define(OBTAIN_LIMIT(LIMIT), trunc((LIMIT * 0.9) - 2)). %% the value adjustment happens here
...
%% Map the overall limit to the obtain limit: infinity stays infinity;
%% otherwise apply ?OBTAIN_LIMIT and clamp a negative result to 0.
obtain_limit(infinity) -> infinity;
obtain_limit(Limit) -> case ?OBTAIN_LIMIT(Limit) of
OLimit when OLimit < 0 -> 0;
OLimit -> OLimit
end.
...
%% Return the maximum number of file descriptors available to the VM, or the
%% atom `unknown`. The value is read from the max_fds entry of
%% erlang:system_info(check_io); anything that is not an integer greater
%% than 1 yields `unknown`.
%% To increase the number of file descriptors: on Windows set ERL_MAX_PORTS
%% environment variable, on Linux set `ulimit -n`.
ulimit() ->
case proplists:get_value(max_fds, erlang:system_info(check_io)) of
MaxFds when is_integer(MaxFds) andalso MaxFds > 1 ->
case os:type() of
{win32, _OsName} ->
%% On Windows max_fds is twice the number of open files:
%% https://github.com/yrashk/erlang/blob/e1282325ed75e52a98d5/erts/emulator/sys/win32/sys.c#L2459-2466
MaxFds div 2;
_Any ->
%% For other operating systems trust Erlang.
MaxFds
end;
_ ->
unknown
end.
所以日志中的输出值是按如下公式计算得到的
=INFO REPORT==== 14-Jul-2015::04:28:19 ===
Limiting to approx 924 file handles (829 sockets)
ulimit -n 为 1024
924 = 1024 - 100(即 Lim - ?RESERVED_FOR_OTHERS)
829 = trunc((1024 - 100) * 0.9 - 2) = trunc(831.6 - 2) = trunc(829.6)(即 ?OBTAIN_LIMIT(924))
【socket 限制告警的触发和解除】
在 file_handle_cache.erl 中
%% Compare obtain_limit_reached/1 for the old and the new state and invoke the
%% stored callback only when that result changes. Always returns NewState.
%% Note that both callbacks are taken from OldState.
adjust_alarm(OldState = #fhc_state { alarm_set = AlarmSet,
alarm_clear = AlarmClear }, NewState) ->
case {obtain_limit_reached(OldState), obtain_limit_reached(NewState)} of
{false, true} -> AlarmSet({file_descriptor_limit, []}); %% limit newly reached: raise the fd limit alarm
{true, false} -> AlarmClear(file_descriptor_limit); %% no longer at the limit: clear the fd limit alarm
_ -> ok
end,
NewState.
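其中(按 start_link/0 的默认参数)
AlarmSet 对应 fun alarm_handler:set_alarm/1
AlarmClear 对应 fun alarm_handler:clear_alarm/1
注:日志中的告警文本是由下面 rabbit_alarm.erl 中的 handle_set_alarm/handle_clear_alarm 输出的,说明 RabbitMQ 实际启动 file_handle_cache 时应是通过 start_link/2 传入了 rabbit_alarm:set_alarm/1 和 rabbit_alarm:clear_alarm/1(可对照 rabbit.erl 中启动 file_handle_cache 的代码确认)。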
在 rabbit_alarm.erl 中
...
%% Public entry points: forward the alarm to the ?SERVER event manager as a
%% {set_alarm, Alarm} / {clear_alarm, Alarm} event via gen_event:notify/2.
set_alarm(Alarm) -> gen_event:notify(?SERVER, {set_alarm, Alarm}).
clear_alarm(Alarm) -> gen_event:notify(?SERVER, {clear_alarm, Alarm}).
...
%% set_alarm: ignore an alarm that is already recorded; otherwise add it to the
%% list (lists:usort/1 keeps it sorted and duplicate-free) and dispatch to
%% handle_set_alarm/2 with the updated state.
handle_event({set_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
case lists:member(Alarm, Alarms) of
true -> {ok, State};
false -> UpdatedAlarms = lists:usort([Alarm|Alarms]),
handle_set_alarm(Alarm, State#alarms{alarms = UpdatedAlarms})
end;
%% clear_alarm: here Alarm is matched against the first element of the recorded
%% alarm tuples; if such an entry exists it is deleted and handle_clear_alarm/2
%% is called, otherwise the state is returned unchanged.
handle_event({clear_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
case lists:keymember(Alarm, 1, Alarms) of
true -> handle_clear_alarm(
Alarm, State#alarms{alarms = lists:keydelete(
Alarm, 1, Alarms)});
false -> {ok, State}
end;
...
%% Raise the alarm: for the file_descriptor_limit alarm, log the warning banner
%% seen in the RabbitMQ log and return the state unchanged.
handle_set_alarm({file_descriptor_limit, []}, State) ->
rabbit_log:warning(
"file descriptor limit alarm set.~n~n"
"********************************************************************~n"
"*** New connections will not be accepted until this alarm clears ***~n"
"********************************************************************~n"),
{ok, State};
...
%% Clear the alarm: for file_descriptor_limit, log that the alarm was cleared
%% and return the state unchanged.
handle_clear_alarm(file_descriptor_limit, State) ->
rabbit_log:warning("file descriptor limit alarm cleared~n"),
{ok, State};
...
【rabbitmqctl 子命令 status】
通过 rabbitmqctl 的 status 子命令也可以看到当前 fd 使用情况;
[root@Betty ~]# rabbitmqctl status
Status of node rmq_betty@Betty ...
[{pid,20008},
...
{file_descriptors,
[{total_limit,924},{total_used,13},{sockets_limit,829},{sockets_used,8}]},
...
...done.
[root@Betty ~]#
在 rabbit_control_main.erl 中
...
%% `status` sub-command: print the "Status of node" banner through Inform, then
%% hand {rabbit, status, []} to display_call_result/2.
%% NOTE(review): presumably this calls rabbit:status() on Node and prints the
%% result; display_call_result/2 is not shown here.
action(status, Node, [], _Opts, Inform) ->
Inform("Status of node ~p", [Node]),
display_call_result(Node, {rabbit, status, []});
...
在 rabbit.erl 中
%% Excerpt of rabbit:status/0 (the rest of the body is elided with "...").
%% S3 holds the file_descriptors entry, built from file_handle_cache:info/0.
%% NOTE(review): the first fun presumably supplies [] as a fallback if the
%% second one exits; confirm against rabbit_misc:with_exit_handler/2.
status() ->
...
S3 = rabbit_misc:with_exit_handler(
fun () -> [] end,
fun () -> [{file_descriptors, file_handle_cache:info()}] end),
...
在 file_handle_cache.erl 中
%%----------------------------------------------------------------------------
%% Items reported by info/0; these are the keys shown under file_descriptors
%% in the `rabbitmqctl status` output.
-define(INFO_KEYS, [total_limit, total_used, sockets_limit, sockets_used]).
...
%% info/0 requests all ?INFO_KEYS; info/1 requests the given Items.
%% Both make a synchronous call to ?SERVER with no timeout (infinity).
info() -> info(?INFO_KEYS).
info(Items) -> gen_server2:call(?SERVER, {info, Items}, infinity).
...
%% Server side of info/1: reply with infos(Items, State), leaving the state
%% unchanged.
handle_call({info, Items}, _From, State) ->
{reply, infos(Items, State), State}.
...
%% Build a list of {Item, Value} pairs, one per requested item, in request order.
infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
%% Look up a single info item in the state:
%%   total_limit   -> the limit field
%%   total_used    -> used(State)
%%   sockets_limit -> the obtain_limit field
%%   sockets_used  -> the obtain_count_socket field
%% Any other item throws {bad_argument, Item}.
i(total_limit, #fhc_state{limit = Limit}) -> Limit;
i(total_used, State) -> used(State);
i(sockets_limit, #fhc_state{obtain_limit = Limit}) -> Limit;
i(sockets_used, #fhc_state{obtain_count_socket = Count}) -> Count;
i(Item, _) -> throw({bad_argument, Item}).
%% Total fds in use: open_count plus the socket and file obtain counts.
used(#fhc_state{open_count = C1,
obtain_count_socket = C2,
obtain_count_file = C3}) -> C1 + C2 + C3.
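【限制调整(最简方式)】
注意:ulimit -n 只对当前 shell 及由它启动的进程生效,因此下面是在同一个 shell 中调整限制后重启 RabbitMQ;该设置不是持久的,新开的 shell 或系统重启后需要重新设置。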
[root@Betty ~]# ps aux|grep rabbit
root 20008 0.6 1.7 2291544 66796 ? Sl May18 20:53 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672 -noshell -noinput
root 32060 0.0 0.0 103252 852 pts/1 S+ 13:22 0:00 grep rabbit
[root@Betty ~]#
[root@Betty ~]# cat /proc/20008/limits | grep "open"
Max open files 1024 4096 files
[root@Betty ~]#
[root@Betty ~]# ulimit -n
1024
[root@Betty ~]#
[root@Betty ~]# ulimit -n 10240
[root@Betty ~]# ulimit -n
10240
[root@Betty ~]#
[root@Betty ~]# rabbitmqctl stop
Stopping and halting node rmq_betty@Betty ...
Args = []
...
...done.
[root@Betty ~]#
[root@Betty ~]# rabbitmq-server -detached
Warning: PID file not written; -detached was passed.
[root@Betty ~]#
[root@Betty ~]# ps aux|grep rabbit
root 32202 33.5 1.4 2296228 56552 ? Sl 13:24 0:02 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672 -noshell -noinput
root 32250 0.0 0.0 103252 852 pts/1 S+ 13:24 0:00 grep rabbit
[root@Betty ~]#
[root@Betty ~]# cat /proc/32202/limits | grep "open"
Max open files 10240 10240 files
[root@Betty ~]# rabbitmqctl status
Status of node rmq_betty@Betty ...
[{pid,32202},
...
{file_descriptors,
[{total_limit,10140},
{total_used,12},
{sockets_limit,9124},
{sockets_used,7}]},
...
...done.
[root@Betty ~]#