【原创】RabbitMQ 之 file descriptor limit alarm 分析

【告警信息分析】

在 fd 超限后首先会在 RabbitMQ 日志中看到类似下面的信息

...
=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27563.2> (172.16.185.147:49571 -> 172.16.185.147:6672)

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm set.

********************************************************************
*** New connections will not be accepted until this alarm clears ***
********************************************************************

=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27566.2> (172.16.185.147:49572 -> 172.16.185.147:6672)

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27566.2> (172.16.185.147:49572 -> 172.16.185.147:6672):
connection_closed_abruptly

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27563.2> (172.16.185.147:49571 -> 172.16.185.147:6672):
connection_closed_abruptly

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm cleared

=INFO REPORT==== 25-Sep-2015::10:35:48 ===
accepting AMQP connection <0.27569.2> (172.16.185.147:51510 -> 172.16.185.147:6672)

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
file descriptor limit alarm set.

********************************************************************
*** New connections will not be accepted until this alarm clears ***
********************************************************************

=WARNING REPORT==== 25-Sep-2015::10:35:48 ===
closing AMQP connection <0.27569.2> (172.16.185.147:51510 -> 172.16.185.147:6672):
connection_closed_abruptly

...

从上面的告警信息可以看出处理逻辑如下:

  1. accept 一条新的 AMQP connection
  2. 触发 fd limit 告警
  3. 告警期间再 accept 到的新 AMQP connection 会被直接关闭(日志中表现为 connection_closed_abruptly)
  4. 当再关闭一条 AMQP connection 后 fd limit 告警解除
  • 之后重复上述步骤 1-4


【可用 socket 限制值计算】

在 RabbitMQ 服务刚启动时,会在相应日志中记录如下内容

=INFO REPORT==== 14-Jul-2015::04:28:19 ===
Limiting to approx 924 file handles (829 sockets)

该内容表明了当前 RabbitMQ 服务的 fd 使用限制值;

在 file_handle_cache.erl 中

%% Start the server with the default alarm callbacks:
%% alarm_handler:set_alarm/1 raises an alarm and
%% alarm_handler:clear_alarm/1 clears it.
start_link() ->
    DefaultSet   = fun alarm_handler:set_alarm/1,
    DefaultClear = fun alarm_handler:clear_alarm/1,
    start_link(DefaultSet, DefaultClear).

%% Start the server registered locally as ?SERVER. The two alarm callbacks
%% form the argument list given to init/1; the start options are
%% [{timeout, infinity}].
start_link(AlarmSet, AlarmClear) ->
    InitArgs  = [AlarmSet, AlarmClear],
    StartOpts = [{timeout, infinity}],
    gen_server2:start_link({local, ?SERVER}, ?MODULE, InitArgs, StartOpts).
...
%% Build the initial server state.
%%
%% The overall fd limit comes from the file_handles_high_watermark
%% application environment variable when that is a positive integer (it is
%% not set by default); otherwise it is derived from ulimit/0. The socket
%% limit is derived from the overall limit via obtain_limit/1, and both
%% values are written to the log.
init([AlarmSet, AlarmClear]) ->
    Configured = application:get_env(file_handles_high_watermark),
    Limit =
        case Configured of
            {ok, Watermark} when is_integer(Watermark), Watermark > 0 ->
                Watermark;
            _ ->
                %% Fall back to the system setting.
                case ulimit() of
                    %% Default value: 1024.
                    unknown ->
                        ?FILE_HANDLES_LIMIT_OTHER;
                    %% Keep 100 fds in reserve for the rest of the system,
                    %% but never go below 2.
                    Lim ->
                        erlang:max(2, Lim - ?RESERVED_FOR_OTHERS)
                end
        end,
    ObtainLimit = obtain_limit(Limit),
    error_logger:info_msg("Limiting to approx ~p file handles (~p sockets)~n",
                          [Limit, ObtainLimit]),
    ClientsTab = ets:new(?CLIENT_ETS_TABLE,
                         [set, private, {keypos, #cstate.pid}]),
    EldersTab = ets:new(?ELDERS_ETS_TABLE, [set, private]),
    InitialState =
        #fhc_state{elders                = EldersTab,
                   %% overall fd limit
                   limit                 = Limit,
                   open_count            = 0,
                   open_pending          = pending_new(),
                   %% fd limit that applies to sockets
                   obtain_limit          = ObtainLimit,
                   obtain_count_file     = 0,
                   obtain_pending_file   = pending_new(),
                   obtain_count_socket   = 0,
                   obtain_pending_socket = pending_new(),
                   clients               = ClientsTab,
                   timer_ref             = undefined,
                   alarm_set             = AlarmSet,
                   alarm_clear           = AlarmClear},
    {ok, InitialState}.
...

%% Adjusted limit derived from LIMIT: 90% of LIMIT, minus 2, truncated to
%% an integer. obtain_limit/1 applies this to the overall fd limit.
-define(OBTAIN_LIMIT(LIMIT), trunc((LIMIT * 0.9) - 2)).
...
%% Derive the socket limit from the overall fd limit.
%% `infinity` is passed through unchanged; any other limit goes through
%% ?OBTAIN_LIMIT and is clamped so the result is never negative.
obtain_limit(infinity) ->
    infinity;
obtain_limit(Limit) ->
    erlang:max(0, ?OBTAIN_LIMIT(Limit)).
...
%% Return the maximum number of file descriptors reported by the emulator,
%% or `unknown` when no usable integer value (> 1) is reported.
%%
%% To increase the number of file descriptors: on Windows set ERL_MAX_PORTS
%% environment variable, on Linux set `ulimit -n`.
ulimit() ->
    %% erlang:system_info(check_io) is only guaranteed to return a list and
    %% its content varies between emulator versions: it may be a single
    %% proplist or a list holding one proplist per poll set. Normalise to a
    %% single proplist so that max_fds is found in either layout instead of
    %% the lookup silently failing and `unknown` being returned.
    IOStats = case erlang:system_info(check_io) of
                  [First | _] when is_list(First) -> First;
                  Flat when is_list(Flat)         -> Flat;
                  _Other                          -> []
              end,
    case proplists:get_value(max_fds, IOStats) of
        MaxFds when is_integer(MaxFds) andalso MaxFds > 1 ->
            case os:type() of
                {win32, _OsName} ->
                    %% On Windows max_fds is twice the number of open files:
                    %%   https://github.com/yrashk/erlang/blob/e1282325ed75e52a98d5/erts/emulator/sys/win32/sys.c#L2459-2466
                    MaxFds div 2;
                _Any ->
                    %% For other operating systems trust Erlang.
                    MaxFds
            end;
        _ ->
            unknown
    end.

所以日志中的输出值是按如下公式计算得到的

=INFO REPORT==== 14-Jul-2015::04:28:19 ===
Limiting to approx 924 file handles (829 sockets)

ulimit -n 为 1024
924 = 1024 - 100                      (即 Lim - ?RESERVED_FOR_OTHERS)
829 = trunc((1024 - 100) * 0.9 - 2)   (即 ?OBTAIN_LIMIT(924) = trunc(829.6))

【socket 限制告警的触发和解除】

在 file_handle_cache.erl 中

%% Compare the obtain-limit status of the old and the new state and fire
%% the matching alarm callback on a transition:
%%   not reached -> reached : AlarmSet({file_descriptor_limit, []})
%%   reached -> not reached : AlarmClear(file_descriptor_limit)
%% The callbacks are read from the old state. Always returns NewState.
adjust_alarm(#fhc_state{alarm_set   = AlarmSet,
                        alarm_clear = AlarmClear} = OldState,
             NewState) ->
    WasReached = obtain_limit_reached(OldState),
    IsReached  = obtain_limit_reached(NewState),
    case {WasReached, IsReached} of
        {false, true} ->
            %% limit newly reached: raise the alarm
            AlarmSet({file_descriptor_limit, []});
        {true, false} ->
            %% back under the limit: clear the alarm
            AlarmClear(file_descriptor_limit);
        _ ->
            ok
    end,
    NewState.

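其中 AlarmSet 和 AlarmClear 就是启动 file_handle_cache 时传入的两个回调函数:
start_link/0 的默认值为 fun alarm_handler:set_alarm/1 和 fun alarm_handler:clear_alarm/1;
而 RabbitMQ 在 rabbit.erl 中启动 file_handle_cache 时,传入的是 fun rabbit_alarm:set_alarm/1 和 fun rabbit_alarm:clear_alarm/1,
因此实际的告警处理逻辑位于 rabbit_alarm.erl 中。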

在 rabbit_alarm.erl 中

...
%% Notify the ?SERVER event manager that Alarm has been raised; the event
%% is delivered to handle_event/2 as {set_alarm, Alarm}.
set_alarm(Alarm)   -> gen_event:notify(?SERVER, {set_alarm,   Alarm}).
%% Notify the ?SERVER event manager that Alarm has been cleared; the event
%% is delivered to handle_event/2 as {clear_alarm, Alarm}.
clear_alarm(Alarm) -> gen_event:notify(?SERVER, {clear_alarm, Alarm}).
...
%% set_alarm: Alarm is the complete alarm term (for example
%% {file_descriptor_limit, []}). If it is already in the alarm list the
%% event is ignored; otherwise it is added (the list is kept sorted and
%% de-duplicated by lists:usort/1) and handle_set_alarm/2 is called.
handle_event({set_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
    case lists:member(Alarm, Alarms) of
        true  -> {ok, State};
        false -> UpdatedAlarms = lists:usort([Alarm|Alarms]),
                 handle_set_alarm(Alarm, State#alarms{alarms = UpdatedAlarms})
    end;

%% clear_alarm: here Alarm is matched against the first tuple element of
%% the stored alarms (for example file_descriptor_limit). If a stored alarm
%% has that key it is removed and handle_clear_alarm/2 is called; otherwise
%% the event is ignored.
handle_event({clear_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
    case lists:keymember(Alarm, 1, Alarms) of
        true  -> handle_clear_alarm(
                   Alarm, State#alarms{alarms = lists:keydelete(
                                                  Alarm, 1, Alarms)});
        false -> {ok, State}

    end;
...
%% Alarm raised: log the warning banner that appears in the broker log when
%% the file descriptor limit alarm is set. The state is returned unchanged.
handle_set_alarm({file_descriptor_limit, []}, State) ->
    rabbit_log:warning(
      "file descriptor limit alarm set.~n~n"
      "********************************************************************~n"
      "*** New connections will not be accepted until this alarm clears ***~n"
      "********************************************************************~n"),
    {ok, State};
...
%% Alarm cleared: log that the file descriptor limit alarm has been
%% cleared. The state is returned unchanged.
handle_clear_alarm(file_descriptor_limit, State) ->
    rabbit_log:warning("file descriptor limit alarm cleared~n"),
    {ok, State};
...

【rabbitmqctl 子命令 status】

通过 rabbitmqctl 的 status 子命令也可以看到当前 fd 使用情况;

[root@Betty ~]# rabbitmqctl status
Status of node rmq_betty@Betty ...
[{pid,20008},
...
 {file_descriptors,
     [{total_limit,924},{total_used,13},{sockets_limit,829},{sockets_used,8}]},
...
...done.
[root@Betty ~]#

在 rabbit_control_main.erl 中

...
%% `status` sub-command: print the "Status of node ..." header through
%% Inform, then hand {rabbit, status, []} for Node to
%% display_call_result/2 (presumably an RPC to rabbit:status/0 on that
%% node whose result is printed -- confirm in display_call_result/2).
action(status, Node, [], _Opts, Inform) ->
    Inform("Status of node ~p", [Node]),
    display_call_result(Node, {rabbit, status, []});
...

在 rabbit.erl 中

status() ->
...
    S3 = rabbit_misc:with_exit_handler(
           fun () -> [] end,
           fun () -> [{file_descriptors, file_handle_cache:info()}] end),
...

在 file_handle_cache.erl 中

%%----------------------------------------------------------------------------
%% Items returned by info/0, in this order.
-define(INFO_KEYS, [total_limit, total_used, sockets_limit, sockets_used]).
...
%% Return all items listed in ?INFO_KEYS.
info() -> info(?INFO_KEYS).
%% Ask the ?SERVER process for the given info items; the call waits
%% without a timeout (infinity).
info(Items) -> gen_server2:call(?SERVER, {info, Items}, infinity).
...
%% Serve an {info, Items} call: reply with the requested items computed by
%% infos/2 and leave the server state unchanged.
handle_call({info, Items}, _From, State) ->
    Reply = infos(Items, State),
    {reply, Reply, State}.
...
%% Pair every requested item with its current value, in request order.
infos(Items, State) ->
    [{Key, i(Key, State)} || Key <- Items].

%% Look up a single info item in the server state. Any item that is not
%% handled by an earlier clause throws {bad_argument, Item}.
i(total_limit, #fhc_state{limit = TotalLimit}) ->
    TotalLimit;
i(total_used, State) ->
    used(State);
i(sockets_limit, #fhc_state{obtain_limit = SocketLimit}) ->
    SocketLimit;
i(sockets_used, #fhc_state{obtain_count_socket = SocketCount}) ->
    SocketCount;
i(Item, _) ->
    throw({bad_argument, Item}).

%% Total number of fds in use: the sum of open_count, obtain_count_socket
%% and obtain_count_file.
used(#fhc_state{open_count          = Opened,
                obtain_count_socket = Sockets,
                obtain_count_file   = Files}) ->
    Opened + Sockets + Files.

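【限制调整(最简方式)】

在用于启动 RabbitMQ 的 shell 中先通过 ulimit -n 调大限制值,然后重启 RabbitMQ 服务即可生效(注意:ulimit -n 只对当前 shell 及其启动的子进程有效,并非持久化配置)。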

[root@Betty ~]# ps aux|grep rabbit
root     20008  0.6  1.7 2291544 66796 ?       Sl   May18  20:53 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672 -noshell -noinput
root     32060  0.0  0.0 103252   852 pts/1    S+   13:22   0:00 grep rabbit
[root@Betty ~]# 
[root@Betty ~]# cat /proc/20008/limits | grep "open"
Max open files            1024                 4096                 files     
[root@Betty ~]#
[root@Betty ~]# ulimit -n
1024
[root@Betty ~]# 
[root@Betty ~]# ulimit -n 10240
[root@Betty ~]# ulimit -n
10240
[root@Betty ~]# 
[root@Betty ~]# rabbitmqctl stop
Stopping and halting node rmq_betty@Betty ...
Args = []
 ...
...done.
[root@Betty ~]#
[root@Betty ~]# rabbitmq-server -detached
Warning: PID file not written; -detached was passed.
[root@Betty ~]# 
[root@Betty ~]# ps aux|grep rabbit
root     32202 33.5  1.4 2296228 56552 ?       Sl   13:24   0:02 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672 -noshell -noinput
root     32250  0.0  0.0 103252   852 pts/1    S+   13:24   0:00 grep rabbit
[root@Betty ~]# 
[root@Betty ~]# cat /proc/32202/limits | grep "open"
Max open files            10240                10240                files 
[root@Betty ~]# rabbitmqctl status
Status of node rmq_betty@Betty ...
[{pid,32202},
...
 {file_descriptors,
     [{total_limit,10140},
      {total_used,12},
      {sockets_limit,9124},
      {sockets_used,7}]},
...
...done.
[root@Betty ~]#

 



 

转载于:https://my.oschina.net/moooofly/blog/678513

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值