erlang mysql:start_link_Erlang OTP之terminate 深入分析

香香甜甜圈

于 2021-03-04 01:16:13 发布

阅读量203

点赞数

文章标签： erlang mysql:start_link

本文链接：https://blog.csdn.net/weixin_33404102/article/details/114980927

版权

如果gen_server进程设置了trap_exit为true(process_flag(trap_exit, true))，则在该进程结束时会自动调用terminate。利用这个功能，我们可以在进程退出时进行一些善后工作，例如持久化数据、清理等等。但实际上terminate不一定有时间完成所有的任务，在此之前进程可能已经被系统强制结束了(例如使用init:stop结束beam时)。

二、测试terminate

一个erlang 内部 process结束有两种形式：主动结束(如玩家下线后，玩家进程会自动结束)和被动结束(init:stop)。

系统停止时(init:stop/c:q/erlang:halt)，会依次停止所有的进程，如果一个进程是监控树，则该监控树会先依次停止所有的子进程，然后结束自己。对于子进程也是同样的处理方法。

先做测试，后分析源码。测试分为四种情况：

进程主动退出 +

simple_one_for_one

init:stop +

simple_one_for_one

进程主动退出 +

one_for_one

init:stop +

one_for_one

源码文件：

test.erl (application)

test_sup.erl (supervisor)

test_server.erl (gen_server)

test.erl源码：

%% @doc Application callback module for the `test' demo application.
-module(test).

-behaviour(application).

-export([start/0, start/2, stop/1]).

%% @doc Convenience entry point (suitable for `-s test start'): boots the
%% application that carries this module's name.
start() ->
    application:start(?MODULE).

%% @doc application callback: starts the top-level supervisor and hands its
%% result back unchanged, whether that is `{ok, Pid}' or any other term.
start(_StartType, _StartArgs) ->
    test_sup:start_link().

%% @doc application callback: there is nothing to clean up.
stop(_State) ->
    ok.

非常简单，直接通过 erl -name test@192.168.1.83 -setcookie 123456 -boot start_sasl -s test start 即可启动该app。

test_sup.erl源码：

%% @doc Top-level supervisor of the `test' application.
-module(test_sup).

-behaviour(supervisor).

%% API
-export([start_link/0]).

%% Supervisor callbacks
-export([init/1]).

-define(SERVER, ?MODULE).

%% @doc Starts the supervisor, registered locally under the module name.
start_link() ->
    supervisor:start_link({local, ?SERVER}, ?MODULE, []).

%% @doc supervisor callback.
%% Returns the supervisor flags together with the single child template for
%% `test_server'. Change RestartStrategy to one_for_one to run the comparison
%% experiment.
init([]) ->
    RestartStrategy = simple_one_for_one,
    MaxRestarts = 1000,
    MaxSecondsBetweenRestarts = 3600,
    SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},

    Restart = transient,
    %% Shutdown is the time in milliseconds the supervisor waits for the
    %% child after exit(Pid, shutdown) before it falls back to kill.
    Shutdown = 200000,
    Type = worker,

    AChild = {test_server, {test_server, start_link, []},
              Restart, Shutdown, Type, [test_server]},

    {ok, {SupFlags, [AChild]}}.

源码骨架都是emacs生成的，我们只关注RestartStrategy = simple_one_for_one,这里，等会需要改成one_for_one以便测试对比。

test_server.erl

%% @doc Minimal gen_server used to observe when, and how completely,
%% terminate/2 runs.
-module(test_server).

-behaviour(gen_server).

%% API
-export([start/0, start_link/0]).

%% gen_server callbacks
-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3
]).

-define(SERVER, ?MODULE).

-record(state, {}).

%% @doc Asks `test_sup' to start one more child; crashes with badmatch unless
%% the supervisor answers `{ok, Pid}'.
start() ->
    {ok, _Pid} = supervisor:start_child(test_sup, []).

%% @doc Starts an unregistered server linked to the caller.
start_link() ->
    gen_server:start_link(?MODULE, [], []).

%% @doc gen_server callback: traps exits so that exit signals arrive as
%% 'EXIT' messages, then starts with an empty state.
init([]) ->
    process_flag(trap_exit, true),
    {ok, #state{}}.

%% @doc gen_server callback: every call is answered with `ok'.
handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%% @doc gen_server callback: casts are ignored.
handle_cast(_Msg, State) ->
    {noreply, State}.

%% @doc gen_server callback: an 'EXIT' message is printed and turned into a
%% normal stop; every other message is ignored.
handle_info({'EXIT', _, Reason}, State) ->
    io:format("exit:~p~n", [Reason]),
    {stop, normal, State};
handle_info(_Info, State) ->
    {noreply, State}.

%% @doc gen_server callback: prints the reason, simulates ten seconds of
%% clean-up work, then prints "end" so a completed run is visible.
terminate(Reason, _State) ->
    io:format("i'm terminate:~p~n", [Reason]),
    timer:sleep(10000),
    io:format("~s", ["end"]),
    ok.

%% @doc gen_server callback: the state is carried over unchanged.
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

简单说明下test_server：

1．init/1中设置了process_flag(trap_exit, true)，以便捕捉'EXIT'消息并在退出时触发terminate

2．handle_info({'EXIT', _, Reason}, State) -> 方便simple_one_for_one下的进程退出操作

3．terminate中的清理工作：io → timer:sleep → io

启动命令行：erl -name test@192.168.1.83 -setcookie 123456 -boot start_sasl -s test start

1．进程主动退出 +

simple_one_for_one

(test@192.168.1.83)1>

erlang:exit(list_to_pid("<0.51.0>"),

test).

exit:test

true

i'm terminate:normal

(test@192.168.1.83)2>

end

正常完成了terminate

2．

init:stop + simple_one_for_one

(test@192.168.1.83)1>

test_server:start().

{ok,<0.53.0>}

(test@192.168.1.83)2>

init:stop().

(test@192.168.1.83)3>

i'm terminate:shutdown

[root@ming2_local_dev

test]#

可以看到似乎没能正常的处理完terminate

3. 进程主动退出 + one_for_one

(test@192.168.1.83)1>

erlang:exit(list_to_pid("<0.51.0>"),

test).

exit:test

true

i'm terminate:normal

(test@192.168.1.83)2>

end

正常完成了terminate

4．init:stop +

one_for_one

(test@192.168.1.83)2>

init:stop().

(test@192.168.1.83)3>

i'm terminate:shutdown

end[root@ming2_local_dev

test]#

ok，很完整的执行了我们的terminate。

terminate执行测试结果：

| | simple_one_for_one | one_for_one |
| --- | --- | --- |
| 进程主动退出 | 完整执行 | 完整执行 |
| init:stop | 不能完整执行 | 完整执行 |

三、底层分析

看起来很奇怪的结果，还是从源码来分析问题。从supervisor开始：

%% Supervisor shutdown hook: terminates all children recorded in the state
%% and always returns ok, whatever the reason.
terminate(_Reason, State) ->
    Children = State#state.children,
    SupName = State#state.name,
    terminate_children(Children, SupName),
    ok.

terminate_children/2 是一个尾递归函数，依次结束每个子进程:

%% Terminates the given children one after another, in list order, and
%% returns the records produced by do_terminate/2 in reverse order.
terminate_children(Children, SupName) ->
    terminate_children(Children, SupName, []).

%% Tail-recursive worker: Acc collects the already-terminated children.
terminate_children([], _SupName, Acc) ->
    Acc;
terminate_children([Child | Rest], SupName, Acc) ->
    Stopped = do_terminate(Child, SupName),
    terminate_children(Rest, SupName, [Stopped | Acc]).

再看do_terminate/2：

%% Stops a single child and returns its record with the pid cleared.
%% A child whose pid is already `undefined' is returned untouched. When
%% shutdown/2 answers {error, Reason} the failure is reported through
%% report_error/4, but the pid is cleared all the same.
do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
    case shutdown(Child#child.pid, Child#child.shutdown) of
        ok ->
            Child#child{pid = undefined};
        {error, OtherReason} ->
            report_error(shutdown_error, OtherReason, Child, SupName),
            Child#child{pid = undefined}
    end;
do_terminate(Child, _SupName) ->
    Child.

继续：

%% Shuts down the child Pid according to its shutdown setting.
%%
%% brutal_kill: the child is sent an untrappable `kill' signal at once;
%%              the result is ok only if it dies with reason `killed'.
%% Time:        the child is first sent `shutdown' and gets Time (used
%%              directly as the receive timeout) to exit with reason
%%              `shutdown'; when the timeout fires it is killed and the
%%              result is {error, Reason}.
%%
%% Returns ok | {error, Reason}. An {error, Reason} from monitor_child/1 is
%% passed through unchanged.
shutdown(Pid, brutal_kill) ->
    case monitor_child(Pid) of
        ok ->
            exit(Pid, kill),
            receive
                {'DOWN', _MRef, process, Pid, killed} ->
                    ok;
                {'DOWN', _MRef, process, Pid, OtherReason} ->
                    {error, OtherReason}
            end;
        {error, Reason} ->
            {error, Reason}
    end;
shutdown(Pid, Time) ->
    case monitor_child(Pid) of
        ok ->
            exit(Pid, shutdown), %% Try to shutdown gracefully
            receive
                {'DOWN', _MRef, process, Pid, shutdown} ->
                    ok;
                {'DOWN', _MRef, process, Pid, OtherReason} ->
                    {error, OtherReason}
            after Time ->
                exit(Pid, kill), %% Force termination.
                receive
                    {'DOWN', _MRef, process, Pid, OtherReason} ->
                        {error, OtherReason}
                end
            end;
        {error, Reason} ->
            {error, Reason}
    end.

Ok，结束子进程时分情况处理了，先看看monitor_child/1，代码注释的比较详细，简单的说是用于处理child自己退出的情况。

%% Replaces the link to the child Pid with a monitor, so that shutdown/2 can
%% wait for a 'DOWN' message.
%% Returns ok when no 'EXIT' message from the child is waiting in the
%% mailbox, or {error, Reason} when the child has already exited with
%% Reason.
monitor_child(Pid) ->
    %% Do the monitor operation first so that if the child dies
    %% before the monitoring is done causing a 'DOWN'-message with
    %% reason noproc, we will get the real reason in the 'EXIT'-message
    %% unless a naughty child has already done unlink...
    erlang:monitor(process, Pid),
    unlink(Pid),

    receive
        %% If the child dies before the unlink we must empty
        %% the mail-box of the 'EXIT'-message and the 'DOWN'-message.
        {'EXIT', Pid, Reason} ->
            receive
                {'DOWN', _, process, Pid, _} ->
                    {error, Reason}
            end
    after 0 ->
        %% If a naughty child did unlink and the child dies before
        %% monitor the result will be that shutdown/2 receives a
        %% 'DOWN'-message with reason noproc.
        %% If the child should die after the unlink there
        %% will be a 'DOWN'-message with a correct reason
        %% that will be handled in shutdown/2.
        ok
    end.

回头看shutdown/2，主要区别在于exit(Pid, Reason)这一行，如果子进程的shutdown策略为brutal_kill，则子进程被直接kill，而kill信号是不能被捕捉的，也就不存在terminate被调用的可能了(terminate能被调用是因为捕捉了{'EXIT', _, _}消息，详细情况请自行查看gen_server实现)。如果你想在退出时清理数据，这里一定不能设置为brutal_kill，而是设置为一个较大的时间数值(毫秒)，用于等待子进程做善后工作：

exit(Pid, shutdown), %% Try to shutdown gracefully

receive {'DOWN', _MRef, process, Pid, shutdown} ->ok;

{'DOWN', _MRef, process, Pid, OtherReason} ->{error, OtherReason}

after Time ->

exit(Pid, kill), %% Force termination.

receive{'DOWN', _MRef, process, Pid, OtherReason} ->{error, OtherReason}

end

如果在指定时间内，子进程尚未结束，则强制kill。

从这一块的源码中我们没有看到restart strategy(one_for_one …)对terminate的影响，这跟上面的测试结果不太吻合。过一遍supervisor的代码，发现针对simple_one_for_one和one_for_one的子进程的启动过程是不同的：

handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->

#child{mfa = {M, F, A}} = hd(State#state.children),

Args = A ++ EArgs,

case do_start_child_i(M, F, Args) of{ok,

Pid} ->

NState = State#state{dynamics =

?DICT:store(Pid, Args, State#state.dynamics)},

{reply, {ok, Pid}, NState};

{ok,

Pid, Extra} ->

NState = State#state{dynamics =

?DICT:store(Pid, Args, State#state.dynamics)},

{reply, {ok, Pid, Extra}, NState};

What ->{reply, What, State}

end;

%%% The requests terminate_child, delete_child and restart_child are
%%% invalid for simple_one_for_one supervisors.

handle_call({_Req, _Data}, _From, State) when ?is_simple(State) ->{reply,

{error, simple_one_for_one}, State};

handle_call({start_child, ChildSpec}, _From, State) ->

case check_childspec(ChildSpec) of{ok,

Child} ->{Resp, NState} = handle_start_child(Child, State),

{reply, Resp, NState};

What ->{reply, {error, What}, State}

end;

simple_one_for_one形式启动的子进程根本没有放在supervisor的state.children里面，也就是说supervisor在terminate的时候根本没管simple_one_for_one形式启动的子进程，如此当supervisor结束时，所有的simple_one_for_one子进程都会收到一条{'EXIT', Pid, Reason}的消息，如果子进程有处理这样的消息并返回了stop，则会调用terminate。但在执行terminate期间，app可能已经结束，从而正在停止中的系统会直接kill掉该进程(实际上是所有剩余的进程)，使得其没有时间执行完所有的功能代码(参考之前的分析《init:stop浅析》)。

四、结论、问题与解决办法

1．

结论

| | simple_one_for_one | one_for_one |
| --- | --- | --- |
| 进程主动退出 | 完整执行 | 完整执行 |
| init:stop | 不能完整执行 | 完整执行 |

2．

问题

使用simple_one_for_one时，在系统关闭时，可能无法正常地完成某些善后工作，如数据持久化等等。

3．

解决办法

使用one_for_one，但是one_for_one的启动过程需要做一些简单的调整：

sup树的init返回：

%% supervisor callback for the one_for_one variant: no static children are
%% declared here; each child is added later via supervisor:start_child/2
%% with a full child spec.
init([]) ->
    SupFlags = {
        one_for_one,  % restart strategy
        1000,         % max restarts ...
        3600          % ... within this many seconds
    },
    {ok, {SupFlags, []}}.

start_child的时候childspec需要拼凑spec id：

%% Each child of a one_for_one supervisor has its own spec, so an id is
%% derived from MAPID for every child that is started.
ChildId = lists:concat(["mod_stall_server_", MAPID]),
ChildSpec = {ChildId,
             {mod_stall_server, start_link, [MAPID]},
             transient,   % restart
             30000,       % shutdown timeout in milliseconds
             worker,
             [mod_stall_server]},
supervisor:start_child(mod_stall_sup, ChildSpec)

香香甜甜圈

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫