Erlang常用数据结构实现

最新推荐文章于 2024-04-12 08:00:00 发布

TriKin

最新推荐文章于 2024-04-12 08:00:00 发布

阅读量499

点赞数 1

分类专栏： Erlang 文章标签： erlang

原文链接：https://wudaijun.com/2015/12/erlang-datastructures/

版权

Erlang 专栏收录该内容

21 篇文章 2 订阅

订阅专栏

详见Erlang 常用数据结构实现，以下为阅读该博客所做笔记和其他相关的笔记：

Erlang虚拟机使用一个字(64/32位)来表示所有类型的数据，即Eterm。具体的实施方案通过占用Eterm的后几位作为类型标签，然后根据标签类型来解释剩余位的用途。这个标签是多层级的，最外层占用两位，有三种类型：
01: list，剩下62位是指向列表Cons的指针。
10: boxed对象，即复杂对象，剩余62位指向boxed对象的对象头。包括元组，大整数，外部Pid/Port等。
11: immediate立即数，即可以在一个字中表示的小型对象，包括小整数，本地Pid/Port，Atom，NIL等。
这三种类型是Erlang类型的大框架，前两者可以看做是引用类型，立即数相当于是值类型，但无论对于哪种类型，Erlang Eterm本身只占用一个字。
atom用立即数表示，Eterm中存放的是atom在全局atom表中的索引，依赖于哈希和索引表。需要注意的是，atom表是不回收的，默认最大值为1024*1024，超过这个限制Erlang虚拟机将会崩溃，可通过+t参数调整该上限。
对于外部Pid和内部Pid来说，表示是不同的。内部Pid通过32位表示，指向Pid表，不包含节点信息，而外部Pid是包含节点信息的boxed对象。将本地Pid发向其他节点时，会加上节点信息。
list在进程内对重复对象的引用只占用一份对象内存，但在跨进程的时候，对象就会被展开，执行深度拷贝。
tuple属于boxed对象的一种，实际上就是一个有头部的数组，其包含的Eterm在内存中紧凑排列，tuple的操作效率和数组是一致的。头部包含tuple的大小信息。
binary根据大小是否大于64字节，分为heap binary和refc binary。heap binary创建在进程堆上，而refc binary创建在Erlang虚拟机全局堆上，被所有进程共享。这样可以在跨进程传输时只传输引用而无需进行大数据拷贝。
record就是一个tuple，record field在预编译后通过数值下标来索引，因此它访问field是O(1)复杂度的。
map依据大小的不同使用不同的结构，当map的大小<?MAP_SMALL_MAP_LIMIT时，使用flatmap的结构，如下：

/* Header of a small ("flat") map.  Per the accompanying notes, the values
 * are stored right after this struct, in the same order as the keys. */
typedef struct map_s {
 Eterm thing_word;   //  boxed object header
 Uint  size;         //  number of key-value pairs in the map
 Eterm keys;         //  tuple holding the keys
} map_t;

在此结构体后，再依次存放Value。因此在get时，需要先遍历keys tuple，找到key所在下标，然后在value中取出该下标偏移对应的值。这种情况下，map实际上相当于record的替代。
当map的大小>=?MAP_SMALL_MAP_LIMIT时，使用hashmap结构。
由于Erlang变量的不可变原则，每次更新maps都需要开辟新的map，拷贝原来的keys和value。因此，maps:update比maps:put更高效，因为前者keys数量不会变，因此无需开辟新的keys tuple，只需拷贝指向原keys tuple的Eterm即可。实际使用maps时，更新已有key值时，使用update(:=)而不是put(=>)，不仅可以检错，并且效率更高。

dict是通过动态哈希实现的字典，结构如下：

%% Define a hashtable.  The default values are the standard ones.
%% Slots are numbered from 1.  `n' is the number of slots currently in
%% use and `maxn' is the range the key hash is taken over; a hash that
%% lands above `n' is redirected to the slot `bso' positions lower.
-record(dict, {
	size=0		      :: non_neg_integer(), % Number of elements
	n=?seg_size	      :: non_neg_integer(), % Number of active slots
	maxn=?seg_size	  :: non_neg_integer(),	% Maximum slots (hash range)
	bso=?seg_size div 2  :: non_neg_integer(), % Buddy slot offset
	exp_size=?exp_size   :: non_neg_integer(), % Size to expand at (expand once size exceeds it)
	con_size=?con_size   :: non_neg_integer(), % Size to contract at
	empty		      :: tuple(),	% Empty segment, used as the template for new segments
	segs		      :: segs(_, _)	% Segments: a tuple of segments, each a tuple of buckets
}).

其中，size表示当前dict的键值对数量。n表示当前可用的slot数量，默认为16。maxn表示slot的最大值，默认为16。bso为slot偏移量，默认值为8。exp_size和con_size分别表示要扩展和压缩时的size值，默认值分别为80和48。empty为扩展dict的模板。segs用于存放数据。empty如下：

{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}

segs初始结构如下：

{{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}

上面的每一个[]，都代表一个slot，用于存放key-value。

dict通过对key值进行哈希运算，从而将key-value映射到不同的slot中，提升查询效率。以find为例：

%% find(Key, Dict) -> {ok, Value} | error.
%%  Hash Key to its slot, fetch that slot's bucket and search the bucket.
-spec find(Key, Dict) -> {'ok', Value} | 'error' when
      Dict :: dict(Key, Value).

find(Key, D) ->
    find_val(Key, get_bucket(D, get_slot(D, Key))).

%% find_val(Key, Bucket) -> {ok, Value} | error.
%%  Walk the bucket front to back and return the value of the first
%%  entry whose key matches Key; an exhausted bucket means `error'.
find_val(_Key, []) -> error;
find_val(Key, [?kv(Key,Value)|_Rest]) -> {ok,Value};
find_val(Key, [_Other|Rest]) -> find_val(Key, Rest).

%% get_slot(Hashdb, Key) -> Slot.
%%  Hash Key over the full slot range (maxn).  erlang:phash/2 yields a
%%  value in 1..Range, so slots are 1-based.  A hash above the number of
%%  active slots (n) points at a slot that has not been split off yet;
%%  in that case fall back to its unsplit buddy, bso positions lower.
get_slot(T, Key) ->
    case erlang:phash(Key, T#dict.maxn) of
        Hash when Hash > T#dict.n -> Hash - T#dict.bso;
        Hash -> Hash
    end.

%% get_bucket(Hashdb, Slot) -> Bucket.
get_bucket(T, Slot) -> get_bucket_s(T#dict.segs, Slot).

%% get_bucket_s(Segs, Slot) -> Bucket.
%%  Slots are numbered from 1 across all segments, ?seg_size per segment.
get_bucket_s(Segs, Slot) ->
    Offset = Slot - 1,
    %% Which segment of Segs holds this slot.
    Seg = element(Offset div ?seg_size + 1, Segs),
    %% Position of the slot's bucket inside that segment tuple.
    element(Offset rem ?seg_size + 1, Seg).

dict:store(Key, Val, Dict)实现如下：

%% store(Key, Value, Dict) -> Dict.
%%  Insert or overwrite Key in its bucket, then let maybe_expand/2
%%  account for the number of newly added entries (0 or 1).
-spec store(Key, Value, Dict1) -> Dict2 when
      Dict1 :: dict(Key, Value),
      Dict2 :: dict(Key, Value).

store(Key, Val, D0) ->
    StoreInBucket = fun (Bkt) -> store_bkt_val(Key, Val, Bkt) end,
    {D1,Ic} = on_bucket(StoreInBucket, D0, get_slot(D0, Key)),
    maybe_expand(D1, Ic).

%% store_bkt_val(Key, Val, Bucket) -> {NewBucket,Increment}.
%%  Replace the value of an existing Key in place (Increment 0), or
%%  append a new entry at the end of the bucket (Increment 1).
store_bkt_val(Key, New, []) -> {[?kv(Key,New)],1};
store_bkt_val(Key, New, [?kv(Key,_Old)|Rest]) -> {[?kv(Key,New)|Rest],0};
store_bkt_val(Key, New, [Head|Rest0]) ->
    {Rest1,Inc} = store_bkt_val(Key, New, Rest0),
    {[Head|Rest1],Inc}.

%% on_bucket(Fun, Hashdb, Slot) -> {NewHashDb,Result}.
%%  Apply Fun to the bucket in Slot.  Fun returns {NewBucket,Result};
%%  the new bucket is written back and Result is passed to the caller.
on_bucket(F, T, Slot) ->
    Offset = Slot - 1,
    SegI = Offset div ?seg_size + 1,
    BktI = Offset rem ?seg_size + 1,
    Segs0 = T#dict.segs,
    Seg0 = element(SegI, Segs0),
    {Bkt1,Res} = F(element(BktI, Seg0)),
    %% Rebuild bottom-up: bucket into its segment, segment into segs.
    Seg1 = setelement(BktI, Seg0, Bkt1),
    {T#dict{segs=setelement(SegI, Segs0, Seg1)},Res}.

dict扩展如下：

%% maybe_expand(Hashdb, Increment) -> Hashdb.
%%  Increment must be exactly 0 or 1; anything else is a function_clause.
maybe_expand(T, Ic) when Ic =:= 0; Ic =:= 1 -> maybe_expand_aux(T, Ic).

%% maybe_expand_aux(Hashdb, Increment) -> Hashdb.
%%  Add Increment to the element count.  If the new count exceeds
%%  exp_size, also bring one more slot (n + 1) into use: the bucket of
%%  its buddy slot (n + 1 - bso) is rehashed and the two resulting
%%  buckets are written to the buddy slot and the new slot.
%%  NOTE(review): rehash/4 and put_bucket_s/3 are not shown here;
%%  presumably rehash splits the bucket according to which of the two
%%  slots each key now hashes to -- confirm against their definitions.
maybe_expand_aux(T0, Ic) when T0#dict.size + Ic > T0#dict.exp_size ->
    T = maybe_expand_segs(T0),			%Do we need more segments.
    N = T#dict.n + 1,				%Next slot to expand into
    Segs0 = T#dict.segs,
    Slot1 = N - T#dict.bso,                     %Buddy of the new slot
    B = get_bucket_s(Segs0, Slot1),
    Slot2 = N,
    [B1|B2] = rehash(B, Slot1, Slot2, T#dict.maxn),
    Segs1 = put_bucket_s(Segs0, Slot1, B1),
    Segs2 = put_bucket_s(Segs1, Slot2, B2),
    %% The thresholds are recomputed from the new number of active slots.
    T#dict{size=T#dict.size + Ic,
	   n=N,
	   exp_size=N * ?expand_load,
	   con_size=N * ?contract_load,
	   segs=Segs2};
%% Below the threshold: only the element count changes.
maybe_expand_aux(T, Ic) -> T#dict{size=T#dict.size + Ic}.

%% maybe_expand_segs(Hashdb) -> Hashdb.
%%  When every addressable slot is already active (n equals maxn),
%%  double the slot range, the buddy offset and the segment tuple.
%%  Otherwise return the table unchanged.
maybe_expand_segs(#dict{n=N, maxn=N, bso=Bso, segs=Segs, empty=Empty} = T) ->
    T#dict{maxn=2 * N,
           bso=2 * Bso,
           segs=expand_segs(Segs, Empty)};
maybe_expand_segs(T) -> T.

%% expand_segs(Segs, EmptySeg) -> NewSegs.
%%  Double the segment tuple: the existing segments keep their positions
%%  and an equal number of EmptySeg entries is appended after them.
expand_segs(Segs, Empty) ->
    Count = tuple_size(Segs),
    Padding = lists:duplicate(Count, Empty),
    list_to_tuple(tuple_to_list(Segs) ++ Padding).

每当存入数据后size超过exp_size时（即size + Ic > exp_size，例如上限为80时存入第81个元素），就会多激活一个slot用于数据存储，同时会纠正原来因slot不足而存放在经过偏移计算的伙伴slot上的数据。如果此时已激活的slot数量n已经等于slot的最大值maxn，还会先对dict进行成倍扩展，即expand_segs处的处理。

1> dict:new().
{dict,0,16,16,8,80,48,
      {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]},
      {{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}}
2> D1 = lists:foldl(fun(N,TD) -> dict:store(N,N,TD) end, dict:new(), lists:seq(1,80)).
{dict,80,16,16,8,80,48,
      {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]},
      {{[[16|16],[32|32],[48|48],[64|64],[80|80]],
        [[3|3],[19|19],[35|35],[51|51],[67|67]],
        [[6|6],[22|22],[38|38],[54|54],[70|70]],
        [[9|9],[25|25],[41|41],[57|57],[73|73]],
        [[12|12],[28|28],[44|44],[60|60],[76|76]],
        [[15|15],[31|31],[47|47],[63|63],[79|79]],
        [[2|2],[18|18],[34|34],[50|50],[66|66]],
        [[5|5],[21|21],[37|37],[53|53],[69|69]],
        [[8|8],[24|24],[40|40],[56|56],[72|72]],
        [[11|11],[27|27],[43|43],[59|59],[75|75]],
        [[14|14],[30|30],[46|46],[62|62],[78|78]],
        [[1|1],[17|17],[33|33],[49|49],[65|65]],
        [[4|4],[20|20],[36|36],[52|52],[68|...]],
        [[7|7],[23|23],[39|39],[55|...],[...]],
        [[10|10],[26|26],[42|...],[...]|...],
        [[13|13],[29|...],[...]|...]}}}
3> D2 = dict:store(81,81,D1).
{dict,81,17,32,16,85,51,
      {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]},
      {{[[32|32],[64|64]],
        [[3|3],[19|19],[35|35],[51|51],[67|67]],
        [[6|6],[22|22],[38|38],[54|54],[70|70]],
        [[9|9],[25|25],[41|41],[57|57],[73|73]],
        [[12|12],[28|28],[44|44],[60|60],[76|76]],
        [[15|15],[31|31],[47|47],[63|63],[79|79]],
        [[2|2],[18|18],[34|34],[50|50],[66|66]],
        [[5|5],[21|21],[37|37],[53|53],[69|69]],
        [[8|8],[24|24],[40|40],[56|56],[72|72]],
        [[11|11],[27|27],[43|43],[59|59],[75|75]],
        [[14|14],[30|30],[46|46],[62|62],[78|78]],
        [[1|1],[17|17],[33|33],[49|49],[65|65],[81|...]],
        [[4|4],[20|20],[36|36],[52|52],[68|...]],
        [[7|7],[23|23],[39|39],[55|...],[...]],
        [[10|10],[26|26],[42|...],[...]|...],
        [[13|13],[29|...],[...]|...]},
       {[[16|16],[48|48],[80|80]],
        [],[],[],[],[],[],[],[],[],[],[],[],[],[],[]}}}

可以发现，当dict到了要扩展的时候，除了对segs进行扩展外，还纠正了原来放在slot 1中的数据，重新计算后放到了相应的位置上即slot 17上。

压缩则是上述的逆运算，在此就不赘述了。

array实际上是一个record，如下：

%% Representation of an array.  `elements' is a tree of tuples; in the
%% shell examples that accompany this record, a subtree that has not
%% been written to appears as an integer rather than an expanded tuple.
-record(array, {
	size :: non_neg_integer(),  %% number of defined entries
	max  :: non_neg_integer(),  %% maximum number of entries (10, 100, 1000 in the examples)
	default,    %% the default value (usually 'undefined')
	elements :: elements(_)     %% the tuple tree
}).

elements是一个tuple tree，即用tuple包含tuple的方式组成的树，叶子节点就是元素值，元素默认以10个为一组，亦即完全展开的情况下，是一棵十叉树。但是对于没有赋值的节点，array用其叶子节点数量代替，并不展开。

1> array:set(0,value,array:new()).%% 10个节点全展开，索引为0的节点被赋值为value，其余的都是默认值。
  {array,1,10,undefined,
      {value,undefined,undefined,undefined,undefined,undefined,
             undefined,undefined,undefined,undefined}}
  2> array:set(10,value,array:new()).%% 是一棵深度为3的树，只有索引为10的节点所在的子树有展开，其余的未展开且值为叶子节点数量。
  {array,11,100,undefined,
      {10,
       {value,undefined,undefined,undefined,undefined,undefined,
              undefined,undefined,undefined,undefined},
       10,10,10,10,10,10,10,10,10}}
  3> array:set(100,value,array:new()).%% 是一棵深度为4的树，只有索引为100的节点所在的子树有展开，其余的未展开且值为叶子节点数量。
  {array,101,1000,undefined,
      {100,
       {{value,undefined,undefined,undefined,undefined,undefined,
               undefined,undefined,undefined,undefined},
        10,10,10,10,10,10,10,10,10,10},
       100,100,100,100,100,100,100,100,100}}