深入解析Hashtable、Dictionary、SortedDictionary、SortedList

最新推荐文章于 2024-05-11 10:45:11 发布

weixin_30359021

最新推荐文章于 2024-05-11 10:45:11 发布

阅读量185

点赞数

文章标签：数据结构与算法

原文链接：http://www.cnblogs.com/moozi/archive/2010/05/23/1741980.html

版权

在《在线用户实体缓存解决方案》方案中使用Dictionary来存储，评论里同事说SortedDictionary采用二分法查找比Dictionary快，于是我们都做了测试，最后发现Dictionary是比SortedDictionary快的，前者用的是Hash算法，而后者是RB-Tree算法。

于是想深入地分析如题的4个字典的原理。

我们先看Hashtable。

MSDN的解释：表示键/值对的集合，这些键/值对根据键的哈希代码进行组织。

Hash算法是把任意长度的输入（又叫做预映射， pre-image），通过散列算法，变换成固定长度的输出，该输出就是散列值。这种转换是一种压缩映射，也就是，散列值的空间通常远小于输入的空间，不同的输入可能会散列成相同的输出，而不可能从散列值来唯一的确定输入值。

Hashtable 对象由包含集合元素的存储桶组成。存储桶是 Hashtable 中各元素的虚拟子组，与大多数集合中进行的搜索和检索相比，存储桶可令搜索和检索更为便捷。每一存储桶都与一个哈希代码关联，该哈希代码是使用哈希函数生成的并基于该元素的键。

Hashtable 类默认的装填因子是 1.0，但实际上它默认的装填因子是 0.72。所有从构造函数输入的装填因子，Hashtable 类内部都会将其乘以0.72。这是一个要求苛刻的数字, 某些时刻将装填因子增减 0.01, 可能你的 Hashtable 存取效率就提高或降低了 50%，其原因是装填因子决定散列表容量，而散列表容量又影响 Key 的冲突几率，进而影响性能。0.72 是 Microsoft经过大量实验得出的一个比较平衡的值。

我们看Hashtable的一些源码：

Hashtable .ctor

public Hashtable() : this ( 0 , ( float ) 1f)
{
}
public Hashtable( int capacity, float loadFactor)
{
     if (capacity < 0 )
    {
         throw new ArgumentOutOfRangeException( " capacity " , Environment.GetResourceString( " ArgumentOutOfRange_NeedNonNegNum " ));
    }
     if ((loadFactor < 0.1f ) || (loadFactor > 1f))
    {
         throw new ArgumentOutOfRangeException( " loadFactor " , Environment.GetResourceString( " ArgumentOutOfRange_HashtableLoadFactor " , new object [] { 0.1 , 1.0 }));
    }
     this .loadFactor = 0.72f * loadFactor;
     double num = (( float ) capacity) / this .loadFactor;
     if (num > 2147483647.0 )
    {
         throw new ArgumentException(Environment.GetResourceString( " Arg_HTCapacityOverflow " ));
    }
     int num2 = (num > 11.0 ) ? HashHelpers.GetPrime(( int ) num) : 11 ;
     this .buckets = new bucket[num2];
     this .loadsize = ( int ) ( this .loadFactor * num2);
     this .isWriterInProgress = false ;
}

Hashtable 扩容是个耗时非常惊人的内部操作，它之所以写入效率仅为读取效率的 1/10 数量级，频繁的扩容是一个因素。当进行扩容时，散列表内部要重新 new 一个更大的数组，然后把原来数组的内容拷贝到新数组，并进行重新散列。如何 new这个更大的数组也有讲究。散列表的初始容量一般来讲是个素数。当扩容时，新数组的大小会设置成原数组双倍大小的相近的一个素数。

Hashtable expand

private void expand()
{
     int prime = HashHelpers.GetPrime( this .buckets.Length * 2 );
     this .rehash(prime);
}
private void rehash( int newsize)
{
     this .occupancy = 0 ;
    Hashtable.bucket[] newBuckets = new Hashtable.bucket[newsize];
     for ( int i = 0 ; i < this .buckets.Length; i ++ )
    {
        Hashtable.bucket bucket = this .buckets[i];
         if ((bucket.key != null ) && (bucket.key != this .buckets))
        {
             this .putEntry(newBuckets, bucket.key, bucket.val, bucket.hash_coll & 0x7fffffff );
        }
    }
    Thread.BeginCriticalRegion();
     this .isWriterInProgress = true ;
     this .buckets = newBuckets;
     this .loadsize = ( int ) ( this .loadFactor * newsize);
     this .UpdateVersion();
     this .isWriterInProgress = false ;
    Thread.EndCriticalRegion();
}

HashTable数据结构存在问题：空间利用率偏低、受填充因子影响大、扩容时所有的数据需要重新进行散列计算。虽然Hash具有O(1)的数据检索效率，但它空间开销却通常很大，是以空间换取时间。所以Hashtable适用于读取操作频繁，写入操作很少的操作类型。

而Dictionary<K, V> 也是用的Hash算法，通过数组实现多条链式结构。不过它是采用分离链接散列法。采用分离链接散列法不受到装填因子的影响，扩容时原有数据不需要重新进行散列计算。

采用分离链接法的 Dictionary<TKey, TValue> 会在内部维护一个链表数组。对于这个链表数组 L0,L1,...，LM-1, 散列函数将告诉我们应当把元素 X 插入到链表的什么位置。然后在 find 操作时告诉我们哪一个表中包含了 X。这种方法的思想在于：尽管搜索一个链表是线性操作，但如果表足够小，搜索非常快(事实也的确如此，同时这也是查找，插入，删除等操作并非总是 O(1) 的原因)。特别是，它不受装填因子的限制。
这种情况下，常见的装填因子是 1.0。更低的装填因子并不能明显的提高性能，但却需要更多的额外空间。

Dictionary .ctor

public Dictionary() : this ( 0 , null )
{
}
public Dictionary( int capacity, IEqualityComparer < TKey > comparer)
{
     if (capacity < 0 )
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity);
    }
     if (capacity > 0 )
    {
         this .Initialize(capacity);
    }
     if (comparer == null )
    {
        comparer = EqualityComparer < TKey > .Default;
    }
     this .comparer = comparer;
}
private void Resize()
{
     int prime = HashHelpers.GetPrime( this .count * 2 );
     int [] numArray = new int [prime];
     for ( int i = 0 ; i < numArray.Length; i ++ )
    {
        numArray[i] = - 1 ;
    }
    Entry < TKey, TValue > [] destinationArray = new Entry < TKey, TValue > [prime];
    Array.Copy( this .entries, 0 , destinationArray, 0 , this .count);
     for ( int j = 0 ; j < this .count; j ++ )
    {
         int index = destinationArray[j].hashCode % prime;
        destinationArray[j].next = numArray[index];
        numArray[index] = j;
    }
     this .buckets = numArray;
     this .entries = destinationArray;
}

Dictionary的插入算法：1、计算key的hash值，并且找到buckets中目标桶的链首索引，2、从链上依次查找是否key已经保存，3、如果没有的话，判断是否存在freeList，4、如果存在freeList，从freeList上摘下结点保存数据，否则追加在count位置上。

Dictionary Add

private void Insert(TKey key, TValue value, bool add)
{
     int freeList;
     if (key == null )
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }
     if ( this .buckets == null )
    {
         this .Initialize( 0 );
    }
     int num = this .comparer.GetHashCode(key) & 0x7fffffff ;
     int index = num % this .buckets.Length;
     for ( int i = this .buckets[index]; i >= 0 ; i = this .entries[i].next)
    {
         if (( this .entries[i].hashCode == num) && this .comparer.Equals( this .entries[i].key, key))
        {
             if (add)
            {
                ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate);
            }
             this .entries[i].value = value;
             this .version ++ ;
             return ;
        }
    }
     if ( this .freeCount > 0 )
    {
        freeList = this .freeList;
         this .freeList = this .entries[freeList].next;
         this .freeCount -- ;
    }
     else
    {
         if ( this .count == this .entries.Length)
        {
             this .Resize();
            index = num % this .buckets.Length;
        }
        freeList = this .count;
         this .count ++ ;
    }
     this .entries[freeList].hashCode = num;
     this .entries[freeList].next = this .buckets[index];
     this .entries[freeList].key = key;
     this .entries[freeList].value = value;
     this .buckets[index] = freeList;
     this .version ++ ;
}

buckets数组保存所有数据链的链首，Buckets[i]表示在桶i中数据链的链首元素。entries结构体数组用于保存实际的数据，通过next值作为链式结构的向后索引。删除的数据空间会被串入到freeList链表的首部，当再次插入数据时，会首先查找freeList链表，以提高查找entries中空闲数据项位置的效率。在枚举器中，枚举顺序为entries数组的下标递增顺序。

Dictionary Remove

public bool Remove(TKey key)
{
     if (key == null )
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }
     if ( this .buckets != null )
    {
         int num = this .comparer.GetHashCode(key) & 0x7fffffff ;
         int index = num % this .buckets.Length;
         int num3 = - 1 ;
         for ( int i = this .buckets[index]; i >= 0 ; i = this .entries[i].next)
        {
             if (( this .entries[i].hashCode == num) && this .comparer.Equals( this .entries[i].key, key))
            {
                 if (num3 < 0 )
                {
                     this .buckets[index] = this .entries[i].next;
                }
                 else
                {
                     this .entries[num3].next = this .entries[i].next;
                }
                 this .entries[i].hashCode = - 1 ;
                 this .entries[i].next = this .freeList;
                 this .entries[i].key = default (TKey);
                 this .entries[i].value = default (TValue);
                 this .freeList = i;
                 this .freeCount ++ ;
                 this .version ++ ;
                 return true ;
            }
            num3 = i;
        }
    }
     return false ;
}

而SortedDictionary，MSDN是这样描述的：

SortedDictionary<(Of <(TKey, TValue>)>) 泛型类是检索运算复杂度为 O(log n) 的二叉搜索树，其中 n 是字典中的元素数。就这一点而言，它与 SortedList<(Of <(TKey, TValue>)>) 泛型类相似。这两个类具有相似的对象模型，并且都具有 O(log n) 的检索运算复杂度。这两个类的区别在于内存的使用以及插入和移除元素的速度：

SortedList<(Of <(TKey, TValue>)>) 使用的内存比 SortedDictionary<(Of <(TKey, TValue>)>) 少。
SortedDictionary<(Of <(TKey, TValue>)>) 可对未排序的数据执行更快的插入和移除操作：它的时间复杂度为 O(log n)，而 SortedList<(Of <(TKey, TValue>)>) 为 O(n)。
如果使用排序数据一次性填充列表，则 SortedList<(Of <(TKey, TValue>)>) 比 SortedDictionary<(Of <(TKey, TValue>)>) 快。

SortedDictionary<K, V>是按照K有序排列的(K, V)数据结构，以红黑树作为内部数据结构对K进行排列保存– TreeSet<T>，红黑树是一棵二叉搜索树，每个结点具有黑色或者红色的属性。它比普通的二叉搜索树拥有更好的平衡性。2-3-4树是红黑树在“理论”上的数据结构。

2-3-4树插入算法：类似于二叉搜索树的插入（插入数据插入到树的叶子结点），如果插入位置是2-结点或者3-结点，那么直接插入到当前结点，如果插入位置是4-结点，需要将当前的4-结点进行拆分，然后再执行后继的插入操作。

SortedDictionary Add

public void Add(T item)
{
     if ( this .root == null )
    {
         this .root = new Node < T > (item, false );
         this .count = 1 ;
    }
     else
    {
        Node < T > root = this .root;
        Node < T > node = null ;
        Node < T > grandParent = null ;
        Node < T > greatGrandParent = null ;
         int num = 0 ;
         while (root != null )
        {
            num = this .comparer.Compare(item, root.Item);
             if (num == 0 )
            {
                 this .root.IsRed = false ;
                ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate);
            }
             if (TreeSet < T > .Is4Node(root))
            {
                TreeSet < T > .Split4Node(root);
                 if (TreeSet < T > .IsRed(node))
                {
                     this .InsertionBalance(root, ref node, grandParent, greatGrandParent);
                }
            }
            greatGrandParent = grandParent;
            grandParent = node;
            node = root;
            root = (num < 0 ) ? root.Left : root.Right;
        }
        Node < T > current = new Node < T > (item);
         if (num > 0 )
        {
            node.Right = current;
        }
         else
        {
            node.Left = current;
        }
         if (node.IsRed)
        {
             this .InsertionBalance(current, ref node, grandParent, greatGrandParent);
        }
         this .root.IsRed = false ;
         this .count ++ ;
         this .version ++ ;
    }
}

我们来测试一下Hashtable、Dictionary和SortedDictionary的插入和查找性能。

性能测试代码

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;

namespace DictionaryTest
{
     class Program
    {
         private static int totalCount = 10000 ;

         static void Main( string [] args)
        {
            HashtableTest();
            DictionaryTest();
            SortedDictionaryTest();
            Console.ReadKey();
        }

         private static void HashtableTest()
        {
            Hashtable hastable = new Hashtable();
            Stopwatch watch = new Stopwatch();
            watch.Start();
             for ( int i = 1 ; i < totalCount; i ++ )
            {
                hastable.Add(i, 0 );
            }
            watch.Stop();
            Console.WriteLine( string .Format( " Hashtable添加{0}个元素耗时：{1}ms " ,totalCount, watch.ElapsedMilliseconds));
            Console.WriteLine( " Hashtable不做查找测试 " );
            hastable.Clear();
        }

         private static void DictionaryTest()
        {
            Dictionary < int , int > dictionary = new Dictionary < int , int > ();
            Stopwatch watch = new Stopwatch();
            watch.Start();
             for ( int i = 1 ; i < totalCount; i ++ )
            {
                dictionary.Add(i, 0 );
            }
            watch.Stop();
            Console.WriteLine( string .Format( " Dictionary添加{0}个元素耗时：{1}ms " ,totalCount, watch.ElapsedMilliseconds));
            watch.Reset();
            watch.Start();
            dictionary.Select(o => o.Key % 1000 == 0 ).ToList().ForEach(o => { });
            watch.Stop();
            Console.WriteLine( string .Format( " Dictionary查找能被1000整除的元素耗时：{0}ms " , watch.ElapsedMilliseconds));
            dictionary.Clear();
        }

         private static void SortedDictionaryTest()
        {
            SortedDictionary < int , int > dictionary = new SortedDictionary < int , int > ();
            Stopwatch watch = new Stopwatch();
            watch.Start();
             for ( int i = 1 ; i < totalCount; i ++ )
            {
                dictionary.Add(i, 0 );
            }
            watch.Stop();
            Console.WriteLine( string .Format( " SortedDictionary添加{0}个元素耗时：{1}ms " ,totalCount, watch.ElapsedMilliseconds));
            watch.Reset();
            watch.Start();
            dictionary.Select(o => o.Key % 1000 == 0 ).ToList().ForEach(o => { });
            watch.Stop();
            Console.WriteLine( string .Format( " SortedDictionary查找能被1000整除的元素耗时：{0}ms " , watch.ElapsedMilliseconds));
            dictionary.Clear();
        }
    }
}

最终结果如图：