字典：哈希结构的应用

balalabiu

已于 2023-03-04 16:47:42 修改

阅读量90

点赞数

分类专栏：算法与数据结构文章标签： python Powered by 金山文档

于 2023-02-12 17:58:02 首次发布

本文链接：https://blog.csdn.net/balalabiu/article/details/128997423

版权

算法与数据结构专栏收录该内容

3 篇文章 0 订阅

订阅专栏

什么是哈希结构？

通过关键字来访问内存存储位置的数据结构。简单来说哈希结构由一个散列函数和一个数组组成，以key-value为例，通过散列函数对key进行计算，得到的值对应数组中的某个下标，于是就将value存于该位置上。

哈希值	key	value
哈希值	key	value
哈希值	key	value
哈希值	key	value
哈希值	key	value
哈希值	key	value
哈希值	key	value
哈希值	key	value

解决哈希冲突

开放定址法（线性探查）与链表法解决哈希冲突

import hashlib


class MyDict:
    """Toy dictionary stored in a fixed-size array using open addressing.

    The MD5 digest of a key modulo the array length selects its home slot.
    Collisions are resolved by linear probing: step to the next slot and wrap
    around at the end of the array. Entries are printed in insertion order.
    Keys must be ``str`` because they are UTF-8 encoded before hashing. The
    array never grows and entries cannot be removed.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        # Each slot is [hash_value, key, value]; [None, None, None] marks an empty slot.
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes in the order keys were first inserted (drives __repr__).
        self.add_order = []

    def add(self, k, v):
        """Store ``k`` -> ``v``; overwrite the value if ``k`` is already present.

        Raises:
            Exception: if every slot is already occupied by other keys.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        size = len(self.values)
        home = hash_value % size
        # Linear probing: visit every slot at most once, starting at the home slot.
        for offset in range(size):
            location = (home + offset) % size
            slot = self.values[location]
            if slot[0] is None:
                self.values[location] = [hash_value, k, v]
                # Remember where the new entry lives so output keeps insertion order.
                self.add_order.append(location)
                return
            if slot[1] == k:
                # Existing key: update in place instead of storing a duplicate
                # that lookups would never reach.
                slot[2] = v
                return
        raise Exception('该对底层数组扩容了！')

    def get(self, k):
        """Return the value stored for ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        size = len(self.values)
        home = self._gen_unchanged_hash_value(k) % size
        for offset in range(size):
            slot = self.values[(home + offset) % size]
            if slot[0] is None:
                # Entries are never removed, so an empty slot ends the probe chain.
                break
            if slot[1] == k:
                return slot[2]
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries as ``{'key': value, ...}`` in insertion order."""
        entries = (self.values[index] for index in self.add_order)
        return '{' + ', '.join(f"'{key!s}': {value!s}" for _, key, value in entries) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        The result depends only on the key's UTF-8 bytes, so it is identical
        on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)


if __name__ == '__main__':
    # Demo: fill all eight slots of the default table, then read one key back.
    # A ninth new key (e.g. 'j') is deliberately not added because the table
    # has only eight slots.
    my_dict = MyDict()
    entries = [('c', 3), ('d', 4), ('e', 6), ('y', 9),
               ('f', 10), ('g', 11), ('h', 12), ('i', 13)]
    for key, value in entries:
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.get('h') == 12)  # True
    # Output recorded by the author:
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13}
    # [[237879873640563870927460284185496028253, 'g', 11], [49268479078006861543109070154241760913, 'h', 12],
    #  [299611584147932843547128611849858313266, 'e', 6], [99079589977253916124855502156832923443, 'c', 3],
    #  [178594487029704683911797514996985530177, 'i', 13], [173422077530204247440288476180261147053, 'd', 4],
    #  [86828518130618008439946455853590066269, 'y', 9], [190917122200326810055233066464581373159, 'f', 10]]
    # True

使用开放定址法的线性探查来解决哈希冲突时，随着添加的key-value越来越多，存放key-value的数组最终会被填满，此时就需要对哈希结构扩容；python的dict扩容时，会将已经存在的键值对重新哈希取余，并存放到新的位置。

使用链表法解决哈希冲突时，每个桶可以存放多个元素，因此哈希结构的装填因子可以大于1。若要保留字典的有序输出特性，维护的元素添加顺序表也要做相应改变：其中的元素由原来的整型变为元组（桶的位置, 桶中元素的位置），代码如下：

import hashlib


class MyDict:
    """Toy dictionary that resolves hash collisions by separate chaining.

    The MD5 digest of a key modulo the array length selects a bucket; each
    bucket is a list of ``(key, value)`` tuples, so any number of keys can
    share one bucket. Entries are printed in insertion order. Keys must be
    ``str`` because they are UTF-8 encoded before hashing.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` buckets."""
        self.values = [[] for _ in range(arr_length)]
        # (bucket index, position inside the bucket) of every key, in insertion order.
        self.add_order = []

    def add(self, k, v):
        """Store ``k`` -> ``v``; overwrite the value if ``k`` is already present."""
        hash_value = self._gen_unchanged_hash_value(k)
        barrel_location = hash_value % len(self.values)
        bucket = self.values[barrel_location]
        for position, (key, _) in enumerate(bucket):
            if key == k:
                # Existing key: replace the tuple in place so lookups see the
                # new value and the key is not listed twice.
                bucket[position] = (k, v)
                return
        # New key: it goes to the end of the bucket, at index len(bucket).
        self.add_order.append((barrel_location, len(bucket)))
        bucket.append((k, v))

    def get(self, k):
        """Return the value stored for ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        bucket = self.values[self._gen_unchanged_hash_value(k) % len(self.values)]
        for key, value in bucket:
            if key == k:
                return value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries as ``{'key': value, ...}`` in insertion order."""
        entries = (self.values[bucket][position] for bucket, position in self.add_order)
        return '{' + ', '.join(f"'{key!s}': {value!s}" for key, value in entries) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        The result depends only on the key's UTF-8 bytes, so it is identical
        on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)


if __name__ == '__main__':
    # Demo: nine keys fit into eight buckets because colliding keys share a bucket.
    my_dict = MyDict()
    entries = [('c', 3), ('d', 4), ('e', 6), ('y', 9), ('f', 10),
               ('g', 11), ('h', 12), ('i', 13), ('j', 14)]
    for key, value in entries:
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.get('h') == 12)  # True
    print(my_dict.get('l'))  # key l not exist
    # Output recorded by the author (first two lines):
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
    # [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [],
    #  [('f', 10)]]

扩缩容（改变字典底层存储数据的数组长度）

问题引入：

# With 8 buckets: repr and bucket contents after adding the nine demo keys.
my_dict = MyDict(arr_length=8)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [], [('f', 10)]]

# With 11 buckets: the same keys end up in different buckets.
my_dict = MyDict(arr_length=11)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('y', 9)], [('d', 4)], [('c', 3)], [], [('h', 12)], [], [('e', 6), ('j', 14)], [('i', 13)], [('f', 10)], [('g', 11)]]

上面两段代码可以看出，当改变字典底层存储数据的数组长度时，会打乱key-value在数组中出现的位置，主要是因为hash(key)对数组长度的余数变了。

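redis集群部署中使用了哈希槽分区的概念（Redis Cluster实际共有16384个槽，槽号为 CRC16(key) mod 16384）。这里做一个简化的类比：可以把哈希空间想象为一个大圆环（0， 2^16 - 1），将其进行N等分，用hash(k)对圆环的长度取余，按顺时针方向计算，余数落在哪个区间，就将该key-value放置于哪个hash槽中。下面的示例代码为便于演示，把圆环长度缩小为 2^5 - 1，代码实现如下：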

import math

import hashlib


class MyDict:
    """Toy dictionary that places keys on a hash ring divided into slots.

    The ring covers the remainders ``0 .. circular_max - 1``. It is cut into
    ``arr_length`` slots whose starting points are ``circular_max // arr_length``
    apart. A key is stored in the slot whose range contains
    ``md5(key) % circular_max``; the last slot also takes the remainders left
    over at the end of the ring. Keys that share a slot are chained in a list.
    Keys must be ``str`` because they are UTF-8 encoded before hashing.
    """

    def __init__(self, arr_length=8):
        """Create an empty ring with ``arr_length`` slots."""
        # Ring size used for the modulo; 2**5 - 1 == 31.
        self.circular_max = 2 ** 5 - 1
        self.arr_length = arr_length
        # One [slot start, [(key, value), ...]] pair per slot, ascending by slot start.
        self.values = []
        # Slot starting points, sorted in descending order.
        self.locations = []
        # (slot index, position inside the slot) of every key, in insertion order.
        self.add_order = []
        self._gen_bottom_stored_data_array()

    def _gen_bottom_stored_data_array(self):
        """Populate ``values`` and ``locations`` with evenly spaced slot starts."""
        step = self.circular_max // self.arr_length
        for index in range(self.arr_length):
            start = step * index
            self.values.append([start, []])
            self.locations.append(start)
        self.locations.sort(reverse=True)

    def add(self, k, v):
        """Store ``k`` -> ``v``; overwrite the value if ``k`` is already present."""
        remainder = self._gen_unchanged_hash_value(k) % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        bucket = self.values[location_index][1]
        for position, (key, _) in enumerate(bucket):
            if key == k:
                # Existing key: replace in place so lookups see the new value.
                bucket[position] = (k, v)
                return
        self.add_item(k, v, location_index, bucket)

    def add_item(self, k, v, barrel_location, values):
        """Append ``(k, v)`` to the bucket ``values`` and record its position.

        ``barrel_location`` is the index of the bucket's slot in ``self.values``.
        """
        index = len(values)
        self.add_order.append((barrel_location, index))
        values.append((k, v))

    def _get_clockwise_latest_location(self, remainder):
        """Return the index in ``self.values`` of the slot that owns ``remainder``.

        The owning slot is the one with the largest starting point that is not
        greater than ``remainder``. Returns -1 if no slot starts at or below
        ``remainder``.
        """
        # ``locations`` is descending while ``values`` is ascending, so the
        # position found here is mirrored to get the index into ``values``.
        for position, start in enumerate(self.locations):
            if remainder >= start:
                return len(self.locations) - 1 - position
        return -1

    def get(self, k):
        """Return the value stored for ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        remainder = self._gen_unchanged_hash_value(k) % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        for key, value in self.values[location_index][1]:
            if key == k:
                return value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries as ``{'key': value, ...}`` in insertion order."""
        entries = (self.values[slot][1][position] for slot, position in self.add_order)
        return '{' + ', '.join(f"'{key!s}': {value!s}" for key, value in entries) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        The result depends only on the key's UTF-8 bytes, so it is identical
        on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)


if __name__ == '__main__':
    # Demo: add nine keys to an eight-slot ring, then look up a present and a missing key.
    my_dict = MyDict(arr_length=8)
    entries = [('c', 3), ('d', 4), ('e', 6), ('f', 9), ('g', 10),
               ('h', 11), ('i', 12), ('j', 13), ('k', 14)]
    for key, value in entries:
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.get('h') == 11)  # True
    print(my_dict.get('l'))  # key l not exist
    # Output recorded by the author:
    # {'c': 3, 'd': 4, 'e': 6, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14}
    # [[0, [('c', 3), ('e', 6), ('g', 10), ('k', 14)]], [3, []], [6, []], [9, []], [12, []], [15, []], [18, []], [21, [('d', 4), ('f', 9), ('h', 11), ('i', 12), ('j', 13)]]]
    # True
    # key l not exist

需要进行扩容时，重新对大圆形（0， 2^16-1）进行N+M等分，不过是对原本的N个槽区，每个槽区都切割一部分，划给新增的M个槽区，这样可以保证一大部分数据在哈希槽中的位置不变

待实现？

缩容时，将去掉的某个槽区的key-value全部移植到某个指定的槽区即可

待实现？

字典的有序输出

python3.6之前，字典的输出是无序的，主要是因为key-value在底层数组中的存放位置由散列函数求得的哈希值对数组长度取余后的余数决定，输出时按数组顺序遍历，因此输出顺序并不是赋值时的顺序；而从python3.6开始（自3.7起成为语言规范），通过多维护一张表（入字典的次序-value数组中的位置），使得字典的输出是有序的

hash(key1) % 8 = 2

hash(key2) % 8 = 7



hash	key1	value

在python3.6之前可以简单的认为python中字典的key-value仅仅存在于一个二维数组中，因此hash(key) 对数组长度取余后的位置索引会出现在数组的某个位置上，例如：

{'c': 3, 'd': 4}
[[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3], [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

获取字典中所有的键值对时，顺序遍历二维数组中的值，当某个key的hash值对长度取余后，其落在数组靠前的位置索引处，于是在遍历字典的key-value时，就不是按照字典的添加顺序展示了，如下所示：

# Keys are added in the order c, d, e.
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)

# Resulting repr and slot array: 'e' sits in slot 2, ahead of 'c' (slot 3)
# and 'd' (slot 5), so slot-order output differs from insertion order.
# {'e': 6, 'c': 3, 'd': 4}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
#  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
#  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

hash冲突是指不同的key，利用散列函数得到的值对数组长度取余后，在数组中的索引位置相同，就会出现后者覆盖前者的情况，如下所示，字符'd'与字符'y'通过计算得到的索引值相同，于是'y'-value就覆盖了'd'-value

# 'd' and 'y' map to the same slot of an eight-slot array.
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)
my_dict.add('y', 9)
print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True

# Resulting repr and slot array: slot 5 now holds 'y'; the entry for 'd' is gone.
# {'e': 6, 'c': 3, 'y': 9}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
#  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
#  [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]

此部分完整代码如下：

import hashlib


class MyDict:
    """Minimal hash table with one ``[hash, key, value]`` slot per array index.

    There is no collision handling: a key whose hash maps to an occupied slot
    replaces that slot's contents. Entries are printed in slot order, not in
    insertion order. Keys must be ``str`` because they are UTF-8 encoded
    before hashing.
    """

    def __init__(self, arr_length=8):
        """Create a table of ``arr_length`` empty slots."""
        self.values = [[None] * 3 for _ in range(arr_length)]

    def add(self, k, v):
        """Write ``[hash, k, v]`` into the slot selected by the key's hash."""
        digest = self._gen_unchanged_hash_value(k)
        self.values[digest % len(self.values)] = [digest, k, v]

    def get(self, k):
        """Return the value held in the slot that ``k`` hashes to.

        The stored key is not compared with ``k``.
        """
        slot = self.values[self._gen_unchanged_hash_value(k) % len(self.values)]
        return slot[2]

    def __repr__(self):
        """Render the occupied slots as ``{'key': value, ...}`` in slot order."""
        pairs = [f"'{slot[1]!s}': {slot[2]!s}" for slot in self.values if slot[0]]
        return '{' + ', '.join(pairs) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer."""
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    my_dict = MyDict()
    for key, value in (('c', 3), ('d', 4)):
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Output is not in insertion order: 'e' lands in slot 2, ahead of 'c' and 'd'.
    my_dict.add('e', 6)
    print(my_dict, my_dict.values, sep='\n')
    # {'e': 6, 'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Hash collision: 'y' maps to the same slot as 'd' and replaces it.
    my_dict.add('y', 9)
    print(my_dict, my_dict.values, sep='\n')
    d_slot = my_dict._gen_unchanged_hash_value('d') % 8
    y_slot = my_dict._gen_unchanged_hash_value('y') % 8
    print(d_slot == y_slot)  # True
    # {'e': 6, 'c': 3, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]

字典的有序输出

python3.6之后字典的输出是有序的，主要是因为底层维护了一个key-value元素的添加顺序表，进行字典输出时，遍历该列表中的值，其对应着二维数组中的key-value的存储位置，代码如下

import hashlib


class MyDict:
    """Hash table that prints its entries in insertion order.

    Storage is one ``[hash, key, value]`` slot per array index, with no
    collision handling: a key whose hash maps to an occupied slot replaces
    that slot's contents. ``add_order`` records the slot index used by every
    ``add`` call, and ``__repr__`` follows that list instead of walking the
    slots in array order. Keys must be ``str`` because they are UTF-8 encoded
    before hashing.
    """

    def __init__(self, arr_length=8):
        """Create a table of ``arr_length`` empty slots."""
        self.values = [[None] * 3 for _ in range(arr_length)]
        # Slot index written by each add() call, in call order.
        self.add_order = []

    def add(self, k, v):
        """Write ``[hash, k, v]`` into the key's slot and record that slot index."""
        digest = self._gen_unchanged_hash_value(k)
        slot_index = digest % len(self.values)
        self.values[slot_index] = [digest, k, v]
        self.add_order.append(slot_index)

    def get(self, k):
        """Return the value held in the slot that ``k`` hashes to.

        The stored key is not compared with ``k``.
        """
        slot = self.values[self._gen_unchanged_hash_value(k) % len(self.values)]
        return slot[2]

    def __repr__(self):
        """Render ``{'key': value, ...}`` by following ``add_order``."""
        slots = (self.values[slot_index] for slot_index in self.add_order)
        return '{' + ', '.join(f"'{slot[1]!s}': {slot[2]!s}" for slot in slots) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer."""
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    my_dict = MyDict()
    for key, value in (('c', 3), ('d', 4)):
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' is stored in slot 2, ahead of 'c' and 'd' in the array, but the
    # printed order now follows add_order, so it stays in insertion order.
    my_dict.add('e', 6)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.add_order)  # [3, 5, 2]
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # The hash-collision step (adding 'y', which maps to the same slot as 'd')
    # is intentionally not run in this demo.

解决hash冲突

最简单的解决hash冲突的方法为开放定址法中的线性探查法，从发生冲突的单元起，依次判断下一个单元是否为空，当达到最后一个单元时，再从表首依次判断。直到碰到空闲的单元或者探查完全部单元为止

import hashlib


class MyDict:
    """Toy dictionary stored in a fixed-size array using open addressing.

    The MD5 digest of a key modulo the array length selects its home slot.
    Collisions are resolved by linear probing: step to the next slot and wrap
    around at the end of the array. Entries are printed in insertion order
    (by following ``add_order``) rather than in slot order. Keys must be
    ``str`` because they are UTF-8 encoded before hashing. The array never
    grows and entries cannot be removed.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        # Each slot is [hash_value, key, value]; [None, None, None] marks an empty slot.
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes in the order keys were first inserted (drives __repr__).
        self.add_order = []

    def add(self, k, v):
        """Store ``k`` -> ``v``; overwrite the value if ``k`` is already present.

        Raises:
            Exception: if every slot is already occupied by other keys.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        size = len(self.values)
        home = hash_value % size
        # Linear probing: visit every slot at most once, starting at the home slot.
        for offset in range(size):
            location = (home + offset) % size
            slot = self.values[location]
            if slot[0] is None:
                self.values[location] = [hash_value, k, v]
                # Remember where the new entry lives so output keeps insertion order.
                self.add_order.append(location)
                return
            if slot[1] == k:
                # Existing key: update in place instead of storing a duplicate
                # that lookups would never reach.
                slot[2] = v
                return
        raise Exception('该对底层数组扩容了！')

    def get(self, k):
        """Return the value stored for ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        size = len(self.values)
        home = self._gen_unchanged_hash_value(k) % size
        for offset in range(size):
            slot = self.values[(home + offset) % size]
            if slot[0] is None:
                # Entries are never removed, so an empty slot ends the probe chain.
                break
            if slot[1] == k:
                return slot[2]
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries as ``{'key': value, ...}`` in insertion order."""
        entries = (self.values[index] for index in self.add_order)
        return '{' + ', '.join(f"'{key!s}': {value!s}" for _, key, value in entries) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        The result depends only on the key's UTF-8 bytes, so it is identical
        on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)


if __name__ == '__main__':
    my_dict = MyDict()
    for key, value in (('c', 3), ('d', 4)):
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' is stored in slot 2, ahead of 'c' and 'd' in the array, but the
    # printed order follows add_order, so it stays in insertion order.
    my_dict.add('e', 6)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.add_order)  # [3, 5, 2]
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Hash collision: 'y' has the same home slot as 'd' (slot 5), so linear
    # probing stores it in the next free slot (slot 6) and 'd' is kept.
    my_dict.add('y', 9)
    print(my_dict, my_dict.values, sep='\n')
    d_slot = my_dict._gen_unchanged_hash_value('d') % 8
    y_slot = my_dict._gen_unchanged_hash_value('y') % 8
    print(d_slot == y_slot)  # True
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [86828518130618008439946455853590066269, 'y', 9],
    #  [None, None, None]]
    print(my_dict.add_order)  # [3, 5, 2, 6]
    print(my_dict.get('y'))  # 9

    # Fill the remaining slots; every further collision is resolved by probing.
    # A ninth new key (e.g. 'j') is deliberately not added because the table
    # has only eight slots.
    for key, value in (('f', 10), ('g', 11), ('h', 12), ('i', 13)):
        my_dict.add(key, value)
    print(my_dict, my_dict.values, sep='\n')
    print(my_dict.get('g') == 11)  # True