字典:哈希结构的应用

  1. 什么是哈希结构?

通过关键字来访问内存存储位置的数据结构。简单来说哈希结构由一个散列函数和一个数组组成,以key-value为例,通过散列函数对key进行计算,得到的值对应数组中的某个下标,于是就将value存于该位置上。

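底层数组示意(8个槽位,每个槽位存放一条 哈希值 / key / value 记录):

| 数组下标 | 槽位内容 |
| --- | --- |
| 0 | 哈希值, key, value |
| 1 | 哈希值, key, value |
| 2 | 哈希值, key, value |
| 3 | 哈希值, key, value |
| 4 | 哈希值, key, value |
| 5 | 哈希值, key, value |
| 6 | 哈希值, key, value |
| 7 | 哈希值, key, value |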

  1. 解决哈希冲突

开放定址法(线性探查法)解决哈希冲突

import hashlib


class MyDict:
    """Toy insertion-ordered dictionary using open addressing (linear probing).

    ``values`` is a fixed-size list of ``[hash_value, key, value]`` slots;
    ``[None, None, None]`` marks an empty slot.  ``add_order`` stores the slot
    index of every key in the order the keys were first added, which is what
    lets ``repr`` list the items in insertion order.  The table never grows:
    once every slot is taken, adding a new key raises.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes of the stored keys, in insertion order.
        self.add_order = []

    def _find_slot(self, k, hash_value):
        """Probe linearly from the key's home slot.

        Returns the index of the slot that already holds ``k``, or of the
        first empty slot met on the way, or ``None`` when every slot was
        visited without finding either.
        """
        size = len(self.values)
        location = hash_value % size
        # Visit each slot at most once so a full table cannot loop forever.
        for _ in range(size):
            slot = self.values[location]
            if slot[0] is None or slot[1] == k:
                return location
            location = (location + 1) % size  # wrap around to the table head
        return None

    def add(self, k, v):
        """Store ``v`` under ``k``; an existing key is updated in place.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.

        Raises:
            Exception: the table is full and ``k`` is not already stored.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = self._find_slot(k, hash_value)
        if location is None:
            raise Exception('该对底层数组扩容了!')
        is_new_key = self.values[location][0] is None
        self.values[location] = [hash_value, k, v]
        # An update keeps the key's original position in the insertion order.
        if is_new_key:
            self.add_order.append(location)

    def get(self, k):
        """Return the value stored under ``k``.

        Raises:
            KeyError: ``k`` is not in the table.
        """
        location = self._find_slot(k, self._gen_unchanged_hash_value(k))
        # Nothing is ever deleted, so reaching an empty slot means k is absent.
        if location is None or self.values[location][0] is None:
            raise KeyError(k)
        return self.values[location][2]

    def __repr__(self):
        """Render the items dict-style, in insertion order."""
        items = (
            f"'{self.values[index][1]}': {self.values[index][2]}"
            for index in self.add_order
        )
        return '{' + ', '.join(items) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    # Fill every slot of the default 8-slot table.
    my_dict = MyDict()
    for key, value in (('c', 3), ('d', 4), ('e', 6), ('y', 9),
                       ('f', 10), ('g', 11), ('h', 12), ('i', 13)):
        my_dict.add(key, value)
    # A ninth key, e.g. my_dict.add('j', 14), would find no free slot and make add() raise.
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('h') == 12)  # True
    # Expected output:
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13}
    # [[237879873640563870927460284185496028253, 'g', 11], [49268479078006861543109070154241760913, 'h', 12],
    #  [299611584147932843547128611849858313266, 'e', 6], [99079589977253916124855502156832923443, 'c', 3],
    #  [178594487029704683911797514996985530177, 'i', 13], [173422077530204247440288476180261147053, 'd', 4],
    #  [86828518130618008439946455853590066269, 'y', 9], [190917122200326810055233066464581373159, 'f', 10]]
    # True

使用开放定址法的线性探查来解决哈希冲突时,随着添加的key-value越来越多,存放key-value的数组最终会被填满,此时就需要对哈希结构扩容。python的dict扩容时,会将已经存在的键值对按新的数组长度重新取余,存放到新的位置。

使用链表法解决哈希冲突时,哈希结构的装填因子可以大于1(元素个数可以超过桶的个数)。若要保留字典的有序输出特性,维护的元素添加顺序表要做改变:表中的每一项由原来的整型下标变为元组(桶的位置, 桶中元素的位置),代码如下:

import hashlib


class MyDict:
    """Toy insertion-ordered dictionary using separate chaining.

    ``values`` is a list of buckets; each bucket is a list of ``(key, value)``
    tuples whose hashes share the same remainder.  ``add_order`` stores a
    ``(bucket_index, index_in_bucket)`` pair for every key in the order the
    keys were first added, which is what lets ``repr`` list the items in
    insertion order.
    """

    def __init__(self, arr_length=8):
        """Create ``arr_length`` empty buckets."""
        self.values = [[] for _ in range(arr_length)]
        # (bucket_index, index_in_bucket) of each key, in insertion order.
        self.add_order = []

    def add(self, k, v):
        """Store ``v`` under ``k``; an existing key is updated in place.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.
        """
        barrel_location = self._gen_unchanged_hash_value(k) % len(self.values)
        bucket = self.values[barrel_location]
        for index, (stored_key, _) in enumerate(bucket):
            if stored_key == k:
                # Replace the tuple so get() and repr() see the new value;
                # the key keeps its original place in add_order.
                bucket[index] = (k, v)
                return
        # Colliding keys simply share the bucket; the new entry goes at its end.
        self.add_order.append((barrel_location, len(bucket)))
        bucket.append((k, v))

    def get(self, k):
        """Return the value stored under ``k``.

        A missing key does not raise: the string ``'key <k> not exist'`` is
        returned instead.
        """
        barrel_location = self._gen_unchanged_hash_value(k) % len(self.values)
        for stored_key, stored_value in self.values[barrel_location]:
            if stored_key == k:
                return stored_value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the items dict-style, in insertion order."""
        parts = []
        for bucket_index, item_index in self.add_order:
            key, value = self.values[bucket_index][item_index]
            parts.append(f"'{key}': {value}")
        return '{' + ', '.join(parts) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    # Nine keys in eight buckets: chaining lets several keys share one bucket.
    my_dict = MyDict()
    for key, value in (('c', 3), ('d', 4), ('e', 6), ('y', 9), ('f', 10),
                       ('g', 11), ('h', 12), ('i', 13), ('j', 14)):
        my_dict.add(key, value)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('h') == 12)  # True
    print(my_dict.get('l'))  # key l not exist
    # Output of the first two prints:
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
    # [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [],
    #  [('f', 10)]]
  1. 扩缩容(改变字典底层存储数据的数组长度)

问题引入:

# Chained MyDict with 8 buckets, after adding the nine keys 'c' .. 'j' as in the demo above
# (first comment line: repr, second comment line: my_dict.values).
my_dict = MyDict(arr_length=8)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [], [('f', 10)]]
# The same keys with 11 buckets: the insertion order is unchanged, but most keys now sit in a different bucket.
my_dict = MyDict(arr_length=11)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('y', 9)], [('d', 4)], [('c', 3)], [], [('h', 12)], [], [('e', 6), ('j', 14)], [('i', 13)], [('f', 10)], [('g', 11)]]

上面两段代码可以看出,当改变字典底层存储数据的数组长度时,会打乱key-value在数组中出现的位置,主要是因为hash(key)对数组长度的余数变了。

redis集群部署中使用了哈希槽分区的概念(Redis Cluster 实际固定划分为 16384 个哈希槽,槽号为 CRC16(key) mod 16384)。可以把它想象为一个大圆形(0, 2^16 - 1)进行N等分,hash(k)对大圆形的长度进行取余,顺时针计算,余数落在哪个区间,就将该key-value放置于哪个hash槽中。下面的示例代码为了便于观察,把圆的长度缩小为 2^5 - 1,代码实现如下:

import math

import hashlib


class MyDict:
    """Toy insertion-ordered dictionary whose buckets are slots on a hash ring.

    A key's ring position is ``md5(key) % circular_max``.  The ring is cut
    into ``arr_length`` slots starting at ``0, step, 2 * step, ...`` with
    ``step = circular_max // arr_length``; the last slot also takes whatever
    is left at the end of the ring.  A key is stored in the slot with the
    largest start that is not greater than the key's ring position.
    """

    def __init__(self, arr_length=8):
        """Create ``arr_length`` empty slots on a ring of size ``2 ** 5 - 1``."""
        self.circular_max = int(math.pow(2, 5) - 1)
        self.arr_length = arr_length
        # values[i] == [slot_start, [(key, value), ...]], ascending by slot_start.
        self.values = []
        # The slot starts again, sorted in descending order.
        self.locations = []
        # (slot_index, index_in_slot) of each key, in insertion order.
        self.add_order = []
        self._gen_bottom_stored_data_array()

    def _gen_bottom_stored_data_array(self):
        """Append the empty slots to ``values`` and their starts to ``locations``."""
        step = self.circular_max // self.arr_length
        for index in range(self.arr_length):
            self.values.append([step * index, []])
            self.locations.append(step * index)
        self.locations.sort(reverse=True)

    def add(self, k, v):
        """Store ``v`` under ``k``; an existing key is updated in place.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.
        """
        remainder = self._gen_unchanged_hash_value(k) % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        bucket = self.values[location_index][1]
        for index, (stored_key, _) in enumerate(bucket):
            if stored_key == k:
                # The key keeps its original place in add_order.
                bucket[index] = (k, v)
                return
        self.add_item(k, v, location_index, bucket)

    def add_item(self, k, v, barrel_location, values):
        """Append ``(k, v)`` to the bucket ``values`` and record its position.

        ``barrel_location`` is the index of that bucket's slot in
        ``self.values``; it is what ``__repr__`` later uses to find the item.
        """
        self.add_order.append((barrel_location, len(values)))
        values.append((k, v))

    def _get_clockwise_latest_location(self, remainder):
        """Return the index in ``self.values`` of the slot containing ``remainder``.

        ``locations`` is sorted descending while ``values`` is ascending, so
        the first start that is <= ``remainder`` sits at the mirrored index in
        ``values``.
        """
        last = len(self.locations) - 1
        for position, slot_start in enumerate(self.locations):
            if slot_start <= remainder:
                return last - position
        # Not reached for remainder >= 0 while the first slot starts at 0.
        return 0

    def get(self, k):
        """Return the value stored under ``k``.

        A missing key does not raise: the string ``'key <k> not exist'`` is
        returned instead.
        """
        remainder = self._gen_unchanged_hash_value(k) % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        for stored_key, stored_value in self.values[location_index][1]:
            if stored_key == k:
                return stored_value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the items dict-style, in insertion order."""
        parts = []
        for slot_index, item_index in self.add_order:
            key, value = self.values[slot_index][1][item_index]
            parts.append(f"'{key}': {value}")
        return '{' + ', '.join(parts) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    my_dict = MyDict(arr_length=8)
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    my_dict.add('e', 6)
    my_dict.add('f', 9)
    my_dict.add('g', 10)
    my_dict.add('h', 11)
    my_dict.add('i', 12)
    my_dict.add('j', 13)
    my_dict.add('k', 14)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('h') == 11)  # True
    print(my_dict.get('l'))  # key l not exist
    # Expected output:
    # {'c': 3, 'd': 4, 'e': 6, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14}
    # <the eight slots [0, [...]], [3, [...]], ..., [21, [...]]; each key is listed in the slot whose
    #  start is the largest one not greater than md5(key) % 31>
    # True
    # key l not exist

需要进行扩容时,重新对大圆形(0, 2^16-1)进行N+M等分,不过是对原本的N个槽区,每个槽区都切割一部分,划给新增的M个槽区,这样可以保证一大部分数据在哈希槽中的位置不变

待实现?

缩容时,将去掉的某个槽区的key-value全部移植到某个指定的槽区即可

待实现?

  1. 字典的有序输出

python3.6之前,字典的输出是无序的:key-value在底层数组中的位置,由散列函数求得的哈希值对数组长度取余决定,输出时按数组下标顺序遍历,因此输出顺序并不是赋值时的顺序。从python3.6开始,通过多维护一张表(入字典的次序 -> value在数组中的位置),使得字典的输出是有序的(在3.6中属于CPython的实现细节,自3.7起成为语言规范)。

hash(key1) % 8 = 2

hash(key2) % 8 = 7

添加顺序表(按入字典的次序记录数组下标):2, 7, 1, 0, ...

数组中的每个槽位存放一条记录:hash | key1 | value

在python3.6之前可以简单的认为python中字典的key-value仅仅存在于一个二维数组中,因此hash(key) 对数组长度取余后的位置索引会出现在数组的某个位置上,例如:

{'c': 3, 'd': 4}
[[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3], [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

获取字典中所有的键值对时,顺序遍历二维数组中的值,当某个key的hash值对长度取余后,其落在数组靠前的位置索引处,于是在遍历字典的key-value时,就不是按照字典的添加顺序展示了,如下所示:

# Naive MyDict (no insertion-order table): repr walks the slot array from index 0 upwards.
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)

# 'e' was added last but lands in slot 2, ahead of 'c' (slot 3) and 'd' (slot 5), so it is printed first:
# {'e': 6, 'c': 3, 'd': 4}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
#  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
#  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

hash冲突是指不同的key,利用散列函数得到的值对数组长度取余后,在数组中的索引位置相同,就会出现后者覆盖前者的情况,如下所示,字符'd'与字符'y'通过计算得到的索引值相同,于是'y'-value就覆盖了'd'-value

# Naive MyDict: 'd' and 'y' hash to the same slot, so adding 'y' overwrites 'd'.
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)
my_dict.add('y', 9)
print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True

# repr and values after the four adds; slot 5 now holds 'y' and 'd' is gone:
# {'e': 6, 'c': 3, 'y': 9}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
#  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
#  [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]

此部分完整代码如下:

import hashlib


class MyDict:
    """Deliberately naive hash table: one slot per index, no collision handling.

    ``values`` is a fixed-size list of ``[hash_value, key, value]`` slots;
    ``[None, None, None]`` marks an empty slot.  A key whose hash maps to an
    occupied slot overwrites whatever is stored there, and ``repr`` lists the
    items in slot order rather than insertion order.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]

    def add(self, k, v):
        """Store ``v`` under ``k``, replacing any entry already in that slot.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        self.values[hash_value % len(self.values)] = [hash_value, k, v]

    def get(self, k):
        """Return the value stored under ``k``, or ``None`` when ``k`` is absent.

        The stored key is compared with ``k`` so that the value of a different
        key occupying the same slot is never returned.
        """
        slot = self.values[self._gen_unchanged_hash_value(k) % len(self.values)]
        if slot[0] is None or slot[1] != k:
            return None
        return slot[2]

    def __repr__(self):
        """Render the occupied slots dict-style, in slot order."""
        parts = [
            f"'{key}': {value}"
            for stored_hash, key, value in self.values
            if stored_hash is not None
        ]
        return '{' + ', '.join(parts) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    # Demo of the naive table; the comment after each group of prints is its output.
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Output is in slot order, not insertion order: 'e' lands in slot 2, ahead of 'c' (slot 3).
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    # {'e': 6, 'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Hash collision: 'y' maps to the same slot as 'd' (slot 5) and overwrites it.
    my_dict.add('y', 9)
    print(my_dict)
    print(my_dict.values)
    print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True
    # {'e': 6, 'c': 3, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]

字典的有序输出

python3.6之后字典的输出是有序的,主要是因为底层维护了一个key-value元素的添加顺序表,进行字典输出时,遍历该列表中的值,其对应着二维数组中的key-value的存储位置,代码如下

import hashlib


class MyDict:
    """Insertion-ordered variant of the naive hash table (still no collision handling).

    ``values`` is a fixed-size list of ``[hash_value, key, value]`` slots;
    ``[None, None, None]`` marks an empty slot.  ``add_order`` stores the slot
    index of every stored key in insertion order, and ``repr`` walks that list
    instead of the slot array.  A key whose hash maps to an occupied slot
    still overwrites the entry stored there.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes of the stored keys, in insertion order.
        self.add_order = []

    def add(self, k, v):
        """Store ``v`` under ``k``, replacing any entry already in that slot.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = hash_value % len(self.values)
        previous_hash, previous_key, _ = self.values[location]
        self.values[location] = [hash_value, k, v]
        if previous_hash is not None:
            if previous_key == k:
                # Plain update: the key keeps its place in the order table.
                return
            # A different key was evicted; drop its entry so the slot is not
            # listed twice by __repr__.
            self.add_order.remove(location)
        self.add_order.append(location)

    def get(self, k):
        """Return the value stored under ``k``, or ``None`` when ``k`` is absent.

        The stored key is compared with ``k`` so that the value of a different
        key occupying the same slot is never returned.
        """
        slot = self.values[self._gen_unchanged_hash_value(k) % len(self.values)]
        if slot[0] is None or slot[1] != k:
            return None
        return slot[2]

    # The earlier slot-order __repr__ (iterating self.values directly) is
    # replaced by this one, which follows the insertion-order table.
    def __repr__(self):
        """Render the items dict-style, in insertion order."""
        items = (
            f"'{self.values[index][1]}': {self.values[index][2]}"
            for index in self.add_order
        )
        return '{' + ', '.join(items) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    # Demo of the insertion-ordered table.
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' lands in slot 2, ahead of 'c' and 'd'. The earlier slot-order __repr__ printed
    # {'e': 6, 'c': 3, 'd': 4} here; this version follows add_order and prints insertion order.
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.add_order)  # [3, 5, 2]
    # Output of the three prints above:
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]
    # [3, 5, 2]

    # Hash-collision demo, left disabled: this version still has no collision handling, so 'y'
    # would overwrite 'd' in slot 5.
    # my_dict.add('y', 9)
    # print(my_dict)
    # print(my_dict.values)
    # print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True

解决hash冲突

最简单的解决hash冲突的方法为开放定址法中的线性探查法,从发生冲突的单元起,依次判断下一个单元是否为空,当达到最后一个单元时,再从表首依次判断。直到碰到空闲的单元或者探查完全部单元为止

import hashlib


class MyDict:
    """Toy insertion-ordered dictionary using open addressing (linear probing).

    ``values`` is a fixed-size list of ``[hash_value, key, value]`` slots;
    ``[None, None, None]`` marks an empty slot.  On a collision the following
    slots are tried one by one, wrapping from the last slot to the first,
    until a free one is found.  ``add_order`` stores the slot index of every
    key in the order the keys were first added.  The table never grows: once
    every slot is taken, adding a new key raises.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes of the stored keys, in insertion order.
        self.add_order = []

    def _find_slot(self, k, hash_value):
        """Probe linearly from the key's home slot.

        Returns the index of the slot that already holds ``k``, or of the
        first empty slot met on the way, or ``None`` when every slot was
        visited without finding either.
        """
        size = len(self.values)
        location = hash_value % size
        # Visit each slot at most once so a full table cannot loop forever.
        for _ in range(size):
            slot = self.values[location]
            if slot[0] is None or slot[1] == k:
                return location
            location = (location + 1) % size  # wrap around to the table head
        return None

    def add(self, k, v):
        """Store ``v`` under ``k``; an existing key is updated in place.

        ``k`` must be a ``str`` because it is UTF-8 encoded before hashing.

        Raises:
            Exception: the table is full and ``k`` is not already stored.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = self._find_slot(k, hash_value)
        if location is None:
            raise Exception('该对底层数组扩容了!')
        is_new_key = self.values[location][0] is None
        self.values[location] = [hash_value, k, v]
        # An update keeps the key's original position in the insertion order.
        if is_new_key:
            self.add_order.append(location)

    def get(self, k):
        """Return the value stored under ``k``.

        Raises:
            KeyError: ``k`` is not in the table.
        """
        location = self._find_slot(k, self._gen_unchanged_hash_value(k))
        # Nothing is ever deleted, so reaching an empty slot means k is absent.
        if location is None or self.values[location][0] is None:
            raise KeyError(k)
        return self.values[location][2]

    # The earlier slot-order __repr__ (iterating self.values directly) is
    # replaced by this one, which follows the insertion-order table.
    def __repr__(self):
        """Render the items dict-style, in insertion order."""
        items = (
            f"'{self.values[index][1]}': {self.values[index][2]}"
            for index in self.add_order
        )
        return '{' + ', '.join(items) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash`` for strings, the result is the same in
        every interpreter run.
        """
        digest = hashlib.md5(k.encode(encoding='utf8')).hexdigest()
        return int(digest, base=16)


if __name__ == '__main__':
    # Demo of the linear-probing table.
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    #  [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' lands in slot 2, ahead of 'c' and 'd', but repr follows add_order and stays in insertion order.
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.add_order)  # [3, 5, 2]
    # Output of the three prints above:
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]
    # [3, 5, 2]

    # Hash collision: 'y' has the same home slot as 'd' (slot 5); probing moves it to the next
    # free slot (6), so 'd' is no longer overwritten.
    my_dict.add('y', 9)
    print(my_dict)
    print(my_dict.values)
    print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    #  [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    #  [173422077530204247440288476180261147053, 'd', 4], [86828518130618008439946455853590066269, 'y', 9],
    #  [None, None, None]]
    print(my_dict.add_order)  # [3, 5, 2, 6]
    print(my_dict.get('y'))  # 9

    # Fill the remaining four slots; further collisions are resolved by probing.
    my_dict.add('f', 10)
    my_dict.add('g', 11)
    my_dict.add('h', 12)
    my_dict.add('i', 13)
    # A ninth key, e.g. my_dict.add('j', 14), would find no free slot and make add() raise.
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('g') == 11)  # True
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值