什么是哈希结构?
哈希结构是一种通过关键字(key)直接定位存储位置的数据结构。简单来说,它由一个散列函数和一个数组组成:以key-value为例,先用散列函数对key进行计算,再将得到的哈希值对数组长度取余,余数即为数组中的下标,value就存放在该下标对应的位置上。
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
哈希值 | key | value |
解决哈希冲突
开放定址法(线性探查)解决哈希冲突
import hashlib
class MyDict:
    """Insertion-ordered hash table that resolves collisions by linear probing.

    Every slot of ``values`` is a ``[hash_value, key, value]`` triple; an
    unused slot is ``[None, None, None]``.  ``add_order`` records the slot
    index of each key in the order the keys were first added and drives
    ``__repr__``.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes in the order their keys were first added.
        self.add_order = []

    def _probe(self, hash_value):
        """Yield every slot index exactly once, starting at the home slot."""
        size = len(self.values)
        home = hash_value % size
        for offset in range(size):
            yield (home + offset) % size

    def add(self, k, v):
        """Store ``v`` under the string key ``k``.

        A key that is already present is updated in place and keeps its
        position in the insertion order.

        Raises:
            Exception: if every slot is already taken by a different key.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        for location in self._probe(hash_value):
            slot = self.values[location]
            if slot[0] is None:
                self.values[location] = [hash_value, k, v]
                # Remember which slot the new entry went to.
                self.add_order.append(location)
                return
            if slot[1] == k:
                slot[2] = v
                return
        raise Exception('该对底层数组扩容了!')

    def get(self, k):
        """Return the value stored under ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        for location in self._probe(hash_value):
            slot = self.values[location]
            if slot[0] is None:
                # Entries are never deleted, so an empty slot ends the chain.
                break
            if slot[1] == k:
                return slot[2]
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries dict-style, in insertion order."""
        entries = (self.values[index] for index in self.add_order)
        return '{' + ', '.join(
            f"'{key!s}': {value!s}" for _, key, value in entries
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)
if __name__ == '__main__':
    # Fill all eight slots; 'y', 'g' and 'i' collide and are placed by probing.
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    my_dict.add('e', 6)
    my_dict.add('y', 9)
    my_dict.add('f', 10)
    my_dict.add('g', 11)
    my_dict.add('h', 12)
    my_dict.add('i', 13)
    # A ninth key, e.g. my_dict.add('j', 14), raises: the table is full.
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('h') == 12)  # True
    # Expected output:
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13}
    # [[237879873640563870927460284185496028253, 'g', 11], [49268479078006861543109070154241760913, 'h', 12],
    # [299611584147932843547128611849858313266, 'e', 6], [99079589977253916124855502156832923443, 'c', 3],
    # [178594487029704683911797514996985530177, 'i', 13], [173422077530204247440288476180261147053, 'd', 4],
    # [86828518130618008439946455853590066269, 'y', 9], [190917122200326810055233066464581373159, 'f', 10]]
    # True
使用开放定址法的线性探查来解决哈希冲突时,随着添加的key-value越来越多,存放key-value的数组最终会被填满,此时就需要对哈希结构扩容。python的dict扩容时,会将已经存在的键值对按新的数组长度重新取余,存放到新的位置。
使用链表法解决哈希冲突时,每个桶可以存放多个元素,因此哈希结构的装填因子可以大于1。若要保留字典的有序输出特性,维护的元素添加顺序表也要做相应改变:每一项由原来的整型(数组下标)变为元组(桶的位置, 桶中元素的位置),代码如下:
import hashlib
class MyDict:
    """Insertion-ordered hash table that resolves collisions by chaining.

    ``values`` is a list of buckets; each bucket is a list of ``(key, value)``
    tuples.  ``add_order`` stores ``(bucket_index, position_in_bucket)`` pairs
    in the order the keys were first added and drives ``__repr__``.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` buckets."""
        self.values = [[] for _ in range(arr_length)]
        # (bucket index, position inside the bucket), in insertion order.
        self.add_order = []

    def add(self, k, v):
        """Store ``v`` under the string key ``k``.

        A key that is already present is replaced in place, so it keeps its
        position in the insertion order and never shadows a stale entry.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        barrel_location = hash_value % len(self.values)
        bucket = self.values[barrel_location]
        for position, (key, _) in enumerate(bucket):
            if key == k:
                bucket[position] = (k, v)
                return
        # New key: it goes to the end of the bucket (chaining, list-backed).
        self.add_order.append((barrel_location, len(bucket)))
        bucket.append((k, v))

    def get(self, k):
        """Return the value stored under ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        barrel_location = hash_value % len(self.values)
        for key, value in self.values[barrel_location]:
            if key == k:
                return value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries dict-style, in insertion order."""
        entries = (self.values[bucket][position]
                   for bucket, position in self.add_order)
        return '{' + ', '.join(
            f"'{key!s}': {value!s}" for key, value in entries
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)
if __name__ == '__main__':
    # Nine keys in eight buckets: chaining lets the load factor exceed 1.
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    my_dict.add('e', 6)
    my_dict.add('y', 9)
    my_dict.add('f', 10)
    my_dict.add('g', 11)
    my_dict.add('h', 12)
    my_dict.add('i', 13)
    my_dict.add('j', 14)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('h') == 12)  # True
    print(my_dict.get('l'))  # key l not exist
    # Expected output:
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
    # [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [],
    # [('f', 10)]]
    # True
    # key l not exist
扩缩容(改变字典底层存储数据的数组长度)
问题引入:
my_dict = MyDict(arr_length=8)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('h', 12), ('i', 13)], [('e', 6)], [('c', 3)], [], [('d', 4), ('y', 9), ('g', 11), ('j', 14)], [], [('f', 10)]]
my_dict = MyDict(arr_length=11)
# {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14}
# [[], [('y', 9)], [('d', 4)], [('c', 3)], [], [('h', 12)], [], [('e', 6), ('j', 14)], [('i', 13)], [('f', 10)], [('g', 11)]]
上面两段代码可以看出,当改变字典底层存储数据的数组长度时,会打乱key-value在数组中出现的位置,主要是因为hash(key)对数组长度的余数变了。
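redis集群部署中使用了哈希槽分区的概念:redis cluster固定划分16384(2^14)个哈希槽,通过 CRC16(key) % 16384 决定key所属的槽,每个节点负责其中一部分槽。可以把全部槽位想象为一个大圆形,将其N等分,hash(k)对大圆形的长度取余,按顺时针方向看余数落在哪个区间,就将该key-value放置于对应的hash槽区中。下面的示例为了便于观察,把大圆形的长度缩小为2^5 - 1,代码实现如下: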
import math
import hashlib
class MyDict:
    """Insertion-ordered hash table that partitions a hash ring into slots.

    Hash values are reduced modulo ``circular_max`` onto a ring that is cut
    into ``arr_length`` slots.  ``values`` holds one ``[slot_start, bucket]``
    pair per slot in ascending ``slot_start`` order; a slot owns the
    remainders from its own start up to (but excluding) the next slot's
    start, and the last slot owns everything up to ``circular_max - 1``.
    ``locations`` lists the slot starts in descending order.  ``add_order``
    stores ``(slot_index, position_in_bucket)`` pairs in insertion order.
    """

    def __init__(self, arr_length=8):
        """Create an empty table whose ring is cut into ``arr_length`` slots."""
        self.circular_max = int(math.pow(2, 5) - 1)
        self.arr_length = arr_length
        self.values = []
        self.locations = []
        self.add_order = []
        self._gen_bottom_stored_data_array()

    def _gen_bottom_stored_data_array(self):
        """Build the slot buckets and the descending list of slot starts."""
        # Use the ring size chosen in __init__ rather than repeating the literal.
        step = self.circular_max // self.arr_length
        for index in range(self.arr_length):
            start = step * index
            self.values.append([start, []])
            self.locations.append(start)
        self.locations.sort(reverse=True)

    def add(self, k, v):
        """Store ``v`` under the string key ``k`` in the slot owning its hash."""
        hash_value = self._gen_unchanged_hash_value(k)
        remainder = hash_value % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        self.add_item(k, v, location_index, self.values[location_index][1])

    def add_item(self, k, v, barrel_location, values):
        """Put ``(k, v)`` into bucket ``values`` of slot ``barrel_location``.

        A key already in the bucket is replaced in place and keeps its
        position in the insertion order.
        """
        for position, (key, _) in enumerate(values):
            if key == k:
                values[position] = (k, v)
                return
        self.add_order.append((barrel_location, len(values)))
        values.append((k, v))

    def _get_clockwise_latest_location(self, remainder):
        """Return the index in ``values`` of the slot that owns ``remainder``.

        ``locations`` is descending while ``values`` is ascending, so the
        first start that does not exceed ``remainder`` is the owning slot and
        its position has to be mirrored to index ``values``.
        """
        last = len(self.locations) - 1
        for position, start in enumerate(self.locations):
            if start <= remainder:
                return last - position
        # Not reached for remainder >= 0, because the first slot starts at 0.
        return 0

    def get(self, k):
        """Return the value stored under ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        remainder = hash_value % self.circular_max
        location_index = self._get_clockwise_latest_location(remainder)
        for key, value in self.values[location_index][1]:
            if key == k:
                return value
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries dict-style, in insertion order."""
        entries = (self.values[slot][1][position]
                   for slot, position in self.add_order)
        return '{' + ', '.join(
            f"'{key!s}': {value!s}" for key, value in entries
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)


if __name__ == '__main__':
    my_dict = MyDict(arr_length=8)
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    my_dict.add('e', 6)
    my_dict.add('f', 9)
    my_dict.add('g', 10)
    my_dict.add('h', 11)
    my_dict.add('i', 12)
    my_dict.add('j', 13)
    my_dict.add('k', 14)
    print(my_dict)
    # Each key is listed under the slot whose range contains md5(key) % 31;
    # the slots start at 0, 3, 6, 9, 12, 15, 18 and 21.
    print(my_dict.values)
    print(my_dict.get('h') == 11)  # True
    print(my_dict.get('l'))  # key l not exist
    # Expected output (the bucket dump is omitted here):
    # {'c': 3, 'd': 4, 'e': 6, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14}
    # True
    # key l not exist
需要进行扩容时,并不是对大圆形重新进行N+M等分,而是从原本的N个槽区中各切割出一部分,划给新增的M个槽区,这样可以保证大部分数据在哈希槽中的位置不变
待实现?
缩容时,将去掉的某个槽区的key-value全部移植到某个指定的槽区即可
待实现?
字典的有序输出
python3.6之前,字典的输出是无序的:key经散列函数求得哈希值,再对数组长度取余,余数决定了key-value在数组中的存放位置,而输出时是按数组下标顺序遍历的,因此输出顺序与赋值时的顺序无关。从python3.6(CPython实现)开始,通过多维护一张表(入字典的次序 -> key-value在数组中的位置),字典的输出变为有序,python3.7起这一特性成为语言规范的一部分
hash(key1) % 8 = 2
hash(key2) % 8 = 7
2 | 7 | 1 | 0 | . | . | . | . |
hash | key1 | value |
在python3.6之前可以简单的认为python中字典的key-value仅仅存在于一个二维数组中,因此hash(key) 对数组长度取余后的位置索引会出现在数组的某个位置上,例如:
{'c': 3, 'd': 4}
[[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3], [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]
获取字典中所有的键值对时,会按下标顺序遍历二维数组。如果后添加的key的hash值对长度取余后落在数组靠前的位置,遍历时它就会先被输出,于是展示顺序就不是字典的添加顺序了,如下所示:
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)
# {'e': 6, 'c': 3, 'd': 4}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
# [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
# [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]
hash冲突是指不同的key,利用散列函数得到的值对数组长度取余后,在数组中的索引位置相同,就会出现后者覆盖前者的情况,如下所示,字符'd'与字符'y'通过计算得到的索引值相同,于是'y'-value就覆盖了'd'-value
my_dict = MyDict()
my_dict.add('c', 3)
my_dict.add('d', 4)
my_dict.add('e', 6)
my_dict.add('y', 9)
print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8) # True
# {'e': 6, 'c': 3, 'y': 9}
# [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
# [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
# [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]
此部分完整代码如下:
import hashlib
class MyDict:
    """Minimal hash table with no collision handling and no ordering.

    Every slot of ``values`` is a ``[hash_value, key, value]`` triple; an
    unused slot is ``[None, None, None]``.  A key whose hash maps to an
    occupied slot overwrites whatever is stored there.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]

    def add(self, k, v):
        """Store ``v`` under the string key ``k``, overwriting the home slot."""
        hash_value = self._gen_unchanged_hash_value(k)
        location = hash_value % len(self.values)
        self.values[location] = [hash_value, k, v]

    def get(self, k):
        """Return the value stored under ``k``, or ``None`` if it is absent.

        The stored key is compared with ``k`` so that a slot taken over by a
        colliding key is not reported as the value of ``k``.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = hash_value % len(self.values)
        _, stored_key, stored_value = self.values[location]
        return stored_value if stored_key == k else None

    def __repr__(self):
        """Render the occupied slots dict-style, in slot order."""
        return '{' + ', '.join(
            f"'{key!s}': {value!s}"
            for hash_value, key, value in self.values
            if hash_value is not None
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)
if __name__ == '__main__':
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    # [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Unordered output: 'e' was added last but lands in slot 2, so it prints first.
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    # {'e': 6, 'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    # [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    # [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Hash collision: 'y' maps to the same slot as 'd' and overwrites it.
    my_dict.add('y', 9)
    print(my_dict)
    print(my_dict.values)
    print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True
    # {'e': 6, 'c': 3, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    # [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    # [86828518130618008439946455853590066269, 'y', 9], [None, None, None], [None, None, None]]
字典的有序输出
python3.6之后字典的输出是有序的,主要是因为底层维护了一个key-value元素的添加顺序表,进行字典输出时,遍历该列表中的值,其对应着二维数组中的key-value的存储位置,代码如下
import hashlib
class MyDict:
    """Insertion-ordered hash table without collision handling.

    Every slot of ``values`` is a ``[hash_value, key, value]`` triple; an
    unused slot is ``[None, None, None]``.  ``add_order`` records the slot
    index of each live entry in insertion order and drives ``__repr__``.
    A key whose hash maps to a slot held by another key replaces that entry.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes in the order their current keys were added.
        self.add_order = []

    def add(self, k, v):
        """Store ``v`` under the string key ``k``.

        Re-adding an existing key updates it in place.  When a different key
        already holds the slot, that entry is replaced and its slot index is
        dropped from ``add_order`` first, so every live entry is listed once.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = hash_value % len(self.values)
        slot = self.values[location]
        occupied = slot[0] is not None
        if occupied and slot[1] == k:
            slot[2] = v
            return
        if occupied:
            self.add_order.remove(location)
        self.values[location] = [hash_value, k, v]
        # Remember which slot the new entry went to.
        self.add_order.append(location)

    def get(self, k):
        """Return the value stored under ``k``, or ``None`` if it is absent.

        The stored key is compared with ``k`` so that a slot taken over by a
        colliding key is not reported as the value of ``k``.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        location = hash_value % len(self.values)
        _, stored_key, stored_value = self.values[location]
        return stored_value if stored_key == k else None

    def __repr__(self):
        """Render the entries dict-style, in insertion order."""
        entries = (self.values[index] for index in self.add_order)
        return '{' + ', '.join(
            f"'{key!s}': {value!s}" for _, key, value in entries
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)
if __name__ == '__main__':
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    # [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' is stored in slot 2, ahead of 'c' and 'd'.  The unordered version
    # printed {'e': 6, 'c': 3, 'd': 4}; add_order restores insertion order.
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.add_order)  # [3, 5, 2]
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    # [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    # [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # The collision demo (adding 'y', which maps to the same slot as 'd') is
    # left out here: this version has no collision handling.
解决hash冲突
最简单的解决hash冲突的方法为开放定址法中的线性探查法,从发生冲突的单元起,依次判断下一个单元是否为空,当达到最后一个单元时,再从表首依次判断。直到碰到空闲的单元或者探查完全部单元为止
import hashlib
class MyDict:
    """Insertion-ordered hash table that resolves collisions by linear probing.

    Every slot of ``values`` is a ``[hash_value, key, value]`` triple; an
    unused slot is ``[None, None, None]``.  ``add_order`` records the slot
    index of each key in the order the keys were first added and drives
    ``__repr__``.
    """

    def __init__(self, arr_length=8):
        """Create an empty table with ``arr_length`` slots."""
        self.values = [[None, None, None] for _ in range(arr_length)]
        # Slot indexes in the order their keys were first added.
        self.add_order = []

    def _probe(self, hash_value):
        """Yield every slot index exactly once, starting at the home slot."""
        size = len(self.values)
        home = hash_value % size
        for offset in range(size):
            yield (home + offset) % size

    def add(self, k, v):
        """Store ``v`` under the string key ``k``.

        Starting from the home slot, the first free slot is used, wrapping
        around at the end of the array.  A key that is already present is
        updated in place and keeps its position in the insertion order.

        Raises:
            Exception: if every slot is already taken by a different key.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        for location in self._probe(hash_value):
            slot = self.values[location]
            if slot[0] is None:
                self.values[location] = [hash_value, k, v]
                # Remember which slot the new entry went to.
                self.add_order.append(location)
                return
            if slot[1] == k:
                slot[2] = v
                return
        raise Exception('该对底层数组扩容了!')

    def get(self, k):
        """Return the value stored under ``k``.

        Returns the string ``'key <k> not exist'`` when the key is absent.
        """
        hash_value = self._gen_unchanged_hash_value(k)
        for location in self._probe(hash_value):
            slot = self.values[location]
            if slot[0] is None:
                # Entries are never deleted, so an empty slot ends the chain.
                break
            if slot[1] == k:
                return slot[2]
        return f'key {k} not exist'

    def __repr__(self):
        """Render the entries dict-style, in insertion order."""
        entries = (self.values[index] for index in self.add_order)
        return '{' + ', '.join(
            f"'{key!s}': {value!s}" for _, key, value in entries
        ) + '}'

    @staticmethod
    def _gen_unchanged_hash_value(k):
        """Return the MD5 digest of the string ``k`` as an integer.

        Unlike the built-in ``hash``, the result is the same on every run.
        """
        return int(hashlib.md5(k.encode(encoding='utf8')).hexdigest(), base=16)
if __name__ == '__main__':
    my_dict = MyDict()
    my_dict.add('c', 3)
    my_dict.add('d', 4)
    print(my_dict)
    print(my_dict.values)
    # {'c': 3, 'd': 4}
    # [[None, None, None], [None, None, None], [None, None, None], [99079589977253916124855502156832923443, 'c', 3],
    # [None, None, None], [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # 'e' is stored in slot 2, yet the output keeps insertion order.
    my_dict.add('e', 6)
    print(my_dict)
    print(my_dict.values)
    print(my_dict.add_order)  # [3, 5, 2]
    # {'c': 3, 'd': 4, 'e': 6}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    # [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    # [173422077530204247440288476180261147053, 'd', 4], [None, None, None], [None, None, None]]

    # Hash collision: 'y' has the same home slot (5) as 'd'.  Without probing
    # it would overwrite 'd'; with linear probing it moves on to slot 6.
    my_dict.add('y', 9)
    print(my_dict)
    print(my_dict.values)
    print(my_dict._gen_unchanged_hash_value('d') % 8 == my_dict._gen_unchanged_hash_value('y') % 8)  # True
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9}
    # [[None, None, None], [None, None, None], [299611584147932843547128611849858313266, 'e', 6],
    # [99079589977253916124855502156832923443, 'c', 3], [None, None, None],
    # [173422077530204247440288476180261147053, 'd', 4], [86828518130618008439946455853590066269, 'y', 9],
    # [None, None, None]]
    print(my_dict.add_order)  # [3, 5, 2, 6]
    print(my_dict.get('y'))  # 9

    # Fill the remaining slots; further collisions are resolved the same way.
    my_dict.add('f', 10)
    my_dict.add('g', 11)
    my_dict.add('h', 12)
    my_dict.add('i', 13)
    # A ninth key, e.g. my_dict.add('j', 14), raises: the table is full.
    print(my_dict)
    print(my_dict.values)
    print(my_dict.get('g') == 11)  # True
    # {'c': 3, 'd': 4, 'e': 6, 'y': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13}
    # [[237879873640563870927460284185496028253, 'g', 11], [49268479078006861543109070154241760913, 'h', 12],
    # [299611584147932843547128611849858313266, 'e', 6], [99079589977253916124855502156832923443, 'c', 3],
    # [178594487029704683911797514996985530177, 'i', 13], [173422077530204247440288476180261147053, 'd', 4],
    # [86828518130618008439946455853590066269, 'y', 9], [190917122200326810055233066464581373159, 'f', 10]]
    # True