HashTable
哈希表原理
765 % 13 = 11
431 % 13 = 2
…
实例代码块
# -*- coding:utf-8 -*-
class Array(object):
    """Fixed-size array backed by a Python list.

    ``size`` is the number of cells and ``init`` is the value every cell
    starts with.
    """

    def __init__(self, size=32, init=None):
        self._items = [init] * size
        self._size = size

    def __len__(self):
        """Return the size the array was created with."""
        return self._size

    def __getitem__(self, index):
        """Return the value stored at ``index``."""
        return self._items[index]

    def __setitem__(self, index, value):
        """Store ``value`` at ``index``."""
        self._items[index] = value

    def __iter__(self):
        """Iterate over the cells in index order."""
        return iter(self._items)

    def clear(self, value=None):
        """Overwrite every cell with ``value``."""
        # Slice assignment refills the existing list in place.
        self._items[:] = [value] * len(self._items)
"""
定义 hash 表数组中的槽(slot).
注意,一个槽有三种状态,看你能否想明白.相比用链接法解决冲突,开放寻址法(如二次探查)删除一个 key 的操作稍微复杂.
1.从未使用: HashTable.UNUSED.此槽没有被使用和冲突过,查找时只要遇到 UNUSED 就不用再继续探查了.
2.使用过但是已被 remove: 此时是 HashTable.EMPTY,该探查点后边的槽仍可能存有 key,查找时必须继续探查.
3.槽正在使用: 存放的是一个 Slot 节点.
"""
class Slot(object):
    """One occupied hash-table cell: a key together with its value."""

    def __init__(self, key, value):
        self.key = key
        self.value = value
#构造一个哈希表
class HashTable(object):
    """Open-addressing hash table that maps keys to values.

    Entries live in a fixed-size ``Array`` of slots.  Every slot is in one
    of three states:

    * ``UNUSED`` -- never occupied; a lookup may stop probing here.
    * ``EMPTY``  -- a tombstone left by ``remove``; slots further along the
      probe path may still hold keys, so a lookup must keep going.
    * a live ``Slot`` holding a key and its value.

    Collisions are resolved by probing ``index = (index * 5 + 1) % size``.
    The table starts with 8 slots and only ever doubles, so its size stays
    a power of two and the probe sequence visits every slot once before it
    repeats.
    """

    UNUSED = None             # slot has never been used
    EMPTY = Slot(None, None)  # slot was used, then its entry was removed

    def __init__(self):
        """Create an empty table with 8 slots."""
        self._table = Array(8, init=HashTable.UNUSED)
        self.length = 0  # number of live entries; tombstones are not counted

    @property
    def _load_factor(self):
        """Return live entries divided by table size.

        For example, 8 entries in a 13-slot table give 8 / 13, about 0.62.
        ``add`` rehashes into a table twice as large once this reaches 0.8.
        """
        return self.length / float(len(self._table))

    def __len__(self):
        """Return the number of live entries."""
        return self.length

    def _hash(self, key):
        """Return the home slot index of ``key`` in the current table."""
        # abs() keeps the hash non-negative before taking the modulus.
        return abs(hash(key)) % len(self._table)

    # ---------------> common hash-table operations

    def _find_key(self, key):
        """Return the index of the slot holding ``key``, or None if absent."""
        _len = len(self._table)
        index = self._hash(key)
        # Probe at most once per slot.  Tombstones are not counted by the
        # load factor, so after enough add/remove cycles the table may hold
        # no UNUSED slot at all; an unbounded loop would then never finish
        # when the key is missing.
        for _ in range(_len):
            slot = self._table[index]
            if slot is HashTable.UNUSED:
                return None
            if slot is not HashTable.EMPTY and slot.key == key:
                return index
            # Probe step borrowed from CPython's collision resolution.
            index = (index * 5 + 1) % _len
        return None

    def _slot_can_insert(self, index):
        """Return True if the slot at ``index`` holds no live entry."""
        slot = self._table[index]
        return slot is HashTable.EMPTY or slot is HashTable.UNUSED

    def _find_slot_for_insert(self, key):
        """Return the first slot on ``key``'s probe path that is free."""
        index = self._hash(key)
        _len = len(self._table)
        while not self._slot_can_insert(index):
            index = (index * 5 + 1) % _len
        return index

    def __contains__(self, key):
        """Support ``key in table``."""
        return self._find_key(key) is not None

    def add(self, key, value):
        """Insert ``key`` with ``value``, or update the value if present.

        Returns True when a new key was inserted and False when an existing
        key was updated.
        """
        index = self._find_key(key)
        if index is not None:
            self._table[index].value = value
            return False
        index = self._find_slot_for_insert(key)
        self._table[index] = Slot(key, value)
        self.length += 1
        if self._load_factor >= 0.8:
            self._rehash()
        return True

    def _rehash(self):
        """Double the table and re-insert every live entry."""
        old_table = self._table
        # The new array must replace ``self._table``; otherwise the entries
        # would be re-inserted into the old, nearly full table.
        self._table = Array(len(old_table) * 2, HashTable.UNUSED)
        self.length = 0
        for slot in old_table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                index = self._find_slot_for_insert(slot.key)
                self._table[index] = slot
                self.length += 1

    def get(self, key, default=None):
        """Return the value stored for ``key``, or ``default`` if absent."""
        index = self._find_key(key)
        if index is None:
            return default
        return self._table[index].value

    def remove(self, key):
        """Delete ``key`` and return its value.

        Raises KeyError if ``key`` is not in the table.
        """
        index = self._find_key(key)
        if index is None:
            raise KeyError(key)
        value = self._table[index].value
        self.length -= 1
        # Leave a tombstone so probe paths running through this slot
        # still reach the entries stored after it.
        self._table[index] = HashTable.EMPTY
        return value

    def __iter__(self):
        """Iterate over the keys of all live entries."""
        for slot in self._table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                yield slot.key
#单测
def test_hash_table():
    """Exercise add, get, remove, len and iteration on a HashTable."""
    table = HashTable()
    for key, value in (('a', 0), ('b', 1), ('c', 2)):
        table.add(key, value)

    assert len(table) == 3
    assert table.get('a') == 0
    assert table.get('b') == 1
    assert table.get('dsad') is None

    table.remove('a')
    assert table.get('a') is None
    remaining_keys = list(table)
    remaining_keys.sort()
    assert remaining_keys == ['b', 'c']
哈希表实现dict字典
# -*- coding:utf-8 -*-
class Array(object):
    """Fixed-size array backed by a Python list.

    ``size`` is the number of cells and ``init`` is the value every cell
    starts with.
    """

    def __init__(self, size=32, init=None):
        self._items = [init] * size
        self._size = size

    def __len__(self):
        """Return the size the array was created with."""
        return self._size

    def __getitem__(self, index):
        """Return the value stored at ``index``."""
        return self._items[index]

    def __setitem__(self, index, value):
        """Store ``value`` at ``index``."""
        self._items[index] = value

    def __iter__(self):
        """Iterate over the cells in index order."""
        return iter(self._items)

    def clear(self, value=None):
        """Overwrite every cell with ``value``."""
        # Slice assignment refills the existing list in place.
        self._items[:] = [value] * len(self._items)
"""
定义 hash 表数组中的槽(slot).
注意,一个槽有三种状态,看你能否想明白.相比用链接法解决冲突,开放寻址法(如二次探查)删除一个 key 的操作稍微复杂.
1.从未使用: HashTable.UNUSED.此槽没有被使用和冲突过,查找时只要遇到 UNUSED 就不用再继续探查了.
2.使用过但是已被 remove: 此时是 HashTable.EMPTY,该探查点后边的槽仍可能存有 key,查找时必须继续探查.
3.槽正在使用: 存放的是一个 Slot 节点.
"""
class Slot(object):
    """One occupied hash-table cell: a key together with its value."""

    def __init__(self, key, value):
        self.key = key
        self.value = value
#构造一个哈希表
class HashTable(object):
    """Open-addressing hash table that maps keys to values.

    Entries live in a fixed-size ``Array`` of slots.  Every slot is in one
    of three states:

    * ``UNUSED`` -- never occupied; a lookup may stop probing here.
    * ``EMPTY``  -- a tombstone left by ``remove``; slots further along the
      probe path may still hold keys, so a lookup must keep going.
    * a live ``Slot`` holding a key and its value.

    Collisions are resolved by probing ``index = (index * 5 + 1) % size``.
    The table starts with 8 slots and only ever doubles, so its size stays
    a power of two and the probe sequence visits every slot once before it
    repeats.
    """

    UNUSED = None             # slot has never been used
    EMPTY = Slot(None, None)  # slot was used, then its entry was removed

    def __init__(self):
        """Create an empty table with 8 slots."""
        self._table = Array(8, init=HashTable.UNUSED)
        self.length = 0  # number of live entries; tombstones are not counted

    @property
    def _load_factor(self):
        """Return live entries divided by table size.

        For example, 8 entries in a 13-slot table give 8 / 13, about 0.62.
        ``add`` rehashes into a table twice as large once this reaches 0.8.
        """
        return self.length / float(len(self._table))

    def __len__(self):
        """Return the number of live entries."""
        return self.length

    def _hash(self, key):
        """Return the home slot index of ``key`` in the current table."""
        # abs() keeps the hash non-negative before taking the modulus.
        return abs(hash(key)) % len(self._table)

    # ---------------> common hash-table operations

    def _find_key(self, key):
        """Return the index of the slot holding ``key``, or None if absent."""
        _len = len(self._table)
        index = self._hash(key)
        # Probe at most once per slot.  Tombstones are not counted by the
        # load factor, so after enough add/remove cycles the table may hold
        # no UNUSED slot at all; an unbounded loop would then never finish
        # when the key is missing.
        for _ in range(_len):
            slot = self._table[index]
            if slot is HashTable.UNUSED:
                return None
            if slot is not HashTable.EMPTY and slot.key == key:
                return index
            # Probe step borrowed from CPython's collision resolution.
            index = (index * 5 + 1) % _len
        return None

    def _slot_can_insert(self, index):
        """Return True if the slot at ``index`` holds no live entry."""
        slot = self._table[index]
        return slot is HashTable.EMPTY or slot is HashTable.UNUSED

    def _find_slot_for_insert(self, key):
        """Return the first slot on ``key``'s probe path that is free."""
        index = self._hash(key)
        _len = len(self._table)
        while not self._slot_can_insert(index):
            index = (index * 5 + 1) % _len
        return index

    def __contains__(self, key):
        """Support ``key in table``."""
        return self._find_key(key) is not None

    def add(self, key, value):
        """Insert ``key`` with ``value``, or update the value if present.

        Returns True when a new key was inserted and False when an existing
        key was updated.
        """
        index = self._find_key(key)
        if index is not None:
            self._table[index].value = value
            return False
        index = self._find_slot_for_insert(key)
        self._table[index] = Slot(key, value)
        self.length += 1
        if self._load_factor >= 0.8:
            self._rehash()
        return True

    def _rehash(self):
        """Double the table and re-insert every live entry."""
        old_table = self._table
        # The new array must replace ``self._table``; otherwise the entries
        # would be re-inserted into the old, nearly full table.
        self._table = Array(len(old_table) * 2, HashTable.UNUSED)
        self.length = 0
        for slot in old_table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                index = self._find_slot_for_insert(slot.key)
                self._table[index] = slot
                self.length += 1

    def get(self, key, default=None):
        """Return the value stored for ``key``, or ``default`` if absent."""
        index = self._find_key(key)
        if index is None:
            return default
        return self._table[index].value

    def remove(self, key):
        """Delete ``key`` and return its value.

        Raises KeyError if ``key`` is not in the table.
        """
        index = self._find_key(key)
        if index is None:
            raise KeyError(key)
        value = self._table[index].value
        self.length -= 1
        # Leave a tombstone so probe paths running through this slot
        # still reach the entries stored after it.
        self._table[index] = HashTable.EMPTY
        return value

    def __iter__(self):
        """Iterate over the keys of all live entries."""
        for slot in self._table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                yield slot.key
##########################################################
# 通过继承 HashTable 来实现字典dict
##########################################################
class DictADT(HashTable):
    """Dictionary ADT implemented by inheriting from HashTable.

    Adds ``d[key]`` / ``d[key] = value`` syntax and the ``items``, ``keys``
    and ``values`` iterators on top of the inherited table operations.
    """

    def __setitem__(self, key, value):
        """Support ``d[key] = value``."""
        self.add(key, value)

    def __getitem__(self, key):
        """Support ``d[key]``; raises KeyError if ``key`` is absent."""
        # One probe sequence instead of a membership test followed by get().
        index = self._find_key(key)
        if index is None:
            raise KeyError(key)
        return self._table[index].value

    def _iter_slot(self):
        """Yield every live Slot, skipping unused slots and tombstones."""
        for slot in self._table:
            if slot is not HashTable.EMPTY and slot is not HashTable.UNUSED:
                yield slot

    def items(self):
        """Yield ``(key, value)`` pairs."""
        for slot in self._iter_slot():
            yield (slot.key, slot.value)

    def keys(self):
        """Yield the keys."""
        for slot in self._iter_slot():
            yield slot.key

    def values(self):
        """Yield the values."""
        for slot in self._iter_slot():
            yield slot.value

    def value(self):
        """Yield the values; kept as the original name of ``values``."""
        return self.values()
def test_dict_odb():
    """Exercise item access, removal and bulk insertion on a DictADT."""
    import random

    mapping = DictADT()
    mapping['a'] = 1
    assert mapping['a'] == 1
    mapping.remove('a')

    # Insert 0..29 in random order so the table has to grow along the way.
    numbers = list(range(30))
    random.shuffle(numbers)
    for number in numbers:
        mapping.add(number, number)

    for expected in range(30):
        assert mapping.get(expected) == expected

    stored_keys = list(mapping.keys())
    stored_keys.sort()
    assert stored_keys == sorted(numbers)
哈希表实现set集合
# -*- coding:utf-8 -*-
class Array(object):
    """Fixed-size array backed by a Python list.

    ``size`` is the number of cells and ``init`` is the value every cell
    starts with.
    """

    def __init__(self, size=32, init=None):
        self._items = [init] * size
        self._size = size

    def __len__(self):
        """Return the size the array was created with."""
        return self._size

    def __getitem__(self, index):
        """Return the value stored at ``index``."""
        return self._items[index]

    def __setitem__(self, index, value):
        """Store ``value`` at ``index``."""
        self._items[index] = value

    def __iter__(self):
        """Iterate over the cells in index order."""
        return iter(self._items)

    def clear(self, value=None):
        """Overwrite every cell with ``value``."""
        # Slice assignment refills the existing list in place.
        self._items[:] = [value] * len(self._items)
"""
定义 hash 表数组中的槽(slot).
注意,一个槽有三种状态,看你能否想明白.相比用链接法解决冲突,开放寻址法(如二次探查)删除一个 key 的操作稍微复杂.
1.从未使用: HashTable.UNUSED.此槽没有被使用和冲突过,查找时只要遇到 UNUSED 就不用再继续探查了.
2.使用过但是已被 remove: 此时是 HashTable.EMPTY,该探查点后边的槽仍可能存有 key,查找时必须继续探查.
3.槽正在使用: 存放的是一个 Slot 节点.
"""
class Slot(object):
    """One occupied hash-table cell: a key together with its value."""

    def __init__(self, key, value):
        self.key = key
        self.value = value
#构造一个哈希表
class HashTable(object):
    """Open-addressing hash table that maps keys to values.

    Entries live in a fixed-size ``Array`` of slots.  Every slot is in one
    of three states:

    * ``UNUSED`` -- never occupied; a lookup may stop probing here.
    * ``EMPTY``  -- a tombstone left by ``remove``; slots further along the
      probe path may still hold keys, so a lookup must keep going.
    * a live ``Slot`` holding a key and its value.

    Collisions are resolved by probing ``index = (index * 5 + 1) % size``.
    The table starts with 8 slots and only ever doubles, so its size stays
    a power of two and the probe sequence visits every slot once before it
    repeats.
    """

    UNUSED = None             # slot has never been used
    EMPTY = Slot(None, None)  # slot was used, then its entry was removed

    def __init__(self):
        """Create an empty table with 8 slots."""
        self._table = Array(8, init=HashTable.UNUSED)
        self.length = 0  # number of live entries; tombstones are not counted

    @property
    def _load_factor(self):
        """Return live entries divided by table size.

        For example, 8 entries in a 13-slot table give 8 / 13, about 0.62.
        ``add`` rehashes into a table twice as large once this reaches 0.8.
        """
        return self.length / float(len(self._table))

    def __len__(self):
        """Return the number of live entries."""
        return self.length

    def _hash(self, key):
        """Return the home slot index of ``key`` in the current table."""
        # abs() keeps the hash non-negative before taking the modulus.
        return abs(hash(key)) % len(self._table)

    # ---------------> common hash-table operations

    def _find_key(self, key):
        """Return the index of the slot holding ``key``, or None if absent."""
        _len = len(self._table)
        index = self._hash(key)
        # Probe at most once per slot.  Tombstones are not counted by the
        # load factor, so after enough add/remove cycles the table may hold
        # no UNUSED slot at all; an unbounded loop would then never finish
        # when the key is missing.
        for _ in range(_len):
            slot = self._table[index]
            if slot is HashTable.UNUSED:
                return None
            if slot is not HashTable.EMPTY and slot.key == key:
                return index
            # Probe step borrowed from CPython's collision resolution.
            index = (index * 5 + 1) % _len
        return None

    def _slot_can_insert(self, index):
        """Return True if the slot at ``index`` holds no live entry."""
        slot = self._table[index]
        return slot is HashTable.EMPTY or slot is HashTable.UNUSED

    def _find_slot_for_insert(self, key):
        """Return the first slot on ``key``'s probe path that is free."""
        index = self._hash(key)
        _len = len(self._table)
        while not self._slot_can_insert(index):
            index = (index * 5 + 1) % _len
        return index

    def __contains__(self, key):
        """Support ``key in table``."""
        return self._find_key(key) is not None

    def add(self, key, value):
        """Insert ``key`` with ``value``, or update the value if present.

        Returns True when a new key was inserted and False when an existing
        key was updated.
        """
        index = self._find_key(key)
        if index is not None:
            self._table[index].value = value
            return False
        index = self._find_slot_for_insert(key)
        self._table[index] = Slot(key, value)
        self.length += 1
        if self._load_factor >= 0.8:
            self._rehash()
        return True

    def _rehash(self):
        """Double the table and re-insert every live entry."""
        old_table = self._table
        # The new array must replace ``self._table``; otherwise the entries
        # would be re-inserted into the old, nearly full table.
        self._table = Array(len(old_table) * 2, HashTable.UNUSED)
        self.length = 0
        for slot in old_table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                index = self._find_slot_for_insert(slot.key)
                self._table[index] = slot
                self.length += 1

    def get(self, key, default=None):
        """Return the value stored for ``key``, or ``default`` if absent."""
        index = self._find_key(key)
        if index is None:
            return default
        return self._table[index].value

    def remove(self, key):
        """Delete ``key`` and return its value.

        Raises KeyError if ``key`` is not in the table.
        """
        index = self._find_key(key)
        if index is None:
            raise KeyError(key)
        value = self._table[index].value
        self.length -= 1
        # Leave a tombstone so probe paths running through this slot
        # still reach the entries stored after it.
        self._table[index] = HashTable.EMPTY
        return value

    def __iter__(self):
        """Iterate over the keys of all live entries."""
        for slot in self._table:
            if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
                yield slot.key
#######################################################
# 通过继承哈希表实现 集合set
#######################################################
class SetADT(HashTable):
    """Set ADT implemented by inheriting from HashTable.

    Elements are stored as keys; every key maps to the value True.
    """

    def add(self, key):
        """Add ``key`` to the set.

        Returns True if the key was newly inserted, False if it was
        already present.
        """
        return super(SetADT, self).add(key, True)

    def __and__(self, other_set):
        """Return a new set with the elements found in both sets."""
        new_set = SetADT()
        # A single pass over self is enough: any element common to both
        # sets is reached here, so a second pass over other_set would only
        # re-add elements that are already in new_set.
        for element_a in self:
            if element_a in other_set:
                new_set.add(element_a)
        return new_set

    def __sub__(self, other_set):
        """Return a new set with the elements of self not in other_set."""
        new_set = SetADT()
        for element_a in self:
            if element_a not in other_set:
                new_set.add(element_a)
        return new_set

    def __or__(self, other_set):
        """Return a new set with the elements of either set."""
        new_set = SetADT()
        for element_a in self:
            new_set.add(element_a)
        for element_b in other_set:
            new_set.add(element_b)
        return new_set
#单测
def test_set_odb():
    """Exercise membership, intersection, difference and union on SetADT."""
    sa = SetADT()
    for member in (1, 2, 3):
        sa.add(member)
    assert 1 in sa

    sb = SetADT()
    for member in (3, 4, 5):
        sb.add(member)
    assert 5 in sb

    common = list(sa & sb)
    common.sort()
    assert common == [3]

    only_in_sa = list(sa - sb)
    only_in_sa.sort()
    assert only_in_sa == [1, 2]

    combined = list(sa | sb)
    combined.sort()
    assert combined == [1, 2, 3, 4, 5]