第十章
本章讲述哈希表,和跳跃表
有一个难点是证明二次探测定理
The quadratic probing strategy has a clustering problem related to the way it looks for open slots. Namely, when a collision occurs at bucket $h(k)$, it checks buckets $A[(h(k) + i^2) \bmod N]$, for $i = 1, 2, \ldots, N - 1$.
a. Show that $i^2 \bmod N$ will assume at most $(N + 1)/2$ distinct values, for $N$ prime, as $i$ ranges from $1$ to $N - 1$. As a part of this justification, note that $i^2 \bmod N = (N - i)^2 \bmod N$ for all $i$.
b. A better strategy is to choose a prime $N$ such that $N \bmod 4 = 3$ and then to check the buckets $A[(h(k) \pm i^2) \bmod N]$ as $i$ ranges from $1$ to $(N - 1)/2$, alternating between plus and minus. Show that this alternate version is guaranteed to check every bucket in $A$.
这道题的证明用到了费马定理和二次剩余;在10.36中只做了简要说明,如有更好的证明请留言。
本章10.55题,是一个字典树.
习题代码如下(部分代码引用书中源代码,源代码位置目录在第二章答案中介绍)
# 10.1
from collections.abc import MutableMapping as mm
class Mutable(mm):
    """Toy mapping backed by a list; the keys are the list positions.

    A key that is below the current length addresses an existing entry.
    Assigning to any other key appends the value, so entries can only be
    created in order.
    """

    def __init__(self):
        """Create an empty container."""
        self._data = []

    def _find_index(self, k):
        """Return k if it is below the current length, otherwise None."""
        return k if k < len(self) else None

    def __getitem__(self, k):
        """Return the value stored under k.

        Raises KeyError if k is not an existing position. KeyError (rather
        than a bare Exception) is what the inherited ``in``, ``get`` and
        ``setdefault`` rely on to detect a missing key.
        """
        index = self._find_index(k)
        if index is None:
            raise KeyError("Error input!")
        return self._data[index]

    def __setitem__(self, k, v):
        """Overwrite the value at position k, or append v if k is not an existing position."""
        index = self._find_index(k)
        if index is None:
            self._data.append(v)
        else:
            self._data[index] = v

    def __delitem__(self, k):
        """Remove the entry at position k; later entries shift down by one.

        Raises KeyError if k is not an existing position.
        """
        index = self._find_index(k)
        if index is None:
            raise KeyError("Error input!")
        self._data.pop(index)

    def __iter__(self):
        """Iterate over the keys (positions), as the mapping protocol requires."""
        return iter(range(len(self._data)))

    def __len__(self):
        """Return the number of stored entries."""
        return len(self._data)

    # Exercise 10.1
    def pop(self, k=None):
        """Remove the entry at position k and return its value.

        Raises KeyError if k is None (the default) or not an existing
        position.
        """
        # None cannot be compared with an int, so it is screened out first.
        index = None if k is None else self._find_index(k)
        if index is None:
            raise KeyError("Error input!")
        return self._data.pop(index)

    # Exercise 10.2
    def items(self):
        """Generate (position, value) pairs in storage order."""
        for position, value in enumerate(self._data):
            yield position, value
#t=Mutable()
#for i in range(10):
# t[i]=i
# 10.1 在pop方法
# 10.2 在items方法
# test 10.1
#print(t.pop(0))
# test 10.2
#print(list(t.items()))
# 10.3
from TheCode.ch10.unsorted_table_map import UnsortedTableMap as UTM
class UnSorted(UTM):
    """Unsorted table map with its own items() generator (exercise 10.3)."""

    def __init__(self):
        """Delegate construction to the base class unchanged."""
        super().__init__()

    def items(self):
        """Generate a (key, value) tuple for every record in self._table, in storage order."""
        for record in self._table:
            pair = (record._key, record._value)
            yield pair
# 10.3 在items方法中,遍历一遍的时间复杂度为n
#us=UnSorted()
#for i in range(10):
# us[i]=i
#print(list(us.items()))
# 10.4 在这个M中,当每次添加时需要遍历整个_table,时间复杂度为O(1+2+···n) --> O(n^2)
# 10.5
from TheCode.ch10.unsorted_table_map import UnsortedTableMap as UTM
from TheCode.ch07.positional_list import PositionalList
class UnsortedLinkMap(UTM):
    """Unsorted map that keeps its items in a PositionalList.

    Every method that touches storage is overridden here to work on
    self._link; the base-class initializer is not invoked.
    """

    def __init__(self):
        """Create an empty map."""
        self._link = PositionalList()

    def __getitem__(self, k):
        """Return the value associated with key k.

        Raises KeyError if k is not in the map.
        """
        for item in self._link:
            if item._key == k:
                return item._value
        raise KeyError('Key Error: ' + repr(k))

    def __setitem__(self, k, v):
        """Assign value v to key k, overwriting the existing value if present."""
        for item in self._link:
            if item._key == k:
                item._value = v
                return
        # No match: store a new item at the end of the list.
        self._link.add_last(self._Item(k, v))

    def __delitem__(self, k):
        """Remove the item associated with key k.

        Raises KeyError if k is not in the map.
        """
        position = self._link.first()
        # The walk stops when after() hands back None.
        while position is not None:
            if position.element()._key == k:
                self._link.delete(position)
                return
            position = self._link.after(position)
        raise KeyError('Key Error: ' + repr(k))

    def __len__(self):
        """Return the number of items in the map."""
        return len(self._link)

    def __iter__(self):
        """Generate the keys of the map, as the mapping protocol requires.

        Yielding (key, value) tuples here would make the inherited keys(),
        values() and items() views look up those tuples as keys and fail.
        """
        for item in self._link:
            yield item._key
#uslm=UnsortedLinkMap()
#for i in range(10):
# uslm[i]=i
#print(list(uslm))
#print(uslm[2])
#del uslm[3]
#print(list(uslm))
# 10.6
#解决冲突的方法有两类:
#A:分离链表,该方法是在索引对应的空间下建立链表(储存空间),当存在冲突时将键值对添加到储存空间中
#B:开放寻址,该方法是在发生冲突时,使用不同的方法找到下一个索引(线性探测、二次探测、双哈希策略、迭代的探测)
#所以A的负载因子可以超过1,B的负载因子最多为1
# 10.7
from TheCode.ch07.positional_list import PositionalList
class HashPositional(PositionalList):
    """PositionalList whose positions are hashable (exercise 10.7)."""

    class Position(PositionalList.Position):
        """Position that defines __hash__."""

        def __hash__(self):
            """Hash the pair (position class, id of the referenced element)."""
            element_identity = id(self.element())
            return hash((type(self), element_identity))
#hp=HashPositional()
#a=hp.add_first('1')
#b=hp.add_first('2')
#c=hp.first() # c==b
#print('a==b{},a==c{},b==c{},\nhash(a)={}\nhash(b)={}\nhash(b)={}'.format(a==b,a==c,b==c,hash(a),hash(b),hash(c)))
# 10.8
#不同数字hash不同
#不同字母hash不同
#数字和字母在hash中要有分界
#每个字符顺序位置要能够分开
# 10.9 一个None代表一个位置,纵向代表链表的深度
#l=[12,44,13,88,23,94,11,39,20,16,5]
#index=[(3*i+5)%11 for i in l] #[8, 5, 0, 5, 8, 1, 5, 1, 10, 9, 9]
# None None None None None None None None None None None
# 13 94 44 12 16 20
# 39 88 23 5
# 11
# 10.10
# None None None None None None None None None None None
# 13 94 39 16 5 44 88 11 12 23 20
# 10.11
#l=[12,44,13,88,23,94,11,39,20,16,5]
#res=[None]*11
#for i in l:
# for j in range(100):
# if res[((3*i+5)%11+j**2)%11]==None: # quadratic probing (j**2; j^2 would be bitwise XOR)
# res[((3*i+5)%11+j**2)%11]=i
# break
# print(res)
# 10.12
#l=[12,44,13,88,23,94,11,39,20,16,5]
#res=[None]*11
#for i in l:
# for j in range(100):
# if res[(( 3*i+5)%11+j*(7-i%7))%11]==None: # double hashing: j*(7-(i mod 7))
# res[(( 3*i+5)%11+j*(7-i%7))%11]=i
# break
# print(res)
# 10.13 最坏的情况为O(n^2),每次hash都是一个索引,每次需要遍历存入的所有节点来添加新的节点
# 最好的情况为O(n), 每次hash有不同的索引,可以直接通过索引直接存储
# 10.14
#l=[54,28,41,18,10,36,25,38,12,90]
#res=[None]*10
#for i in l:
# index=7-i%7
# if res[index]==None:
# res[index]=i
# elif type(res[index])==int:
# res[index]=[res[index],i]
# else:
# res[index].append(i)
#print(res)
# 10.15
from TheCode.ch10.hash_map_base import HashMapBase
class NewHashMapBase(HashMapBase):
    """Hash-map base class with a configurable maximum load factor.

    Overrides __init__ and __setitem__ only.
    """

    def __init__(self, cap=11, p=109345121, y=0.5):
        """Create an empty hash-table map.

        y       maximum load factor before the table is resized (default 0.5)
        cap     initial table size (default 11)
        p       positive prime used for MAD (default 109345121)

        Raises ValueError if y is not positive.
        """
        if y <= 0:
            raise ValueError("load factor y must be positive")
        super().__init__(cap, p)
        self._y = y

    def __setitem__(self, k, v):
        """Store value v under key k, growing the table when it gets too full."""
        j = self._hash_function(k)
        # subroutine maintains self._n
        self._bucket_setitem(j, k, v)
        # Keep load factor <= self._y. Multiplying avoids the rounding error
        # of floor-dividing by the reciprocal 1/y.
        if self._n > len(self._table) * self._y:
            # number 2^x - 1 is often prime
            self._resize(2 * len(self._table) - 1)
# 10.16
#Algorithms _find_index:
# input: the key k
# output: three times
# i=0
# sentry_index=None
# repeat:
# if (h(k)+i^2)%N ==key: return True,sentry_index
# if (h(k)+i^2)%N ==sentry and sentry_index==None: sentry_index=i
# if (h(k)+i^2)%N ==None return False,sentry_index
# i+=1
#Algorithms set_item:
# input: the key k
# output
# tuple=_find_index(k)
# set or add by tuple's content
#Algorithms del_item:
# input: the key k
# output
# tuple=_find_index(k)
# del or raise by tuple's content
# 10.17
from TheCode.ch10.probe_hash_map import ProbeHashMap
class QuadraticProbe(ProbeHashMap):
    """Open-addressing hash map that resolves collisions by quadratic probing.

    Only the _find_slot method differs from the parent class.
    """

    @staticmethod
    def _probe_offsets(size):
        """Generate the offsets to try from the home bucket for a table of the given size."""
        # Quadratic phase: 0, 1, 4, 9, ...  Since (i + size)**2 is congruent
        # to i**2 modulo size, nothing new is reached after `size` steps.
        for i in range(size):
            yield i * i
        # Linear fallback: reaches every bucket, so the search always ends
        # even if the quadratic phase found neither the key nor an empty slot.
        for step in range(1, size):
            yield step

    def _find_slot(self, j, k):
        """Search for key k starting from home bucket j.

        Return (success, index) tuple, described as follows:
        If match was found, success is True and index denotes its location.
        If no match found, success is False and index denotes first available
        slot (None only when every bucket is occupied by some other key).
        """
        size = len(self._table)
        home = j                                # keep the original hash value
        first_avail = None
        for offset in self._probe_offsets(size):
            # i * i is a true square; `i ^ 2` would be a bitwise XOR.
            j = (home + offset) % size
            if self._is_available(j):
                if first_avail is None:
                    first_avail = j             # mark this as first avail
                if self._table[j] is None:
                    return (False, first_avail)  # search has failed
            elif k == self._table[j]._key:
                return (True, j)                # found a match
        return (False, first_avail)             # every bucket was examined
#qp=QuadraticProbe()
#for i in range(10):
# qp[i]=i
#print(qp[3])
#del qp[3]
#print(list(qp.items()))
#print(qp[3])
# 10.18 哈希表的更新操作需要大量的时间。为了维持顺序,很多元素需要调整顺序。
# 10.19 将SortedTableMap中的table换成 link,然后将二分查找替换为非二分查找等
# 10.20 log2n+2n +log(2n-1)+2n-1···· + log1+1 = O(log(2n!)+4n^2)
# 10.21 可以
# 新版仅仅返回 high+1
# low和high最终会在指向同一个元素,然后再次减少指向的区间,使high<low,返回k应该存在的索引
# 10.22 指的是有序映射的应用中最大值集应用
# 当每对元素的价格和性能都低于前一项时,每个元素都不被dominated,所以只需要添加元素,需O(3nlogn)的时间,最后含有n个这样的数值对
# 当每对元素的价格低于前一项性能高于前一项时,每一项的都dominate前面的所有项,需要将前面的元素杀出,需要O(5nlogn)时间
# 10.23 略
# 10.24 使用书中描述的跳跃查找方法
#Algorithms __delitem__:
# input: key k
# p=start
# temp=SkipSearch(k)
# del temp
# 10.25
from collections.abc import MutableSet
class SimpleSet(MutableSet):
    """Abstract set that implements pop and isdisjoint on top of the MutableSet abstract methods."""

    # Exercise 10.25
    def pop(self):
        """Remove one element of the set and return it.

        Raises ValueError if the set is empty.
        """
        if len(self) == 0:                      # abstract method __len__
            raise ValueError("none item")
        victim = next(iter(self))               # abstract method __iter__
        self.discard(victim)                    # abstract method discard
        return victim

    # Exercise 10.26
    def isdisjoint(self, other):
        """Return True if self and other have no element in common.

        other must support len(), iteration and the ``in`` operator.
        """
        smaller, larger = self, other
        if len(smaller) > len(larger):
            smaller, larger = larger, smaller   # iterate over the smaller set
        # Disjoint means that no element of one set occurs in the other.
        return not any(element in larger for element in smaller)
# 10.26 在SimpleSet中 isdisjoint方法实现
# 10.27
# A使用链表:将同一天当过生日的信息存储在一段连续的链中,每当度过一天就将当天过生日的节点添加到链表尾部
# B使用多映射哈希表:将多个值映射到日期键中
#A方法可以在O(n)的空间复杂度完成,查询近两天过生日的人的时间复杂度较低,但是查询指定时间过生日的时间复杂度较高
#B方法可以在O(1)的时间复杂度查询任意一天过生日的人。
# 10.28
from TheCode.ch10.unsorted_table_map import UnsortedTableMap
class AlterUTM(UnsortedTableMap):
    """UnsortedTableMap with its own setdefault method (exercise 10.28)."""

    def setdefault(self, k, v):
        """Return the value of k if k is in the map; otherwise store (k, v) and return v."""
        try:
            return self[k]                      # try to find k
        except KeyError:
            # Append directly: the failed lookup has already scanned the
            # whole table, so a second search is unnecessary.
            self._table.append(self._Item(k, v))
            return v
# t=AlterUTM()
# for i in range(5):
# t[i]=str(i)
# for i in range(3,7):
# temp=t.setdefault(i,i)
# print(type(temp),temp)
# for i in range(7):
# print(type(t[i]),t[i])
# 10.29
from TheCode.ch10.probe_hash_map import ProbeHashMap
class AlterPHM(ProbeHashMap):
    """ProbeHashMap with its own setdefault method (exercise 10.29)."""

    def setdefault(self, k, v):
        """Return the value of k if k is in the map; otherwise store (k, v) and return v."""
        j = self._hash_function(k)
        found, s = self._find_slot(j, k)
        if found:
            return self._table[s]._value        # existing value is left untouched
        self._table[s] = self._Item(k, v)       # insert new item
        self._n += 1                            # size has increased
        # The resize check must run before returning, otherwise insertions
        # made through setdefault never grow the table.
        if self._n > len(self._table) // 2:     # keep load factor <= 0.5
            # number 2^x - 1 is often prime
            self._resize(2 * len(self._table) - 1)
        return v
# t=AlterPHM()
# for i in range(3):
# t[i]=i
# for i in range(5):
# temp=t.setdefault(i,str(i))
# print(temp,type(temp))
# print(type(t.setdefault(1,2)))
# for i in range(5):
# print(t[i],type(t[i]))
# 10.30
from TheCode.ch10.chain_hash_map import ChainHashMap
class AlterCHM(ChainHashMap):
''' rewrite setdatult method'''
def setdefault(self,k,v):
j=self._hash_function(k)
if self._table[j]==None: # if the item that index of map is none
self[k]=v # protected add the item to the map
return v
else:
try: # if the chain can't find the item
return self._table[j][k]
except KeyError:
self._table[j]._table.append(self._table[j]._Item(k,v))
return v
if self._n > len(self._table) //