由于之前写过多值 hash 的部分,这里上传对应的代码。这段 Python 代码实现了多值 hash,并且能够处理碰撞。如果追求效率,可以改写为其他语言,并去掉其中处理碰撞的部分。
# -*- coding: utf-8 -*-
# created by axuanwu 2015.1.25
# key word: hash count
import numpy as np
import math
def getseed(str1):
    """Compute the integer hash fingerprint of a term.

    Characters are packed into one big integer from left to right: a
    character whose code is above 256 is given 12 bits of room, any other
    character 6 bits.  The packed value is then XOR-folded until it fits
    into 256 bits.

    :param str1: the term (its UTF-8 form); every item must be accepted by ord()
    :return: non-negative integer fingerprint smaller than 2 ** 256
    """
    fold_mask = 2 ** 256 - 1
    fingerprint = 0
    for ch in str1:
        code = ord(ch)
        width = 12 if code > 256 else 6
        fingerprint = (fingerprint << width) + code
    # NOTE(review): the pasted source lost its indentation; the fold is placed
    # after the loop because a `while` is only needed when the value can
    # exceed 256 bits by more than one character's worth -- confirm.
    # Fold the overflow back into the low 256 bits so the number stays bounded.
    while (fingerprint >> 256) > 0:
        fingerprint = (fingerprint & fold_mask) ^ (fingerprint >> 256)
    return fingerprint
class Mcard(object):
    """Multi-position hash table that maps string keys to integer ids.

    Each key is hashed to ``M_num`` positions of the bucket array ``MCARD``
    (length ``N_max``, always a power of two).  A new key receives the next
    id (``i_key``) and that id is written into every one of its positions
    that is still 0.  Because ids only grow and occupied buckets are never
    overwritten, the largest id found at a key's positions is the key's own
    id whenever at least one position was free at insertion time.  When all
    positions of a key are already taken, the key is re-hashed with a numeric
    suffix ("probe") appended, up to ``_MAX_PROBES`` times.

    Usage: call ``setbase()`` (optional) and ``set_card()`` before any
    ``update_card()`` / ``read_card()``; until ``set_card()`` runs, ``MCARD``
    is only a placeholder list.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    # Maximum number of re-hash attempts when all positions of a key collide.
    _MAX_PROBES = 10

    def __init__(self):
        self.M_num = 8  # number of bucket positions per key
        self.N_max = 16777216  # bucket array length (2 ** nummax2)
        self.nummax2 = 24  # log2 of N_max
        self.MCARD = [0]  # placeholder; set_card() allocates the real array
        self._record = 1  # number of stored keys + 1, see record_num()
        self.Opath = ""  # not used by any method of this class
        self.index = [0] * 8  # positions computed by the last getindex()
        self.__keys = ['first_NULL']  # id -> key; id 0 means "no key"
        self.i_key = 1  # id that the next new key will receive
        self.index2 = [0] * 8  # positions from getindex(..., for_up=True)

    def get_keys(self, iii=-1):
        """Return stored keys.

        :param iii: id of the key to fetch; -1 (default) fetches all keys
        :return: list of all real keys when iii == -1, otherwise the key
                 stored under id ``iii`` (id 0 yields the 'first_NULL'
                 placeholder)
        """
        if iii == -1:
            return self.__keys[1:]
        return self.__keys[iii]

    def getindex(self, str1, for_up=False):
        """Compute the ``M_num`` bucket positions of a term.

        The result is stored in ``self.index`` (or in ``self.index2`` when
        ``for_up`` is true); nothing is returned.

        :param str1: the term to hash (passed to the module-level getseed)
        :param for_up: write the positions to ``index2`` instead of ``index``
        """
        seed = getseed(str1)
        mask = self.N_max - 1
        target = self.index2 if for_up else self.index
        # NOTE(review): the original also contained
        #   if (seed >> 64) < 0: seed1 = seed * (n + 15048796327)
        # seed is never negative, so that branch never ran and is dropped
        # here.  It may have been meant to stretch short seeds -- confirm the
        # intent before reinstating it, since it would move every key.
        for n in range(0, self.M_num):
            k = n + 1
            a = 0
            seed1 = seed
            # Consume the seed nummax2 bits at a time, mixing each chunk in
            # and rotating the accumulator left by k bits.
            while seed1 > 0:
                a ^= (seed1 & mask) + k
                a = ((a << k) & mask) | (a >> (self.nummax2 - k))
                seed1 >>= self.nummax2
            # The mask is a no-op whenever M_num < nummax2; it only keeps the
            # position inside the table in the degenerate M_num == nummax2 case.
            target[n] = a & mask

    def _probe(self, str1, attempt):
        """Hash ``str1`` for the given probe number and return its bucket values.

        Side effect: ``self.index`` holds the probed positions afterwards.
        """
        self.getindex(str1 + str(attempt))
        return self.MCARD[self.index]

    def update_card(self, str1, num=1, add=False):
        """Store a term if it is not stored yet.

        A term that is already present is left untouched (no duplicate id is
        created).  If every probe is completely occupied, a warning is
        printed and the term is NOT registered.

        :param str1: the term (its UTF-8 form)
        :param num: value to add for the term; currently unused
        :param add: currently unused
        """
        for i in range(0, self._MAX_PROBES):  # at most _MAX_PROBES attempts
            if i > 5:
                print(i)  # diagnostic: an unusually long probe chain
            slots = self._probe(str1, i)
            newest = int(slots.max())
            if 0 < newest < len(self.__keys) and self.__keys[newest] == str1:
                return  # already stored under this probe
            if slots.min() == 0:
                # New term: claim every still-free position of this probe.
                for iii in self.index:
                    if self.MCARD[iii] == 0:
                        self.MCARD[iii] = self.i_key
                self.__keys.append(str1)
                self.i_key += 1
                self._record += 1
                return
        print("warning : bad case happened , card array maybe too short when update " + str1)  # too few hash buckets

    def read_card(self, str1, for_up=False):
        """Look a term up, or (``for_up=True``) search a probe with a free bucket.

        :param str1: the term (its UTF-8 form)
        :param for_up: when true, only search for the first probe that still
                       has a free bucket; ``self.index`` then holds that
                       probe's positions and the return value is always 0
        :return: the id of the term (index into the key list), or 0 when the
                 term is not stored.  When all probes are exhausted a warning
                 is printed and 0 is returned.
        """
        for i in range(0, self._MAX_PROBES):  # at most _MAX_PROBES attempts
            if i > 5:
                print(i)  # diagnostic: an unusually long probe chain
            slots = self._probe(str1, i)
            if for_up:
                if slots.min() == 0:
                    return 0
                continue
            newest = int(slots.max())
            if newest == 0:  # nothing stored at any position: not present
                return 0
            if newest < self.N_max and str1 == self.__keys[newest]:
                return newest
        print("warning : bad case happened , card array maybe too short when update " + str1)  # too few hash buckets
        return 0

    def setbase(self, num1=16777216, num2=8):
        """Configure the table geometry (does not allocate the bucket array).

        The array length is ``num1`` rounded up to the next power of two.
        The exponent is computed with integer arithmetic; a floating-point
        ``log`` can land just above the true value for an exact power of two
        and would then double the table.

        :param num1: requested bucket array length (at least 2)
        :param num2: number of bucket positions per term; must not exceed
                     log2 of the array length because it is used as a
                     rotation distance in getindex()
        :raises ValueError: if the parameters cannot yield a working table
        """
        size = int(math.ceil(num1))
        if size < 2:
            raise ValueError("num1 must be at least 2, got %r" % (num1,))
        bits = (size - 1).bit_length()  # smallest bits with 2 ** bits >= size
        if not 1 <= num2 <= bits:
            raise ValueError(
                "num2 must be between 1 and %d for num1=%r, got %r" % (bits, num1, num2))
        self.nummax2 = bits
        self.N_max = 2 ** bits
        self.M_num = num2
        self.index = [0] * num2
        self.index2 = [0] * num2

    def set_card(self, kk=-1, dd=8):
        """Allocate a zero-filled bucket array, discarding any previous one.

        Only ``MCARD`` is replaced; the key list and the id counter are left
        as they are.

        :param kk: bucket array length; -1 keeps the current geometry
        :param dd: positions per term, only used when ``kk`` is given
        :return: 0 when ``kk`` is -1, otherwise None
        """
        if kk == -1:
            self.MCARD = np.zeros(self.N_max, dtype=int)
            return 0
        self.setbase(kk, dd)
        self.MCARD = np.zeros(self.N_max, dtype=int)

    def record_num(self):
        """
        :return: number of terms stored in the dictionary
        """
        return self._record - 1

    def card_test(self):
        """Print two collision indicators for the current table.

        With ``occupied`` = number of non-empty buckets, the printed values are
        ``log10(occupied / N_max) * M_num`` and
        ``log10((_record * M_num - occupied) / occupied) * M_num``.
        For an empty table a short notice is printed instead, because both
        expressions are undefined there.
        """
        occupied = int((np.asarray(self.MCARD) > 0).sum())
        if occupied == 0:
            print("card is empty: no occupied buckets to measure")
            return
        positions = self.M_num
        fill_index = math.log(1.0 * occupied / self.N_max, 10) * positions
        overlap = (1.0 * self._record * positions - occupied) / occupied
        collide_index = math.log(overlap, 10) * positions
        print("%s %s" % (fill_index, collide_index))
以上部分放在 Mcard.py 中,是 hash 存储的实体。
__author__ = 'axuanwu'
import Mcard

card = Mcard.Mcard()
card.setbase(2 ** 13, 8)  # configuration: 2**13 buckets, 8 positions per key
card.set_card()  # allocate the zero-filled hash buckets

# Store the strings "1" .. "2000", with "999" swapped for "99901".
for number in range(1, 2001):
    card.update_card('99901' if number == 999 else str(number))

# Report every number whose string form does not read back as itself.
for number in range(1, 2001):
    wanted = str(number)
    position = card.read_card(wanted)
    stored = card.get_keys(position)
    if stored != wanted:
        print("%s %s %s" % (number, position, stored))

# Id of the replacement key, then the full key list.
print(card.read_card(str(99901)))
print(card.get_keys())
以上为测试 Mcard 的简单实例,完成以下功能:
1. 申请长度为 8*1024 的 hash 桶,多值数为 8;
2. 把 1 到 2000 的字符串形式存入 hash,其中 '999' 被替换为 '99901';
3. 检查 1 到 2000 的字符串形式是否存在于 hash 中;
4. 计算出 '99901' 在 self.__keys 中的下标。