Exercise: Write a program that computes a simhash fingerprint for a document, using any reasonable hash function for the words. Use the program to detect duplicate documents on your computer and report the detection accuracy. How does the accuracy change with the fingerprint size?
The basic SimHash pipeline
1. Tokenize and extract keyword:weight pairs (feature:weight)
Run keyword extraction over the text (tokenization plus weight computation) and keep the n highest-weighted (keyword, weight) pairs; jieba.analyse.extract_tags() does both at once. Each text thus yields a set of n (feature:weight) pairs, as in the sketch below.
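A minimal sketch of step 1 (the sample sentence is made up; jieba must be installed):

import jieba.analyse

text = '我想打印一张照片,可以帮我打印照片吗'
# top-10 keywords with their TF-IDF weights
for feature, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True):
    print(feature, weight)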
2. Hash
Apply an ordinary hash function to each extracted word (feature) to get a fixed-length binary string, turning the set into (hash:weight) pairs.
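Any reasonable word hash will do here. A sketch that uses the low 64 bits of MD5 as the bit string (MD5 is a stand-in chosen for illustration; the implementations below use a custom string hash instead):

import hashlib

def word_hash(feature, bits=64):
    # interpret the MD5 digest as an integer and keep the low `bits` bits
    h = int(hashlib.md5(feature.encode('utf-8')).hexdigest(), 16)
    return bin(h & ((1 << bits) - 1))[2:].zfill(bits)

print(word_hash('照片'))  # a 64-character string of 0s and 1s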
3. Weighting
Weight each hash by its word's weight, W = hash · weight: a 1 bit contributes +weight and a 0 bit contributes -weight. For example, a word hashed to (010111:5) becomes the list [-5, 5, -5, 5, 5, 5] after this step.
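Step 3 on that example, as a list comprehension:

weight, bits = 5, '010111'
print([weight if b == '1' else -weight for b in bits])  # [-5, 5, -5, 5, 5, 5]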
4. Merging
Sum the weighted vectors of all the words column-wise into a single sequence. For example, [-5,5,-5,5,5,5], [-3,-3,-3,3,-3,3] and [1,-1,-1,1,1,1] accumulate to [-7,1,-9,9,3,9]. With a 64-bit word hash, each document therefore yields one list of length 64.
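Step 4 on the three example vectors:

import numpy as np

vectors = [[-5, 5, -5, 5, 5, 5],
           [-3, -3, -3, 3, -3, 3],
           [1, -1, -1, 1, 1, 1]]
print(np.sum(vectors, axis=0))  # [-7  1 -9  9  3  9]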
5. Binarization
Walk over the accumulated n-bit signature and set each position to 1 if its value is greater than 0 and to 0 otherwise; the result is the document's simhash. For example, [-7,1,-9,9,3,9] becomes 010111. The similarity of two documents is then judged by the Hamming distance between their simhash values.
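Step 5 plus the distance computation, against a made-up second fingerprint:

signed = [-7, 1, -9, 9, 3, 9]
fp = ''.join('1' if v > 0 else '0' for v in signed)
print(fp)  # 010111
# the Hamming distance is the number of bit positions where two fingerprints differ
print(sum(a != b for a, b in zip(fp, '011101')))  # 2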
Code
Python implementation for Chinese text
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np


class simhash:
    # constructor
    def __init__(self, content):
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, content):
        seg = jieba.cut(content)
        # jieba.analyse.set_stop_words('stopword.txt')
        # jieba extracts the top-10 keywords by TF-IDF
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        for feature, weight in keyWords:
            # use the float TF-IDF weight as-is; truncating it to int would zero out most keywords
            binstr = self.string_hash(feature)  # ordinary hash of the feature
            temp = []
            for c in binstr:
                if c == '1':   # bit is 1: append +weight
                    temp.append(weight)
                else:          # bit is 0: append -weight
                    temp.append(-weight)
            keyList.append(temp)
        if not keyList:  # nothing could be extracted (e.g., unreadable text)
            return '00'
        # column-wise sum of the per-keyword weight vectors
        listSum = np.sum(np.array(keyList), axis=0)
        simhash = ''
        for i in listSum:
            if i > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash  # the fingerprint: 1 wherever a column sum is positive

    # Hamming distance
    def hamming_distance(self, other):
        t1 = '0b' + self.hash
        t2 = '0b' + other.hash
        n = int(t1, 2) ^ int(t2, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        # a ratio float(self.hash) / float(other.hash) would read the binary string
        # as a decimal number, which is meaningless; derive it from the Hamming distance
        return 1 - self.hamming_distance(other) / len(self.hash)

    # hash a token (a fixed-width variant of Python's classic string hash)
    def string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            # keep the low 64 bits; this slice sets the fingerprint size
            x = bin(x).replace('0b', '').zfill(64)[-64:]
            return str(x)


if __name__ == '__main__':
    hash1 = simhash('我想洗照片')
    hash2 = simhash('可以洗一张照片吗')
    print("Hamming distance:", hash1.hamming_distance(hash2))
    print("Text similarity:", hash1.similarity(hash2))
Python implementation for English text
class simhash:
    # constructor
    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    # toString
    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, tokens):
        v = [0] * self.hashbits
        for t in [self._string_hash(x) for x in tokens]:  # t is the token's ordinary hash
            for i in range(self.hashbits):
                bitmask = 1 << i
                if t & bitmask:
                    v[i] += 1  # bit i is 1: increment column i
                else:
                    v[i] -= 1  # bit i is 0: decrement column i
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint  # set bit i of the fingerprint wherever column i is >= 0

    # Hamming distance
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1  # clear the lowest set bit
        return tot

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        # a ratio of the two fingerprints read as integers says nothing about how many
        # bits differ, so derive the similarity from the Hamming distance instead
        return 1 - self.hamming_distance(other) / self.hashbits

    # hash a token (a variable-width variant of Python's classic string hash)
    def _string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x


if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())
    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())
    s = 'This is a test'
    hash3 = simhash(s.split())
    print(hash1, hash2, hash3)
    print(hash1.hamming_distance(hash2), "\t", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "\t", hash1.similarity(hash3))
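Because hashbits is a constructor argument here, this version is handy for the exercise's question about fingerprint size. A sketch that repeats one comparison at several sizes (the sample strings are arbitrary); in general, shorter fingerprints map unrelated tokens onto the same bits more often, so false positives rise and detection accuracy drops as the fingerprint shrinks:

for bits in (16, 32, 64, 128):
    a = simhash('This is a test string for testing'.split(), hashbits=bits)
    b = simhash('This is a test'.split(), hashbits=bits)
    d = a.hamming_distance(b)
    print(bits, d, round(d / bits, 3))  # absolute and relative distance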
Python implementation of the exercise
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
import re

txt1 = r'./test1.txt'
txt2 = r'./test2.txt'


class simhash:
    # constructor
    def __init__(self, content):
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, content):
        count = 0
        seg = jieba.cut(content)
        # jieba extracts the top-10 keywords by TF-IDF
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        # weight each keyword
        for feature, weight in keyWords:
            # scale the float TF-IDF weight by 10 before truncating, so one decimal digit survives
            weight = int(weight * 10)
            # ordinary hash of the feature
            binstr = self.string_hash(feature)
            # report the fingerprint size once
            if count == 0:
                print("Fingerprint size:", len(binstr))
                count += 1
            temp = []
            for c in binstr:
                if c == '1':   # bit is 1: append +weight
                    temp.append(weight)
                else:          # bit is 0: append -weight
                    temp.append(-weight)
            keyList.append(temp)
        if not keyList:  # nothing could be extracted (e.g., unreadable text)
            return '00'
        # column-wise sum of the per-keyword weight vectors
        listSum = np.sum(np.array(keyList), axis=0)
        simhash = ''
        for i in listSum:
            if i > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash  # the fingerprint: 1 wherever a column sum is positive

    # Hamming distance
    def hamming_distance(self, other):
        t1 = '0b' + self.hash
        t2 = '0b' + other.hash
        n = int(t1, 2) ^ int(t2, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        return 1 - self.hamming_distance(other) / len(self.hash)

    # hash a token (a variant of Python's classic string hash)
    def string_hash(self, source):
        if source == "":
            return 0
        else:
            # start from the first character, shifted left by 7 bits
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            # fold every character of the keyword into the hash
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            # change .zfill(32)[-32:] (e.g., to .zfill(16)[-16:]) to change the fingerprint size
            x = bin(x).replace('0b', '').zfill(32)[-32:]
            return str(x)


def txt_line(txt1, txt2):
    punc = r'./ <>_ - - = ", 。,?!“”:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    # read the first file (GBK-encoded Chinese text)
    with open(txt1, 'r', encoding='gbk') as f:
        list1 = f.read()
    text1 = re.sub(r'[^\w]+', '', list1)   # strip everything but word characters
    string = ''.join(jieba.cut(text1))
    line1 = re.sub(r"[{}]+".format(punc), "", string)  # strip any remaining punctuation
    # read the second file
    with open(txt2, 'r', encoding='gbk') as f:
        list2 = f.read()
    text2 = re.sub(r'[^\w]+', '', list2)
    string = ''.join(jieba.cut(text2))
    line2 = re.sub(r"[{}]+".format(punc), "", string)
    hash1 = simhash(line1)
    hash2 = simhash(line2)
    print("Hamming distance:", hash1.hamming_distance(hash2))
    print("Text similarity:", hash1.similarity(hash2))


if __name__ == '__main__':
    txt_line(txt1, txt2)
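To answer the exercise's accuracy question, the detector has to run over a folder of documents whose duplicates are known. A minimal sketch, assuming GBK-encoded .txt files and a 3-bit distance threshold (folder layout, encoding, and threshold are all assumptions):

import itertools
import os

def detect_duplicates(folder, threshold=3):
    # fingerprint every .txt file in the folder
    prints = {}
    for name in os.listdir(folder):
        if name.endswith('.txt'):
            with open(os.path.join(folder, name), 'r', encoding='gbk') as f:
                prints[name] = simhash(f.read())
    # flag every pair whose fingerprints are within the threshold
    return [(a, b) for a, b in itertools.combinations(prints, 2)
            if prints[a].hamming_distance(prints[b]) <= threshold]

Accuracy is then the fraction of flagged pairs that really are duplicates, checked by hand or against labels; rerunning after changing the .zfill slice in string_hash shows how that accuracy moves with the fingerprint size.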