**由于最近需要做大规模的文本相似度计算，所以用到了 simhash + 汉明距离来快速计算文本的相似度。**
**simhash 的原理如下图，其中的 weight 采用的是 jieba 的 tf-idf 结果。**
**附上python3的源代码:**
import math
import jieba
import jieba.analyse
class SimHash(object):
def __init__(self):
pass
def getBinStr(self, source):
    """Hash a string into a 64-character string of '0'/'1' digits.

    The hash is a multiply-and-XOR rolling hash over the code points of
    ``source`` (multiplier 1000003, intermediate state masked to 128
    bits), XORed with the length of ``source``. Only the lowest 64 bits
    are kept, left-padded with zeros.

    Args:
        source: The text (e.g. a keyword) to hash.

    Returns:
        A ``str`` of exactly 64 binary digits. An empty ``source`` yields
        64 zeros, so the return type is always ``str`` and callers can
        iterate over the bits without special-casing empty input.
    """
    bit_width = 64
    if source == "":
        # Keep the return type consistent with the non-empty case.
        return "0" * bit_width

    multiplier = 1000003
    mask = 2 ** 128 - 1
    x = ord(source[0]) << 7
    for c in source:
        x = ((x * multiplier) ^ ord(c)) & mask
    # x is masked to a non-negative value and len() is non-negative, so
    # the result can never be -1; no sentinel remapping is needed.
    x ^= len(source)
    # Zero-pad short values and keep only the lowest 64 bits of long ones.
    return format(x, "b").zfill(bit_width)[-bit_width:]
def getWeight(self, source):
    """Return a placeholder weight for ``source``: its Unicode code point.

    This is a stand-in weight rather than a real term weight. ``ord``
    accepts only a single-character string, so ``source`` must be exactly
    one character long.
    """
    code_point = ord(source)
    return code_point
def unwrap_weight(self, arr):
ret