android汉明距离,simhash+汉明距离计算文本类似度

****因为最近须要作大规模的文本类似度的计算,因此用到了simhash+汉明距离来快速计算文本的类似度。**

**simhash的原理以下图:其中的weight采用的是jieba的tf-idf的结果。****

bVbkT6V?w=640&h=430

**附上python3的源代码:**

import math

import jieba

import jieba.analysepython

class SimHash(object):app

def __init__(self):

pass

def getBinStr(self, source):

if source == "":

return 0

else:

x = ord(source[0]) << 7

m = 1000003

mask = 2 ** 128 - 1

for c in source:

x = ((x * m) ^ ord(c)) & mask

x ^= len(source)

if x == -1:

x = -2

x = bin(x).replace('0b', '').zfill(64)[-64:]

return str(x)

def getWeight(self, source):

# fake weight with keyword

return ord(source)

def unwrap_weight(self, arr):

ret = ""

for item in arr:

tmp = 0

if int(item) > 0:

tmp = 1

ret += str(tmp)

return ret

def simHash(self, rawstr):

seg = jieba.cut(rawstr)

keywords = jieba.analyse.extract_tags("|".join(seg), topK=100, withWeight=True)

ret = []

for keyword, weight in keywords:

binstr = self.getBinStr(keyword)

keylist = []

for c in binstr:

weight = math.ceil(weight)

if c == "1":

keylist.append(int(weight))

else:

keylist.append(-int(weight))

ret.append(keylist)

# 对列表进行"降维"

rows = len(ret)

cols = len(ret[0])

result = []

for i in range(cols):

tmp = 0

for j in range(rows):

tmp += int(ret[j][i])

if tmp > 0:

tmp = "1"

elif tmp <= 0:

tmp = "0"

result.append(tmp)

return "".join(result)

def getDistince(self, hashstr1, hashstr2):

length = 0

for index, char in enumerate(hashstr1):

if char == hashstr2[index]:

continue

else:

length += 1

return length

if name == "__main__":spa

simhash = SimHash()

s1 = u'I am very happy'

s2 = u'I am very happu'

hash1 = simhash.simHash(s1)

hash2 = simhash.simHash(s2)

distince = simhash.getDistince(hash1, hash2)

value = 5

print("海明距离:", distince, "断定距离:", value, "是否类似:", distince<=value)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值