Exercise: Write a program that computes a simhash fingerprint for a document, using any reasonable hash function for the words. Use the program to detect duplicate documents on your computer and report the detection accuracy. How does the accuracy change with the fingerprint size?
The basic SimHash pipeline
1. Tokenize and extract keyword:weight pairs (feature:weight)
Run keyword extraction over the text (tokenization plus weight computation) and keep the n highest-weighted (keyword, weight) pairs; jieba.analyse.extract_tags() does both at once. Each text thus yields a set of n (feature:weight) pairs, as in the sketch below.
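A minimal sketch of step 1 (the sample sentence is made up; jieba must be installed):

import jieba.analyse

text = '我想打印一张照片,可以帮我打印照片吗'
# top-10 keywords with their TF-IDF weights
for feature, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True):
    print(feature, weight)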
2. Hash
Apply an ordinary hash function to each extracted word (feature) to get a fixed-length binary string, turning the set into (hash:weight) pairs.
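Any reasonable word hash will do here. A sketch that uses the low 64 bits of MD5 as the bit string (MD5 is a stand-in chosen for illustration; the implementations below use a custom string hash instead):

import hashlib

def word_hash(feature, bits=64):
    # interpret the MD5 digest as an integer and keep the low `bits` bits
    h = int(hashlib.md5(feature.encode('utf-8')).hexdigest(), 16)
    return bin(h & ((1 << bits) - 1))[2:].zfill(bits)

print(word_hash('照片'))  # a 64-character string of 0s and 1s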
3. Weighting
Weight each hash by its word's weight, W = hash · weight: a 1 bit contributes +weight and a 0 bit contributes -weight. For example, a word hashed to (010111:5) becomes the list [-5, 5, -5, 5, 5, 5] after this step.
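Step 3 on that example, as a list comprehension:

weight, bits = 5, '010111'
print([weight if b == '1' else -weight for b in bits])  # [-5, 5, -5, 5, 5, 5]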
4. Merging
Sum the weighted vectors of all the words column-wise into a single sequence. For example, [-5,5,-5,5,5,5], [-3,-3,-3,3,-3,3] and [1,-1,-1,1,1,1] accumulate to [-7,1,-9,9,3,9]. With a 64-bit word hash, each document therefore yields one list of length 64.
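Step 4 on the three example vectors:

import numpy as np

vectors = [[-5, 5, -5, 5, 5, 5],
           [-3, -3, -3, 3, -3, 3],
           [1, -1, -1, 1, 1, 1]]
print(np.sum(vectors, axis=0))  # [-7  1 -9  9  3  9]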
5. Binarization
Walk over the accumulated n-bit signature and set each position to 1 if its value is greater than 0 and to 0 otherwise; the result is the document's simhash. For example, [-7,1,-9,9,3,9] becomes 010111. The similarity of two documents is then judged by the Hamming distance between their simhash values.
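Step 5 plus the distance computation, against a made-up second fingerprint:

signed = [-7, 1, -9, 9, 3, 9]
fp = ''.join('1' if v > 0 else '0' for v in signed)
print(fp)  # 010111
# the Hamming distance is the number of bit positions where two fingerprints differ
print(sum(a != b for a, b in zip(fp, '011101')))  # 2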
Code
Python implementation for Chinese text
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np


class simhash:
    # constructor
    def __init__(self, content):
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, content):
        seg = jieba.cut(content)
        # jieba.analyse.set_stop_words('stopword.txt')
        # jieba extracts the top-10 keywords by TF-IDF
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        for feature, weight in keyWords:
            # use the float TF-IDF weight as-is; truncating it to int would zero out most keywords
            binstr = self.string_hash(feature)  # ordinary hash of the feature
            temp = []
            for c in binstr:
                if c == '1':   # bit is 1: append +weight
                    temp.append(weight)
                else:          # bit is 0: append -weight
                    temp.append(-weight)
            keyList.append(temp)
        if not keyList:  # nothing could be extracted (e.g., unreadable text)
            return '00'
        # column-wise sum of the per-keyword weight vectors
        listSum = np.sum(np.array(keyList), axis=0)
        simhash = ''
        for i in listSum:
            if i > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash  # the fingerprint: 1 wherever a column sum is positive

    # Hamming distance
    def hamming_distance(self, other):
        t1 = '0b' + self.hash
        t2 = '0b' + other.hash
        n = int(t1, 2) ^ int(t2, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        # a ratio float(self.hash) / float(other.hash) would read the binary string
        # as a decimal number, which is meaningless; derive it from the Hamming distance
        return 1 - self.hamming_distance(other) / len(self.hash)

    # hash a token (a fixed-width variant of Python's classic string hash)
    def string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            # keep the low 64 bits; this slice sets the fingerprint size
            x = bin(x).replace('0b', '').zfill(64)[-64:]
            return str(x)


if __name__ == '__main__':
    hash1 = simhash('我想洗照片')
    hash2 = simhash('可以洗一张照片吗')
    print("Hamming distance:", hash1.hamming_distance(hash2))
    print("Text similarity:", hash1.similarity(hash2))
Python implementation for English text
class simhash:
    # constructor
    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    # toString
    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, tokens):
        v = [0] * self.hashbits
        for t in [self._string_hash(x) for x in tokens]:  # t is the token's ordinary hash
            for i in range(self.hashbits):
                bitmask = 1 << i
                if t & bitmask:
                    v[i] += 1  # bit i is 1: increment column i
                else:
                    v[i] -= 1  # bit i is 0: decrement column i
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint  # set bit i of the fingerprint wherever column i is >= 0

    # Hamming distance
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1  # clear the lowest set bit
        return tot

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        # a ratio of the two fingerprints read as integers says nothing about how many
        # bits differ, so derive the similarity from the Hamming distance instead
        return 1 - self.hamming_distance(other) / self.hashbits

    # hash a token (a variable-width variant of Python's classic string hash)
    def _string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x


if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())
    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())
    s = 'This is a test'
    hash3 = simhash(s.split())
    print(hash1, hash2, hash3)
    print(hash1.hamming_distance(hash2), "\t", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "\t", hash1.similarity(hash3))
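Because hashbits is a constructor argument here, this version is handy for the exercise's question about fingerprint size. A sketch that repeats one comparison at several sizes (the sample strings are arbitrary); in general, shorter fingerprints map unrelated tokens onto the same bits more often, so false positives rise and detection accuracy drops as the fingerprint shrinks:

for bits in (16, 32, 64, 128):
    a = simhash('This is a test string for testing'.split(), hashbits=bits)
    b = simhash('This is a test'.split(), hashbits=bits)
    d = a.hamming_distance(b)
    print(bits, d, round(d / bits, 3))  # absolute and relative distance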
Python implementation of the exercise
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
import re

txt1 = r'./test1.txt'
txt2 = r'./test2.txt'


class simhash:
    # constructor
    def __init__(self, content):
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    # build the simhash fingerprint
    def simhash(self, content):
        count = 0
        seg = jieba.cut(content)
        # jieba extracts the top-10 keywords by TF-IDF
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        # weight each keyword
        for feature, weight in keyWords:
            # scale the float TF-IDF weight by 10 before truncating, so one decimal digit survives
            weight = int(weight * 10)
            # ordinary hash of the feature
            binstr = self.string_hash(feature)
            # report the fingerprint size once
            if count == 0:
                print("Fingerprint size:", len(binstr))
                count += 1
            temp = []
            for c in binstr:
                if c == '1':   # bit is 1: append +weight
                    temp.append(weight)
                else:          # bit is 0: append -weight
                    temp.append(-weight)
            keyList.append(temp)
        if not keyList:  # nothing could be extracted (e.g., unreadable text)
            return '00'
        # column-wise sum of the per-keyword weight vectors
        listSum = np.sum(np.array(keyList), axis=0)
        simhash = ''
        for i in listSum:
            if i > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash  # the fingerprint: 1 wherever a column sum is positive

    # Hamming distance
    def hamming_distance(self, other):
        t1 = '0b' + self.hash
        t2 = '0b' + other.hash
        n = int(t1, 2) ^ int(t2, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    # similarity as the fraction of agreeing bits
    def similarity(self, other):
        return 1 - self.hamming_distance(other) / len(self.hash)

    # hash a token (a variant of Python's classic string hash)
    def string_hash(self, source):
        if source == "":
            return 0
        else:
            # start from the first character, shifted left by 7 bits
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            # fold every character of the keyword into the hash
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            # change .zfill(32)[-32:] (e.g., to .zfill(16)[-16:]) to change the fingerprint size
            x = bin(x).replace('0b', '').zfill(32)[-32:]
            return str(x)


def txt_line(txt1, txt2):
    punc = r'./ <>_ - - = ", 。,?!“”:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    # read the first file (GBK-encoded Chinese text)
    with open(txt1, 'r', encoding='gbk') as f:
        list1 = f.read()
    text1 = re.sub(r'[^\w]+', '', list1)   # strip everything but word characters
    string = ''.join(jieba.cut(text1))
    line1 = re.sub(r"[{}]+".format(punc), "", string)  # strip any remaining punctuation
    # read the second file
    with open(txt2, 'r', encoding='gbk') as f:
        list2 = f.read()
    text2 = re.sub(r'[^\w]+', '', list2)
    string = ''.join(jieba.cut(text2))
    line2 = re.sub(r"[{}]+".format(punc), "", string)
    hash1 = simhash(line1)
    hash2 = simhash(line2)
    print("Hamming distance:", hash1.hamming_distance(hash2))
    print("Text similarity:", hash1.similarity(hash2))


if __name__ == '__main__':
    txt_line(txt1, txt2)
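To answer the exercise's accuracy question, the detector has to run over a folder of documents whose duplicates are known. A minimal sketch, assuming GBK-encoded .txt files and a 3-bit distance threshold (folder layout, encoding, and threshold are all assumptions):

import itertools
import os

def detect_duplicates(folder, threshold=3):
    # fingerprint every .txt file in the folder
    prints = {}
    for name in os.listdir(folder):
        if name.endswith('.txt'):
            with open(os.path.join(folder, name), 'r', encoding='gbk') as f:
                prints[name] = simhash(f.read())
    # flag every pair whose fingerprints are within the threshold
    return [(a, b) for a, b in itertools.combinations(prints, 2)
            if prints[a].hamming_distance(prints[b]) <= threshold]

Accuracy is then the fraction of flagged pairs that really are duplicates, checked by hand or against labels; rerunning after changing the .zfill slice in string_hash shows how that accuracy moves with the fingerprint size.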