# Own implementation of SimHash (simhash 自己实现)
# Reference: https://www.cnblogs.com/-wenli/p/11150476.html
# Uses SimHash to test the similarity of two texts.
class simhash:
    """SimHash fingerprint of a text, with Hamming-distance comparison.

    The fingerprint is a string of '0'/'1' characters (normally 64 bits,
    or the sentinel '00' when no keywords could be extracted).
    """

    def __init__(self, content):
        # NOTE(review): the computed fingerprint is stored on `self.simhash`,
        # shadowing the method of the same name after construction.  Kept
        # as-is because callers access both `obj.simhash` and the class name.
        self.simhash = self.simhash(content)

    def __str__(self):
        return str(self.simhash)

    def simhash(self, content):
        """Build the SimHash fingerprint for *content* (an iterable of
        tokens), returning it as a '0'/'1' string.

        Uses jieba's TF-IDF extraction for the top-10 weighted keywords.
        Returns '00' when nothing could be extracted (e.g. undecodable
        input).
        """
        # NOTE(review): the original author patched jieba's tfidf.py so tags
        # sort by (weight, word) instead of weight only, i.e.
        #   tags = sorted(freq.items(), key=itemgetter(1, 0), reverse=True)
        # — confirm the local jieba carries that patch if exact fingerprints
        # must be reproducible.
        keyWord = jieba.analyse.extract_tags(
            '|'.join(content), topK=10, withWeight=True, allowPOS=())
        keyList = []
        for feature, weight in keyWord:
            weight = int(weight * 10)  # scale TF-IDF weight to a small int
            feature = self.string_hash(feature)
            # Signed weight vector: +w where the feature hash bit is 1,
            # -w where it is 0.
            temp = []
            for bit in feature:
                if bit == '1':
                    temp.append(weight)
                else:
                    temp.append(-weight)
            keyList.append(temp)
        if keyList == []:  # nothing extracted — sentinel fingerprint
            return '00'
        # Column-wise sum of all weight vectors; the sign of each column
        # becomes one fingerprint bit.
        sums = np.sum(np.array(keyList), axis=0)
        simhash = ''
        for value in sums:
            if value > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash

    def similarity(self, other):
        """Crude similarity ratio between two fingerprints.

        NOTE(review): this interprets the bit strings as decimal numbers,
        which is not a meaningful SimHash metric — `hammingDis` is the
        intended comparison.  Behavior preserved for compatibility.
        """
        a = float(self.simhash)
        b = float(other.simhash)
        if a > b:
            return b / a
        else:
            return a / b

    def string_hash(self, source):
        """Hash *source* to a 64-character '0'/'1' string.

        Returns the int 0 (not a string) for an empty input — callers in
        this file never pass empty keywords, so the loop over the result
        never sees the int case.
        """
        if source == "":
            return 0
        # Variant of Python's old FNV-style string hash, masked to 128 bits
        # and then truncated to the low 64 bits.
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:  # mirrors CPython's reserved hash value convention
            x = -2
        x = bin(x).replace('0b', '').zfill(64)[-64:]
        return str(x)

    def hammingDis(self, com):
        """Hamming distance (count of differing bits) between the two
        fingerprints — the meaningful SimHash similarity metric."""
        n = int('0b' + self.simhash, 2) ^ int('0b' + com.simhash, 2)
        return bin(n).count('1')
# Similarity detection between two txt files
def _clean_segment(raw_text, stoplist, punc):
    """Normalize *raw_text* into a space-free string of significant tokens:
    strip non-word characters, segment with jieba, keep CJK tokens longer
    than one character that are not stopwords, then strip punctuation."""
    text = re.sub(r'[^\w]+', '', raw_text)
    X, Y = '\u4e00', '\u9fa5'  # CJK unified ideograph range
    words = [w for w in jieba.cut(text)
             if len(w) > 1 and X <= w <= Y and w not in stoplist]
    return re.sub(r"[{}]+".format(punc), "", ''.join(words))


def get_line(fr1, fr2, max_distance=3):
    """Compare two text files for similarity using SimHash.

    Parameters
    ----------
    fr1, fr2 : str
        Paths of the two UTF-8 text files to compare.
    max_distance : int, optional
        Hamming-distance threshold at or below which the texts are
        considered similar.  Default 3; the original notes suggest 18
        as an alternative cut-off to be validated.

    Returns
    -------
    bool
        True when the fingerprints' Hamming distance is <= max_distance.
    """
    # Punctuation characters removed from the token stream.
    punc = './ <>_ - - = ", 。,?!“”:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    # Stopword set loaded once from disk (UTF-8 stopword list).
    stoplist = {}.fromkeys(
        [line.rstrip() for line in
         codecs.open(r"data/文章停用词.txt", 'r', 'utf-8')])
    # Read and normalize both files.
    with open(fr1, encoding='utf-8') as f:
        line1 = _clean_segment(f.read(), stoplist, punc)
    with open(fr2, encoding='utf-8') as f:
        line2 = _clean_segment(f.read(), stoplist, punc)
    hash1 = simhash(line1.split())
    hash2 = simhash(line2.split())
    # Texts are similar when the SimHash Hamming distance is small.
    if hash1.hammingDis(hash2) <= max_distance:
        print('文本相似')
        return True
    else:
        print('文本不相似')
        return False
# Similarity-check test driver.
# Both paths point at the same article, so the texts are trivially similar.
fr1 = 'data/article/当我们对组件二次封装时我们在封装什么.txt'
fr2 = 'data/article/当我们对组件二次封装时我们在封装什么.txt'

if __name__ == '__main__':
    get_line(fr1, fr2)