In day-to-day work we occasionally need to decide whether two articles are essentially duplicates. One way is to compute the similarity between them; below is a Python implementation:
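The idea is to represent each article as a bag-of-words count vector and measure the cosine of the angle between the two vectors. For count vectors a and b this is the standard cosine similarity:

$$
\cos\theta = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}
$$

A value close to 1 means the two articles use nearly the same words in nearly the same proportions; a value close to 0 means they share almost no vocabulary.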
import jieba
import math
import re
# Read the two txt files into strings s1 and s2
s1 = open('./1.txt', 'r', encoding='utf-8').read()
s2 = open('./2.txt', 'r', encoding='utf-8').read()
# Build the stop-word list: one stop word per line of stop_words.txt,
# plus the newline and space tokens that jieba can emit
stopwords = []
fstop = open('stop_words.txt', 'r', encoding='utf-8-sig')
for eachWord in fstop:
    eachWord = re.sub("\n", "", eachWord)
    stopwords.append(eachWord)
stopwords.append("\n")
stopwords.append(" ")
fstop.close()
# Tokenize both texts with jieba (full mode) and filter out stop words
s1_cut = [i for i in jieba.cut(s1, cut_all=True) if (i not in stopwords) and i != '']
s2_cut = [i for i in jieba.cut(s2, cut_all=True) if (i not in stopwords) and i != '']
word_set = set(s1_cut).union(set(s2_cut))
print(s1_cut)  # tokens from 1.txt
print(s2_cut)  # tokens from 2.txt
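# Note: cut_all=True is jieba's "full mode", which emits every word it can
# find, including overlapping segmentations (the example in jieba's docs cuts
# 我来到北京清华大学 into 我/来到/北京/清华/清华大学/华大/大学). The default
# precise mode (cut_all=False) often yields cleaner vectors for comparison.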
# Number every word that appears in either article with a dictionary
word_dict = dict()
for i, word in enumerate(word_set):
    word_dict[word] = i
# Bag-of-words model: count how often each word occurs in each document,
# giving one vector per document
s1_cut_code = [0] * len(word_dict)
for word in s1_cut:
    s1_cut_code[word_dict[word]] += 1
s2_cut_code = [0] * len(word_dict)
for word in s2_cut:
    s2_cut_code[word_dict[word]] += 1
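# Worked example (hypothetical words): with word_dict = {"apple": 0,
# "pear": 1, "banana": 2}, the token list ["apple", "apple", "pear"]
# becomes the count vector [2, 1, 0].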
# Compute the cosine similarity of the two count vectors
dot = 0  # renamed from `sum` to avoid shadowing the built-in
sq1 = 0
sq2 = 0
for i in range(len(s1_cut_code)):
    dot += s1_cut_code[i] * s2_cut_code[i]
    sq1 += pow(s1_cut_code[i], 2)
    sq2 += pow(s2_cut_code[i], 2)
try:
    result = round(float(dot) / (math.sqrt(sq1) * math.sqrt(sq2)), 3)
except ZeroDivisionError:  # one of the texts had no tokens left
    result = 0.0
print("\nCosine similarity of the two articles: %.3f" % result)