提出问题
假设现在有两篇文章,而这两篇文章的标题十分相似,想通过计算机判断这两篇文章的相似程度,如果相似程度超过60%就断定这两篇文章有抄袭的嫌疑。需要如何实现呢?
余弦相似度
余弦相似度 (Cosine Similarity) 通过计算两个向量的夹角余弦值来评估他们的相似度。将向量根据坐标值,绘制到向量空间中,求得他们的夹角,并得出夹角对应的余弦值,此余弦值就可以用来表征这两个向量的相似性。夹角越小,余弦值越接近于1,它们的方向越吻合,则越相似。
假定A和B是两个n维向量,A是 [A1, A2, …, An] ,B是 [B1, B2, …, Bn] ,则A与B的夹角θ的余弦等于:

$$\cos\theta = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2}\,\sqrt{\sum_{i=1}^{n} B_i^2}}$$
实现方法
1、将两段文本(或者只是两句话)分词;
def CutWord(fileName1, fileName2):
    """Segment a Chinese text file with jieba and write the result to a new file.

    Each line of ``fileName1`` is stripped of every character that is not a
    digit, an ASCII letter, or a CJK ideograph, segmented with jieba (precise
    mode), and written to ``fileName2`` as space-separated tokens, one input
    line per output line.

    Args:
        fileName1: path of the source text file (UTF-8).
        fileName2: path of the output file to create (UTF-8).
    """
    try:
        # Iterate the file directly instead of reading it twice (the original
        # read the whole file once just to count its lines).  ``with`` closes
        # both handles even on error, so no fragile finally-block is needed.
        with open(fileName1, "r", encoding='utf-8') as source, \
             open(fileName2, "w", encoding='utf-8') as result_fenci:
            # Keep digits, ASCII letters and CJK ideographs; drop everything else.
            reg = r"[^0-9A-Za-z\u4e00-\u9fa5]"
            for line in source:
                line = re.sub(reg, '', line.rstrip('\n'))
                # Precise mode: each character belongs to exactly one token.
                fenci = jieba.cut(line, cut_all=False)
                result_fenci.write(" ".join(fenci) + "\n")
    except Exception:
        # print_exc takes no exception argument; passing one (as the original
        # did) raises a TypeError inside the handler.
        traceback.print_exc()
2.两篇文章的关键词合并成一个集合,相同的合并,不同的添加
# 统计关键词及个数
def CountKey(fileName, resultName):
try:
# 计算文件行数
lineNums = len(open(fileName, 'r',encoding='utf-8').readlines())
print('文件行数: ' + str(lineNums))
# 统计格式 格式<Key:Value> <属性:出现个数>
i = 0
table = {}
source = open(fileName, "r",encoding='utf-8')
result = open(resultName, "w",encoding='utf-8')
while i < lineNums:
line = source.readline()
line = line.rstrip('\n')
print(line)
words = line.split(" ") # 空格分隔
# 字典插入与赋值
for word in words:
if word != "" and word in table: # 如果存在次数加1
num = table[word]
table[word] = num + 1
elif word != "": # 否则初值为1
table[word] = 1
i = i + 1
# 键值从大到小排序 函数原型:sorted(dic,value,reverse)
dic = sorted(table.items(), key=lambda d: d[1], reverse=True)
for i in range(len(dic)):
# print 'key=%s, value=%s' % (dic[i][0],dic[i][1])
result.write("<" + dic[i][0] + ":" + str(dic[i][1]) + ">\n")
return dic
except Exception as e:
traceback.print_exc(e)
finally:
source.close()
result.close()
print('END\n\n')
3.计算每篇文章对于这个集合的词的词频 TF-IDF算法计算权重
4.生成两篇文章各自的词频向量
5.计算两个向量的余弦相似度,值越大表示越相似
# 统计关键词及个数 并计算相似度
def MergeKeys(dic1, dic2):
# 合并关键词 采用三个数组实现
arrayKey = []
for i in range(len(dic1)):
arrayKey.append(dic1[i][0]) # 向数组中添加元素
for i in range(len(dic2)):
if dic2[i][0] in arrayKey:
print('has_key', dic2[i][0])
else: # 合并
arrayKey.append(dic2[i][0])
else:
print('\n\n')
# 计算词频 infobox可忽略TF-IDF
arrayNum1 = [0] * len(arrayKey)
arrayNum2 = [0] * len(arrayKey)
# 赋值arrayNum1
for i in range(len(dic1)):
key = dic1[i][0]
value = dic1[i][1]
j = 0
while j < len(arrayKey):
if key == arrayKey[j]:
arrayNum1[j] = value
break
else:
j = j + 1
# 赋值arrayNum2
for i in range(len(dic2)):
key = dic2[i][0]
value = dic2[i][1]
j = 0
while j < len(arrayKey):
if key == arrayKey[j]:
arrayNum2[j] = value
break
else:
j = j + 1
print(arrayNum1)
print(arrayNum2)
print(len(arrayNum1), len(arrayNum2), len(arrayKey))
# 计算两个向量的点积
x = 0
i = 0
while i < len(arrayKey):
x = x + arrayNum1[i] * arrayNum2[i]
i = i + 1
print(x)
# 计算两个向量的模
i = 0
sq1 = 0
while i < len(arrayKey):
sq1 = sq1 + arrayNum1[i] * arrayNum1[i] # pow(a,2)
i = i + 1
print(sq1)
i = 0
sq2 = 0
while i < len(arrayKey):
sq2 = sq2 + arrayNum2[i] * arrayNum2[i]
i = i + 1
print(sq2)
result = float(x) / (math.sqrt(sq1) * math.sqrt(sq2))
return result
# Entry point of the demo.
def main():
    """Segment two articles, count their keywords, and print their cosine similarity."""
    fileName1 = "Tourist_spots_5A/gugong.txt"
    resultName1 = "Result_Key_BD.txt"
    fileName2 = "Tourist_spots_5A/tiantan.txt"
    resultName2 = "Tourist_spots_5A/Result_Key_001.txt"

    # Step 1: tokenize both source texts with jieba.
    CutWord(fileName1, 'Tourist_spots_5A/gugong_fenci.txt')
    CutWord(fileName2, 'Tourist_spots_5A/tiantan_fenci.txt')

    # Step 2: keyword frequencies of document 1 ...
    dic1 = CountKey('Tourist_spots_5A/gugong_fenci.txt', resultName1)
    # ... and of document 2.
    dic2 = CountKey('Tourist_spots_5A/tiantan_fenci.txt', resultName2)

    # Step 3: merge vocabularies and report the cosine similarity.
    print(MergeKeys(dic1, dic2))