问题背景
批量下载文档时未能自动命名,导致文件整理比较麻烦;除了文档外,还下载了文档索引文件,如下所示:
索引信息(TXT文件)
文档信息(PDF文件)
问题需求
利用索引信息,在索引文件中找出与文档库中各文档题目最相似的字符串,并用该字符串对相应的文档进行重命名。
测试环境
|----PyCharm Python3.8
|----系统:Windows
匹配算法
为了快速计算两句话的相似度,且不需要训练模型,经查阅资料,确定采用余弦相似度算法,代码如下:
文件命名为 cosin_computer.py
import math
import re
from collections import Counter
def _word_counts(text):
    """Return a Counter of the lower-cased, letters-only words in *text*."""
    counts = Counter()
    # Split on any whitespace run (not only a literal space) so that words
    # separated by tabs or newlines are not glued together once the
    # non-letter characters are stripped below.
    for token in text.split():
        # Keep ASCII letters only; digits, punctuation and non-ASCII
        # characters are dropped, exactly as the pattern specifies.
        word = re.sub('[^a-zA-Z]', '', token).lower()
        if word:
            counts[word] += 1
    return counts


def compute_cosine(text_a, text_b):
    """Return the cosine similarity of two texts based on word frequencies.

    Each text is split on whitespace, every token is reduced to its ASCII
    letters and lower-cased, and the resulting word counts are compared as
    term-frequency vectors.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The cosine similarity rounded to two decimals, or 0.0 when either
        text contains no usable word.
    """
    counts_a = _word_counts(text_a)
    counts_b = _word_counts(text_b)

    # Only words present in both texts contribute to the dot product.
    dot = 0
    for word, freq in counts_a.items():
        dot += freq * counts_b.get(word, 0)

    sq_a = 0
    for freq in counts_a.values():
        sq_a += freq * freq
    sq_b = 0
    for freq in counts_b.values():
        sq_b += freq * freq

    norm_product = math.sqrt(sq_a) * math.sqrt(sq_b)
    if norm_product == 0:
        # At least one text has no words: define the similarity as 0.
        return 0.0
    return round(dot / norm_product, 2)
参考链接:link.
主函数 main.py
import os
from Py20201027_TextProcess.cosin_computer import compute_cosine #根据自己的情况做调整
# Load the index (citations) text file. The context manager guarantees the
# handle is closed once the lines have been read.
with open('E:/F01_Researches/R01_ProductServiceSystem/PSS09-会议文献/CIRP Conference on Industrial Product-Service Systems/11th_ScienceDirect_citations.txt','r',encoding='UTF-8') as file:
    # Read all lines (each keeps its trailing newline).
    lines = file.readlines()
line_count = len(lines)
print(line_count)
# Debugging aid (disabled): print every 11th line starting at index 1,
# stopping once the index exceeds 100.
# for i in range(1, line_count, 11):
#     print(i, '-->', lines[i])
#     if i > 100:
#         break
file_root = 'E:/F01_Researches/R01_ProductServiceSystem/PSS09-会议文献/CIRP Conference on Industrial Product-Service Systems'
# List the entries of the '11th' sub-directory (the document library).
files = os.listdir(os.path.join(file_root, '11th'))
# Title preprocessing: drop the last '-'-separated segment of each file name
# and join the remaining segments with single spaces.
new_files = [' '.join(name.split('-')[:-1]) for name in files]
# Line preprocessing: turn every '-' in the index lines into a space.
new_lines = [row.replace('-', ' ') for row in lines]
# Debugging aid (disabled): compare names before and after preprocessing.
# for i in range(10):
#     print("Before: ", files[i], '|-->After: ', new_files[i])
count = 1
# Similarity computation: every 11th line starting at index 1 is treated as
# a title and compared against every preprocessed file name.
for i in range(1, line_count, 11):
    print('第',count,'-->',lines[i])  # show the current title line
    max_sim = 0.
    # NOTE: when no file scores above 0, index 0 is reported by default.
    max_str_index = 0
    print('\t\tSim: ',end='\t')
    for j, candidate in enumerate(new_files):
        # Compare the hyphen-normalised line (new_lines) rather than the raw
        # one: file names had their hyphens turned into spaces, so hyphenated
        # title words must be split the same way to be able to match.
        sim = compute_cosine(new_lines[i], candidate)
        print(sim, end='\t')
        if sim > max_sim:
            max_sim = sim
            max_str_index = j
        # print('\t', candidate, '\t', '相似度:', sim)
    print('\n\t\t相似度最高的为:',files[max_str_index],'\t最大相似度为:',max_sim)
    count += 1