import numpy as np
import jieba
import json
import math
class DocumentSimilarity():
    """Compare text documents by the cosine similarity of their term counts.

    The workflow is:

    1. ``get_slot`` tokenises the three configured documents with jieba and
       stores the de-duplicated vocabulary (the "slot") as JSON.
    2. ``get_TF`` / ``get_all_TF`` turn each document into a vector of raw
       term counts aligned with that vocabulary.
    3. ``get_similarity`` computes the cosine similarity of two such vectors.
    """

    def __init__(self):
        # The three documents that are compared.
        self.pathA = r'合成生物学.txt'
        self.pathB = r'自动化专业.txt'
        self.pathC = r'计算机专业.txt'
        # JSON file in which the shared vocabulary is persisted.
        self.mySlot = r'slot.json'
        # Tokens that are dropped from both the vocabulary and the counts.
        self.stopWords = [',','。','《','》',';',':','‘','’','”','“','?','!','的']

    def _read_tokens(self, path: str) -> list:
        """Return the jieba tokens of the file at ``path`` without stop words."""
        with open(path, 'r', encoding='utf-8') as f_obj:
            text = f_obj.read()
        # A set makes the per-token stop-word test O(1).
        stop_words = set(self.stopWords)
        return [token for token in jieba.lcut(text) if token not in stop_words]

    def _load_slot(self) -> list:
        """Return the stored vocabulary, building it first if the file is missing."""
        try:
            with open(self.mySlot, 'r', encoding='utf-8') as f_obj:
                return json.load(f_obj)
        except FileNotFoundError:
            # First run: nothing has been persisted yet, so create it now.
            self.get_slot()
        with open(self.mySlot, 'r', encoding='utf-8') as f_obj:
            return json.load(f_obj)

    def get_slot(self) -> None:
        """Build the vocabulary of all three documents and write it to ``self.mySlot``.

        Each document is tokenised on its own, exactly as ``get_TF`` does, so
        every token ``get_TF`` later produces is guaranteed to be in the
        vocabulary. Tokenising the concatenated texts instead could segment
        the text differently at the points where two files are joined.

        The vocabulary is sorted so that the file content (and therefore the
        vector layout) is the same on every run.
        """
        vocabulary = set()
        for path in (self.pathA, self.pathB, self.pathC):
            vocabulary.update(self._read_tokens(path))
        with open(self.mySlot, 'w', encoding='utf-8') as f_obj:
            f_obj.write(json.dumps(sorted(vocabulary)))

    def show_slot(self) -> None:
        """Print the stored vocabulary and wait for the user to press Enter."""
        print(self._load_slot())
        input('点击任意键关闭窗口')

    def get_TF(self, slot, path):
        """Count how often each vocabulary word occurs in one document.

        Args:
            slot: Sequence of vocabulary words; position ``i`` of the result
                holds the count for ``slot[i]``.
            path: Path of the UTF-8 text file to analyse.

        Returns:
            A list of integer counts with the same length as ``slot``.
            Stop words and tokens that are not in ``slot`` are ignored.
        """
        # Map word -> first position once instead of calling slot.index()
        # for every token; setdefault keeps the first position on duplicates,
        # matching what slot.index() would have returned.
        index_of = {}
        for position, word in enumerate(slot):
            index_of.setdefault(word, position)

        counts = np.zeros(len(slot), int)
        for token in self._read_tokens(path):
            position = index_of.get(token)
            if position is not None:
                counts[position] += 1
        return list(counts)

    def get_all_TF(self):
        """Return the term-count vectors of documents A, B and C as a tuple.

        The vocabulary is read from ``self.mySlot``; if that file does not
        exist yet it is created via ``get_slot`` first.
        """
        slot = self._load_slot()
        TF_A = self.get_TF(slot, self.pathA)
        TF_B = self.get_TF(slot, self.pathB)
        TF_C = self.get_TF(slot, self.pathC)
        return TF_A, TF_B, TF_C

    def get_similarity(self, x, y) -> float:
        """Return the cosine similarity of the vectors ``x`` and ``y``.

        Args:
            x: Sequence of numbers.
            y: Sequence of numbers of the same length as ``x``.

        Returns:
            ``dot(x, y) / (|x| * |y|)`` as a float. If either vector has zero
            length (for example an empty document), ``0.0`` is returned
            instead of dividing by zero.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        norm_product = np.linalg.norm(x) * np.linalg.norm(y)
        if norm_product == 0:
            return 0.0
        return float(np.dot(x, y) / norm_product)

    def show_result(self):
        """Print the similarity of documents A and B, then of B and C."""
        TF_A, TF_B, TF_C = self.get_all_TF()
        print(self.get_similarity(TF_A, TF_B))
        print(self.get_similarity(TF_B, TF_C))
# Script entry point: print the pairwise document similarities.
if __name__ == '__main__':
    DocumentSimilarity().show_result()
NLP实践:计算三篇文章的余弦相似度
最新推荐文章于 2021-11-28 12:00:00 发布