1、简介
最近一直在研究NLP的文本相似度算法,本文将利用TF-IDF特征向量和Simhash指纹计算中文文本的相似度。
2、计算过程
- 准备测试数据
- 预处理读到的数据
- 加载数据到Map中
- 输入用户问题
- 利用TF-IDF特征向量和Simhash指纹,计算用户问题与预处理后配置文件中各条数据的相似度分值
3、效果图
4、核心代码
try:
text = re_test.run(question) # 通过正则 查找匹配数据
doc_token = jt.tokens(text) # 预处理,分词
doc_feat = fb.compute(doc_token)
doc_fl = DocFeatLoader(smb, doc_feat) # 对象包含两个参数 # fingerprint 指纹分值 # feat_vec 包含元组的列表
# 预处理后的配置文件
contentFlListMap = nodeMap
p_score_list = []
if nodeId in contentFlListMap.keys():
nodeFlList = contentFlListMap[nodeId]
print("nodeFilist",nodeFlList)
for i in range(len(nodeFlList)):
p_score_dict={}
dist = cosine_distance_nonzero(nodeFlList[i]["lableDataFeatureVector"].feat_vec, doc_fl.feat_vec, norm=False)
p_score_dict["score"] = dist
p_score_dict["labelData"] = nodeFlList[i]["labelData"]
p_score_dict["targetNodeId"] = nodeFlList[i]["targetNodeId"]
p_score_dict["conditionId"] = nodeFlList[i]["conditionId"]
p_score_list.append(p_score_dict)
p_score_list = sorted(p_score_list, key=lambda score : score["score"], reverse=True)
print("Sorted:",p_score_list)
Complete_MayBeL4 = []
Complete_MayBeL4Score = []
Complete_MayBeL4ID = []
Complete_MayBeL4Max = 3
for i, el in enumerate(p_score_list):
p_label = p_score_list[i]["labelData"]
p_score = p_score_list[i]["score"]
p_conditionId = p_score_list[i]["conditionId"]
if len(Complete_MayBeL4) < Complete_MayBeL4Max:
Complete_MayBeL4.append(p_label)
Complete_MayBeL4Score.append(p_score)
Complete_MayBeL4ID.append(p_conditionId)
else:
break
print("************************************")
print("用户问题:", question)
print("相似问(Max=%s):%s"%(Complete_MayBeL4Max,Complete_MayBeL4))
print("特征值(Max=%s):%s"%(Complete_MayBeL4Max,Complete_MayBeL4Score))
print("ID:",Complete_MayBeL4ID)
return "", "", "", "", "", ""
except Exception as e:
print("************************************")
print("Error textSimilarity:", str(e))
print("************************************")
5、此项目Github源码分享
https://github.com/ShaShiDiZhuanLan/Demo_TFIDF_Simhash_Python