0 项目说明
基于CNN和词向量的句子相似性度量
提示:适合用于课程设计或毕业设计,工作量达标,源码开放
1 开发环境
Anaconda + Pycharm
2 项目说明
本毕业设计主要针对句子相似度(尤其是长句相似度)的计算,使用GoogleNews预训练的词向量模型:
GoogleNews-vectors-negative300.bin
3 数据来源
数据存放于本项目DataSet下
4 项目运行
- clone 项目到本地
- 配置config.json文件:model_path为预训练模型的存放路径;dataset_path为最终用于实验评估的数据集的存放路径
- 从main.py开始运行,首先要点击初始化按钮加载预训练模型
- 在句子1和句子2处输入要计算相似度的句子
- 计算结果分别显示在“相似度”、“WJ”、“WMD距离”三处
- 可以每次换一个句子,并点击计算
- 计算完成,点击退出
5 结论验证
6 项目源码
from gensim.models import KeyedVectors
class Word2Vec:
    """Word- and sentence-level similarity helpers backed by a word2vec model.

    The model is loaded once in the constructor through
    ``KeyedVectors.load_word2vec_format`` and kept on ``self.model``; every
    other method only queries that object.
    """

    def __init__(self, modelPath, kind='bin'):
        """Load a word2vec model from disk.

        modelPath: path of the model file.
        kind: model file type. 'bin' loads the file in binary mode; any
            other value (e.g. 'txt') loads it as a text-format file.
        return: None
        """
        # Keep the caller-supplied string intact instead of rebinding it
        # to a boolean.
        is_binary = kind == 'bin'
        print('loading word2vector model...')
        self.model = KeyedVectors.load_word2vec_format(
            modelPath, binary=is_binary, unicode_errors='ignore')

    def get_word_vector(self, word):
        """Return the vector stored for a word.

        word: the word to look up.
        return: ``self.model[word]``, or None when the word is not in the
            model.
        """
        if word in self.model:
            return self.model[word]
        return None

    def word_similarity(self, word1, word2):
        """Compute the similarity between two words.

        word1: first word.
        word2: second word.
        return: ``self.model.similarity(word1, word2)``, or 0 when either
            word is not in the model.
        """
        if word1 not in self.model or word2 not in self.model:
            return 0
        return self.model.similarity(word1, word2)

    def get_similar_Words(self, word, maxReturnNum):
        """Return the words most similar to the given word.

        word: the query word.
        maxReturnNum: maximum number of results (passed as ``topn``).
        return: the result of ``self.model.similar_by_word`` -- documented
            by the original author as [(word, similarity), ...] -- or None
            when the word is not in the model.
        """
        if word not in self.model:
            return None
        return self.model.similar_by_word(word, topn=maxReturnNum)

    def __cal_max_similarity(self, centerWord, wordList):
        """Return the best similarity between a word and a list of words.

        centerWord: the word to compare.
        wordList: the candidate words.
        return: 1 when centerWord itself appears in wordList; otherwise the
            largest non-zero similarity found; 0 when no candidate produced
            a usable score.
        """
        # An exact match short-circuits without consulting the model.
        if centerWord in wordList:
            return 1

        # -1 doubles as the "nothing found yet" sentinel.
        maxSimi = -1
        for word in wordList:
            score = self.word_similarity(centerWord, word)
            # word_similarity reports 0 for out-of-vocabulary words, so a
            # zero score is skipped rather than counted.
            if score == 0:
                continue
            if score > maxSimi:
                maxSimi = score

        if maxSimi == -1:
            return 0
        return maxSimi

    def sentence_similarity(self, sentence1Words, sentence2Words):
        """Compute the similarity of two tokenised sentences.

        Each word is scored by its best match in the other sentence and the
        scores from both directions are averaged.

        sentence1Words: word list of sentence 1.
        sentence2Words: word list of sentence 2.
        return: the averaged similarity, or 0 when either list is empty.
        """
        if len(sentence1Words) == 0 or len(sentence2Words) == 0:
            return 0
        vector1 = [self.__cal_max_similarity(word, sentence2Words)
                   for word in sentence1Words]
        vector2 = [self.__cal_max_similarity(word, sentence1Words)
                   for word in sentence2Words]
        return (sum(vector1) + sum(vector2)) / (len(vector1) + len(vector2))

    def sentence_weight_similarity(self, sentence1Words, sentence2Words,
                                   weightVector1, weightVector2):
        """Compute the weighted similarity of two tokenised sentences.

        Every word carries a weight; each word's best-match score is
        multiplied by its weight and the total is normalised by the sum of
        all weights.

        sentence1Words: word list of sentence 1.
        sentence2Words: word list of sentence 2.
        weightVector1: weights for sentence 1, one per word.
        weightVector2: weights for sentence 2, one per word.
        return: the weighted similarity; 0 when either sentence is empty or
            when the weights sum to zero.
        raises: ValueError when a word list and its weight vector differ in
            length.
        """
        if len(sentence1Words) == 0 or len(sentence2Words) == 0:
            return 0
        if (len(sentence1Words) != len(weightVector1)
                or len(sentence2Words) != len(weightVector2)):
            raise ValueError(
                'length of word list and weight vector is different')

        # Guard the normaliser: all-zero weights would otherwise raise
        # ZeroDivisionError. Treat that case like "no usable evidence".
        totalWeight = sum(weightVector1) + sum(weightVector2)
        if totalWeight == 0:
            return 0

        vector1 = [self.__cal_max_similarity(word, sentence2Words) * weight
                   for word, weight in zip(sentence1Words, weightVector1)]
        vector2 = [self.__cal_max_similarity(word, sentence1Words) * weight
                   for word, weight in zip(sentence2Words, weightVector2)]
        return (sum(vector1) + sum(vector2)) / totalWeight

    # Custom addition: thin wrapper that delegates to the model's wmdistance.
    def sentence_wmd_distance(self, sentence1Words, sentence2Words):
        """Compute the WMD distance between two tokenised sentences.

        sentence1Words: word list of sentence 1.
        sentence2Words: word list of sentence 2.
        return: the value of ``self.model.wmdistance`` for the two lists.
        """
        return self.model.wmdistance(sentence1Words, sentence2Words)
7 最后
**项目分享:** https://gitee.com/asoonis/htw