利用字向量计算文本的相似度,我用的字向量是维基百科(100维)和BERT(768维)已经训练好的。
提供字向量网盘的下载链接
100维:
https://pan.baidu.com/s/17i3w13jjSBOyDDg8cPNdYA 提取码:zt46
768维:
https://pan.baidu.com/s/1I77O9-jTTJNcDIYEozM7tg 提取码:6eh1
1.首先获取所有的字向量
利用维基百科训练的字向量
利用BERT训练的字向量
def get_word_vector(emb_file, word_dim):
    """Load pre-trained character vectors from a text embedding file.

    Each valid line has the form ``<char> <v1> <v2> ... <v_word_dim>``.
    Lines whose token count is not ``word_dim + 1`` are skipped and counted.

    :param emb_file: path to the embedding file
    :param word_dim: dimensionality of each character vector
    :return: dict mapping character -> ``np.ndarray`` of shape ``(word_dim,)``
             (dtype float32); empty dict if the file cannot be read/parsed
    """
    pre_trained = {}
    emb_invalid = 0
    try:
        with open(emb_file, 'r', encoding='utf-8', errors='ignore') as f:
            # Iterate the file lazily instead of f.readlines(): embedding
            # files are large, no need to hold every raw line in memory.
            for line in f:
                parts = line.rstrip().split()
                if len(parts) == word_dim + 1:
                    pre_trained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
                else:
                    emb_invalid += 1
        if emb_invalid:
            # Surface the skip count instead of silently discarding it.
            print('skipped {} invalid embedding lines'.format(emb_invalid))
        return pre_trained
    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: malformed float token.
        print(e)
        return {}
2.得到句向量
def get_sentence_vector(txt, dict_vector, emb_size):
    """Build a sentence vector by summing the vectors of its characters.

    Characters missing from ``dict_vector`` are ignored, so a fully
    out-of-vocabulary sentence yields the zero vector.

    :param txt: sentence (iterated character by character)
    :param dict_vector: mapping character -> embedding vector
    :param emb_size: dimensionality of the embeddings
    :return: ``np.ndarray`` of shape ``(emb_size,)``
    """
    txt_vector = np.zeros(emb_size)
    for ch in txt:
        # Single dict lookup; avoids the `ch in d.keys()` double lookup
        # and the broad try/except that returned '' (wrong type) on error.
        vector = dict_vector.get(ch)
        if vector is not None:
            txt_vector += vector
    return txt_vector
3.计算相似度
def get_similarity(txt1_vector, txt2_vector):
    """Cosine similarity between two sentence vectors.

    :param txt1_vector: first vector
    :param txt2_vector: second vector
    :return: cosine similarity in [-1, 1]; 0.0 when either vector is
             all-zero (e.g. a fully out-of-vocabulary sentence)
    """
    denom = norm(txt1_vector) * norm(txt2_vector)
    if denom == 0:
        # The original divided by zero here, producing nan plus a
        # RuntimeWarning; define the zero-vector case as similarity 0.0.
        return 0.0
    return np.dot(txt1_vector, txt2_vector) / denom
分别看一下用维基百科和BERT字向量计算相似度的效果。
BERT字向量:
维基百科字向量:
果然,计算相似度还是需要跑BERT模型再进行Fine-tuning,为了偷懒也可以这样用。哈哈~~
最后提供一下完整代码:
import numpy as np
from scipy.linalg import norm
def get_word_vector(emb_file, word_dim):
    """Load pre-trained character vectors from a text embedding file.

    Each valid line has the form ``<char> <v1> <v2> ... <v_word_dim>``.
    Lines whose token count is not ``word_dim + 1`` are skipped and counted.

    :param emb_file: path to the embedding file
    :param word_dim: dimensionality of each character vector
    :return: dict mapping character -> ``np.ndarray`` of shape ``(word_dim,)``
             (dtype float32); empty dict if the file cannot be read/parsed
    """
    pre_trained = {}
    emb_invalid = 0
    try:
        with open(emb_file, 'r', encoding='utf-8', errors='ignore') as f:
            # Iterate the file lazily instead of f.readlines(): embedding
            # files are large, no need to hold every raw line in memory.
            for line in f:
                parts = line.rstrip().split()
                if len(parts) == word_dim + 1:
                    pre_trained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
                else:
                    emb_invalid += 1
        if emb_invalid:
            # Surface the skip count instead of silently discarding it.
            print('skipped {} invalid embedding lines'.format(emb_invalid))
        return pre_trained
    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: malformed float token.
        print(e)
        return {}
def get_sentence_vector(txt, dict_vector, emb_size):
    """Build a sentence vector by summing the vectors of its characters.

    Characters missing from ``dict_vector`` are ignored, so a fully
    out-of-vocabulary sentence yields the zero vector.

    :param txt: sentence (iterated character by character)
    :param dict_vector: mapping character -> embedding vector
    :param emb_size: dimensionality of the embeddings
    :return: ``np.ndarray`` of shape ``(emb_size,)``
    """
    txt_vector = np.zeros(emb_size)
    for ch in txt:
        # Single dict lookup; avoids the `ch in d.keys()` double lookup
        # and the broad try/except that returned '' (wrong type) on error.
        vector = dict_vector.get(ch)
        if vector is not None:
            txt_vector += vector
    return txt_vector
def get_similarity(txt1_vector, txt2_vector):
    """Cosine similarity between two sentence vectors.

    :param txt1_vector: first vector
    :param txt2_vector: second vector
    :return: cosine similarity in [-1, 1]; 0.0 when either vector is
             all-zero (e.g. a fully out-of-vocabulary sentence)
    """
    denom = norm(txt1_vector) * norm(txt2_vector)
    if denom == 0:
        # The original divided by zero here, producing nan plus a
        # RuntimeWarning; define the zero-vector case as similarity 0.0.
        return 0.0
    return np.dot(txt1_vector, txt2_vector) / denom
if __name__ == '__main__':
    # Which embedding file to use; the vector dimension is inferred
    # from the filename ('bert' -> 768, 'wiki' -> 100).
    word_vector_file = 'wiki_100.txt'
    if 'bert' in word_vector_file:
        word_dim = 768
    elif 'wiki' in word_vector_file:
        word_dim = 100
    else:
        # Fail loudly: the original silently fell through with word_dim = 0,
        # which made every embedding line look invalid and produced empty
        # (all-zero) sentence vectors.
        raise ValueError('cannot infer vector dimension from file name: '
                         + word_vector_file)

    words_vector = get_word_vector(word_vector_file, word_dim)
    print(word_vector_file)

    # Rank candidate sentences by cosine similarity against the target.
    strings = [
        '我用小米手机',
        '有空出去玩吧',
        '休息日去哪里耍',
        '今晚准备吃什么',
        '我喜欢吃香蕉',
    ]
    target = '周末我们去哪里玩'
    target_vector = get_sentence_vector(target, words_vector, word_dim)
    for text2 in strings:
        text_vector2 = get_sentence_vector(text2, words_vector, word_dim)
        similar = get_similarity(target_vector, text_vector2)
        print(target, '====', text2, similar)