# First train a word2vec model, save it to disk, then load it back for use.
# 我喜欢吃麻辣烫
# 我不喜欢吃麻辣烫
# 思路一:直接用全部词向量求余弦相似度
# 小明 喜欢 谁 ? 4*100 --> 1*100 #求平均,然后去求余弦相似度
# 我 想 知道 小明 倾慕 谁 6*100 --> 1*100
#还有一种思路,通过jieba分词整出来关键字,然后求余弦相似度
# jieba 小明 喜欢 ---> 2*100 ----> 1*100
# 知道 小明 倾慕 ---> 3*100 ----> 1*100
# word2vec
import jieba
import os
import gensim
import logging
import numpy as np
import jieba.analyse as aly
from sklearn.metrics.pairwise import cosine_similarity
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def split_data(file="train_data.txt"):
    """Read tab-separated lines from *file* and segment the text column with jieba.

    Each valid line has exactly two tab-separated fields; the first field is
    tokenized.  On first run the tokenized corpus is also cached to
    ../data/文本分类/train_data_split.txt (space-joined tokens, one line per text).

    Args:
        file: path to the raw tab-separated training file.

    Returns:
        list[list[str]]: one token list per valid input line.
    """
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    result = []
    for data in all_data:
        ds = data.split("\t")
        # Skip malformed lines (blank lines, missing/extra tabs).
        if len(ds) != 2:
            continue
        result.append(jieba.lcut(ds[0]))
    cache_path = os.path.join("..", "data", "文本分类", "train_data_split.txt")
    if not os.path.exists(cache_path):
        with open(cache_path, "w", encoding="utf-8") as f:
            # BUG FIX: join into a local generator instead of reassigning
            # `result`, so the return type is always list[list[str]]
            # regardless of whether the cache file already existed.
            f.write("\n".join(" ".join(tokens) for tokens in result))
    return result
def read_data(file):
    """Load a pre-segmented corpus: one text per line, tokens separated by spaces.

    Args:
        file: path to the space-joined, line-per-text corpus file.

    Returns:
        list[list[str]]: token lists, one per line of the file.
    """
    with open(file, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return [line.split(" ") for line in raw.split("\n")]
def train_word2vec():
    """Train a word2vec model on the pre-segmented corpus and save it to disk.

    Reads ../data/文本分类/train_data_split.txt, trains a 100-dim CBOW model
    (sg=0) with hierarchical softmax (hs=1), window 7, keeping every token
    (min_count=1), and writes the model to ./word2vec.model.
    """
    corpus = read_data(os.path.join("..", "data", "文本分类", "train_data_split.txt"))
    model = gensim.models.Word2Vec(
        corpus,
        vector_size=100,
        window=7,
        min_count=1,
        sg=0,
        hs=1,
    )
    model.save("word2vec.model")
if __name__ == "__main__":
    # train_word2vec()  # run once beforehand to produce word2vec.model
    jieba.initialize()
    word2vec = gensim.models.Word2Vec.load("word2vec.model")
    # Interactive loop: read two sentences, embed each as one vector,
    # and report their cosine similarity.
    while True:
        text1 = input("请输入语句1:")
        text2 = input("请输入语句2:")
        text1_s = jieba.lcut(text1)
        text2_s = jieba.lcut(text2)
        # BUG FIX: the original `word2vec.wv[i]` raised KeyError for any
        # token absent from the training vocabulary; skip OOV tokens.
        text1_emb = np.array([word2vec.wv[w] for w in text1_s if w in word2vec.wv])
        text2_emb = np.array([word2vec.wv[w] for w in text2_s if w in word2vec.wv])
        # BUG FIX: np.max over an empty array raises; bail out when no
        # token of a sentence is in the vocabulary.
        if text1_emb.size == 0 or text2_emb.size == 0:
            print("句子中没有模型认识的词,无法计算相似度")
            continue
        # Max-pool token vectors into a single sentence vector;
        # keepdims=True yields shape (1, 100) as cosine_similarity expects 2-D.
        sent1_emb = np.max(text1_emb, axis=0, keepdims=True)
        sent2_emb = np.max(text2_emb, axis=0, keepdims=True)
        sim_score = cosine_similarity(sent1_emb, sent2_emb)
        print(f"相似度:{sim_score[0][0]:.3f}")