1115 Sentence Similarity from Word Vectors

First train a model with word2vec and save it to disk; after that, the model only needs to be loaded back for inference.

# Example inputs:
#   我喜欢吃麻辣烫
#   我不喜欢吃麻辣烫
# Approach 1: pool all the word vectors directly, then take cosine similarity
#   小明 喜欢 谁 ?          4*100 --> 1*100   # average the vectors, then compare
#   我 想 知道 小明 倾慕 谁  6*100 --> 1*100
# Approach 2: extract keywords with jieba first, then pool and compare
#   jieba:  小明 喜欢      ---> 2*100 ---> 1*100
#           知道 小明 倾慕  ---> 3*100 ---> 1*100
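A minimal sketch of approach 2, assuming the word2vec.model file produced by the training code below; the keyword_embedding helper and the out-of-vocabulary filtering are my own additions for illustration, not part of the original script.

# Sketch of approach 2: keyword extraction, then mean pooling (illustrative helper).
import gensim
import numpy as np
import jieba.analyse as aly
from sklearn.metrics.pairwise import cosine_similarity

def keyword_embedding(text, wv, topk=5):
    # Keep only keywords the model actually knows; otherwise wv[w] raises KeyError.
    words = [w for w in aly.extract_tags(text, topK=topk) if w in wv]
    if not words:
        return np.zeros((1, wv.vector_size))        # fallback: no known keywords
    return np.mean([wv[w] for w in words], axis=0, keepdims=True)  # k*100 -> 1*100

model = gensim.models.Word2Vec.load("word2vec.model")
emb1 = keyword_embedding("我想知道小明倾慕谁", model.wv)
emb2 = keyword_embedding("小明喜欢谁", model.wv)
print(cosine_similarity(emb1, emb2)[0][0])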

# word2vec
import jieba
import os
import gensim
import logging
import numpy as np
import jieba.analyse as aly
from sklearn.metrics.pairwise import cosine_similarity

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def split_data(file="train_data.txt"):
    # Each raw line is "text\tlabel"; tokenize the text with jieba and cache
    # the space-joined result so word2vec training can reuse it.
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")

    result = []
    for data in all_data:
        ds = data.split("\t")
        if len(ds) != 2:        # skip malformed lines without a label
            continue
        result.append(jieba.lcut(ds[0]))

    out_path = os.path.join("..", "data", "文本分类", "train_data_split.txt")
    if not os.path.exists(out_path):
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(" ".join(words) for words in result))

    return result

def read_data(file):
    # Load the cached corpus: one space-separated, pre-tokenized sentence per line.
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    return [line.split(" ") for line in all_data if line]   # drop empty lines

def train_word2vec():
    # split_data(file=os.path.join("..", "data", "文本分类", "train.txt"))  # run once to build the cache
    all_data = read_data(os.path.join("..", "data", "文本分类", "train_data_split.txt"))

    # CBOW (sg=0) with hierarchical softmax (hs=1); 100-dim vectors, window of 7.
    # min_count=1 keeps every word, so lookups at inference time rarely miss.
    word_2_model = gensim.models.Word2Vec(all_data, vector_size=100, window=7, min_count=1, sg=0, hs=1)
    word_2_model.save("word2vec.model")
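After training, the saved model can be sanity-checked by querying nearest neighbours. A hedged sketch; the query word 小明 is only illustrative and may not appear in your corpus, hence the guard:

# Quick sanity check of a freshly trained model (the query word is illustrative).
model = gensim.models.Word2Vec.load("word2vec.model")
if "小明" in model.wv:                        # the word may be absent from this corpus
    print(model.wv.most_similar("小明", topn=5))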




if __name__ == "__main__":
    # train_word2vec()  # run once; afterwards the saved model is just loaded
    jieba.initialize()
    word2vec = gensim.models.Word2Vec.load("word2vec.model")

    while True:
        # text1 = "中华女子学院:本科层次仅1专业招男生"
        # text2 = "3000点之下是买入好时机"

        text1 = input("Enter sentence 1: ")
        text2 = input("Enter sentence 2: ")

        # Approach 2 would use keyword extraction here instead:
        # text1_s = aly.extract_tags(text1)
        # text2_s = aly.extract_tags(text2)

        text1_s = jieba.lcut(text1)
        text2_s = jieba.lcut(text2)

        # Skip words the model never saw; otherwise wv[i] raises KeyError.
        text1_emb = np.array([word2vec.wv[i] for i in text1_s if i in word2vec.wv])
        text2_emb = np.array([word2vec.wv[i] for i in text2_s if i in word2vec.wv])
        if text1_emb.size == 0 or text2_emb.size == 0:
            print("No in-vocabulary words in one of the sentences; try again.")
            continue

        # Pool the n*100 word vectors into a single 1*100 sentence vector;
        # mean pooling is the more common choice, max pooling is used here.
        # sent1_emb = np.mean(text1_emb, axis=0, keepdims=True)
        # sent2_emb = np.mean(text2_emb, axis=0, keepdims=True)
        sent1_emb = np.max(text1_emb, axis=0, keepdims=True)
        sent2_emb = np.max(text2_emb, axis=0, keepdims=True)

        sim_score = cosine_similarity(sent1_emb, sent2_emb)
        print(f"Similarity: {sim_score[0][0]:.3f}")







