NLP学习——文本相似度计算

导入模块

from gensim.models import KeyedVectors, Word2Vec
import jieba
import numpy as np
# Path to the whitespace-tokenized training corpus (one sentence per line).
FILE_PATH = "./data/wiki_tiny.txt"
# Path where train() saves the learned word vectors (KeyedVectors).
MODEL_PATH = 'word_vec.model'

读取文件

def read_text(FILE_PATH):
    """Read pre-tokenized sentences from a UTF-8 text file.

    Each non-empty line is split on single spaces into a token list.
    Reading stops after 1000 sentences have been collected.

    Args:
        FILE_PATH: path to a UTF-8 file with space-separated tokens per line.

    Returns:
        list[list[str]]: up to 1000 token lists.
    """
    sentences = []
    with open(FILE_PATH, encoding="utf-8") as f:
        # Iterate the file lazily instead of f.readlines(): avoids loading
        # the whole file into memory when only the first 1000 lines matter.
        for line in f:
            stripped = line.strip()  # strip once, not twice as before
            if stripped:
                sentences.append(stripped.split(" "))
            if len(sentences) == 1000:
                break
    return sentences

模型训练

def train(sentences, MODEL_PATH):
    """Train a skip-gram Word2Vec model and persist its word vectors.

    Args:
        sentences: iterable of token lists (one list per sentence).
        MODEL_PATH: destination path for the saved vectors.

    Returns:
        The trained model's KeyedVectors (also saved to MODEL_PATH).
    """
    w2v = Word2Vec(
        sentences,
        sg=1,        # skip-gram architecture
        size=100,    # embedding dimensionality (gensim 3.x keyword)
        window=5,
        min_count=5,
        negative=3,
        hs=1,        # hierarchical softmax enabled alongside negative sampling
        workers=4,
    )
    w2v.wv.save(MODEL_PATH)
    return w2v.wv

词向量合成句向量（对词向量取平均）

def sentence2vec(sen, vecs, dim=100):
    """Build a sentence vector by averaging the word vectors of its tokens.

    The sentence is segmented with jieba; each token's vector is summed and
    the sum is divided by the total token count (out-of-vocabulary tokens
    contribute zeros but still count in the denominator, matching the
    original behavior).

    Args:
        sen: sentence string to embed.
        vecs: token-to-vector mapping (e.g. gensim KeyedVectors).
        dim: vector dimensionality; must match the model (default 100,
            the previously hard-coded value).

    Returns:
        numpy.ndarray of shape (dim,): the averaged sentence vector, or an
        all-zero vector when segmentation yields no tokens (avoids the
        ZeroDivisionError the original raised on empty input).
    """
    tokens = list(jieba.cut(sen))
    sen_vec = np.zeros(dim)
    if not tokens:
        # Empty / unsegmentable sentence: return zeros instead of dividing by 0.
        return sen_vec
    for tok in tokens:
        try:
            sen_vec += vecs[tok]
        except KeyError:
            # Out-of-vocabulary token: skip it. Was a bare `except: pass`,
            # which would also have hidden real errors (e.g. shape mismatch).
            pass
    return sen_vec / len(tokens)

余弦相似度计算

def cosine(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``."""
    dot = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot / norm_product

读取模型

def load(model_path=MODEL_PATH):
    """Load previously saved word vectors from disk.

    Fixes two defects in the original:
    - ``KeyedVectors`` was never imported, so calling this raised NameError
      (only ``Word2Vec`` was imported at the top of the file).
    - The path was hard-coded to ``'./data/word_vec.model'``, which is not
      where ``train()`` saves (it writes to ``MODEL_PATH``).

    Args:
        model_path: path to the saved KeyedVectors file; defaults to
            MODEL_PATH, the path ``train()`` writes to.

    Returns:
        The loaded gensim KeyedVectors.
    """
    return KeyedVectors.load(model_path)

模型训练

# Read the tokenized corpus (up to 1000 sentences).
sentences = read_text(FILE_PATH)
# Train word vectors and save them to MODEL_PATH; `model` is the KeyedVectors.
model = train(sentences,MODEL_PATH)

测试

# Two semantically close banking queries ("apply for a bank card" /
# "apply for a debit card") used as a similarity sanity check.
sen1 = '办理银行卡'
sen2 = '办理储蓄卡'
vec1 = sentence2vec(sen1, model)# sentence vector of the first sentence
vec2 = sentence2vec(sen2, model)# sentence vector of the second sentence

sim = cosine(vec1, vec2)# cosine similarity between the two sentence vectors
print('sim: ', sim)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\heitao\AppData\Local\Temp\jieba.cache
Loading model cost 0.776 seconds.
Prefix dict has been built succesfully.


sim:  0.9563070869153697
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值