Import modules
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import jieba
import numpy as np

FILE_PATH = "./data/wiki_tiny.txt"
MODEL_PATH = 'word_vec.model'
Read the corpus file
def read_text(file_path):
    # Read up to 1000 pre-segmented lines; each line becomes a list of tokens
    sentences = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                sentences.append(line.strip().split(" "))
            if len(sentences) == 1000:
                break
    return sentences
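Note that read_text() assumes each line of wiki_tiny.txt has already been word-segmented, with tokens separated by single spaces. The fragment below only illustrates that expected format; the example sentence is made up and is not taken from the actual corpus.

# Illustration only: expected pre-segmented line format (one sentence per line)
example_line = "数学 是 研究 数量 结构 以及 空间 模型 的 学科\n"
print(example_line.strip().split(" ")[:3])  # ['数学', '是', '研究']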
Train the model
def train(sentences, model_path):
    # sg=1: skip-gram; size=100: 100-dim vectors (renamed to vector_size in gensim >= 4.0);
    # min_count=5: drop rare words; hs=1: hierarchical softmax; negative=3: negative sampling
    model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5,
                     negative=3, hs=1, workers=4)
    model.wv.save(model_path)   # save only the word vectors (a KeyedVectors object)
    return model.wv
Word vectors to sentence vector
def sentence2vec(sen, vecs):
    # Segment the sentence with jieba and average the word vectors of its tokens
    segment = list(jieba.cut(sen))
    sen2vec = np.zeros(100)
    for seg in segment:
        if seg in vecs:          # skip out-of-vocabulary words
            sen2vec += vecs[seg]
    return sen2vec / len(segment)
Cosine similarity
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
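As a quick check of the formula, cosine() returns 1.0 for parallel vectors and 0.0 for orthogonal ones; the toy vectors below are arbitrary.

# Toy check of cosine(): parallel vectors -> 1.0, orthogonal vectors -> 0.0
print(cosine(np.array([1.0, 2.0]), np.array([2.0, 4.0])))  # 1.0
print(cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 0.0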
Load a saved model
def load():
    # Reload the saved KeyedVectors from disk (no retraining needed)
    return KeyedVectors.load(MODEL_PATH)
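Once the vectors are saved, a later session can skip training and reload them with load(); the sketch below assumes word_vec.model was already written by a previous run, and the query sentence is just an example.

# Reuse sketch: assumes a previous run has already saved word_vec.model
wv = load()
vec = sentence2vec('办理银行卡', wv)   # sentence vector from the reloaded KeyedVectors
print(vec.shape)                        # (100,)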
Run the training
sentences = read_text(FILE_PATH)
model = train(sentences, MODEL_PATH)
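As an optional sanity check on the trained vectors, the KeyedVectors returned by train() can be queried directly; the word '银行' below is only an assumed example and must have survived the min_count filter.

# Optional sanity check: '银行' is an assumed in-vocabulary word
if '银行' in model:
    print(model.most_similar('银行', topn=5))  # 5 nearest words by cosine similarity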
Test
sen1 = '办理银行卡'
sen2 = '办理储蓄卡'
vec1 = sentence2vec(sen1, model)
vec2 = sentence2vec(sen2, model)
sim = cosine(vec1, vec2)
print('sim:', sim)
Output:
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\heitao\AppData\Local\Temp\jieba.cache
Loading model cost 0.776 seconds.
Prefix dict has been built succesfully.
sim: 0.9563070869153697