# 首先，建立自己的语料库 (Step 1: build your own corpus)
def ylk(x, path='D://listTwo.txt'):
    """Segment the text *x* with jieba and append the tokens to a corpus file.

    Each call appends one line to the corpus: the words of *x* separated
    by single spaces, terminated by a newline.

    Parameters
    ----------
    x : str
        Raw text to segment.
    path : str
        Corpus file to append to. The default keeps the original
        hard-coded location, so existing callers are unaffected.
    """
    seg = jieba.cut(x, cut_all=False)  # precise-mode segmentation
    # Append mode: repeated calls accumulate the corpus line by line.
    with open(path, 'a', encoding='utf-8') as f:
        for word in seg:
            f.write(word + " ")
        f.write('\n')
# 训练模型 (Step 2: train the word2vec model)
# Train a word2vec model on the segmented corpus and save it to disk.
from gensim.models.word2vec import LineSentence, Word2Vec
# Load the corpus: one pre-segmented, space-separated sentence per line.
sentences = LineSentence("D://listTwo.txt")
# min_count=1 keeps every word (no frequency pruning); iter=1000 sets the
# number of training epochs.
# NOTE(review): `iter` was renamed `epochs` in gensim 4.x — confirm the
# installed gensim version before upgrading.
model = Word2Vec(sentences, min_count=1, iter=1000)
model.save("D://w2v.mod")
# 比较相似 (Step 3: compare similarity)
# Reload the trained model and the corpus so sentences can be compared.
target = "D:/listTwo.txt"
model = "D://w2v.mod"
model_w2v = Word2Vec.load(model)
with open(target, encoding='utf-8') as f:
    # One token list per corpus line, ready for n_similarity comparisons.
    candidates = [line.strip().split() for line in f]
def xsd(text, threshold=None):
    """Find corpus lines comparable (similar) to *text*.

    Segments *text* with jieba, then scores it against every token list in
    the module-level ``candidates`` using ``model_w2v.n_similarity``.
    Candidates containing any word missing from the model vocabulary are
    skipped, since ``n_similarity`` would raise on them.

    Bug fixes vs. the original: the OOV ``flag`` was never reset between
    candidates and ``break`` aborted the whole scan at the first bad line —
    the comment said "skip this turn", so we now ``continue`` instead.

    Parameters
    ----------
    text : str
        Sentence to compare against the corpus.
    threshold : float or None
        If given, keep only candidates whose similarity score is
        >= threshold. None (the default) keeps every comparable candidate,
        preserving the original behavior.

    Returns
    -------
    tuple[int, list[int]]
        Number of matching lines and their indices in ``candidates``.
    """
    words = list(jieba.cut(text.strip()))  # tokenize the query sentence
    res = []
    for index, candidate in enumerate(candidates):
        # NOTE(review): `model_w2v.wv.vocab` is the gensim 3.x API; in
        # gensim 4.x use `model_w2v.wv.key_to_index` — confirm version.
        oov = [c for c in candidate if c not in model_w2v.wv.vocab]
        if oov:
            for c in oov:
                print("candidate word %s not in dict. skip this turn" % c)
            continue  # skip only this candidate, keep scanning the rest
        score = model_w2v.n_similarity(words, candidate)
        if threshold is None or score >= threshold:
            res.append(index)
    # Return how many corpus lines matched and which ones they are.
    return (len(res), res)