import json
import codecs
# Round-trip a small sample record through a JSON file on disk and print
# what was read back.
d = dict(name=['书', '书包', '书本'], age=[20, 77, 9], score=[6, 8, 88])

# json.dump is called with its defaults, so non-ASCII text is written as
# \uXXXX escapes; the explicit encoding only makes the write independent of
# the platform's default text encoding.
with open('./sentence.txt', 'w', encoding='utf-8') as f:
    json.dump(d, f)

# The file is read back in binary mode; the context manager guarantees the
# handle is closed (the previous version never closed it).
with open('./sentence.txt', 'rb') as g:
    e = json.load(g)
print(e)
from gensim.models import Word2Vec
# Read the corpus into `sss`: one sentence per line, each sentence split on
# single spaces into a list of tokens.
sss = []
# codecs.open decodes the file as UTF-8 instead of the platform default.
with codecs.open('./sentence.txt', 'r', 'utf-8') as f:
    for line in f:
        # Strip the newline and trailing whitespace (operates on str).
        ss = line.replace('\n', '').rstrip()
        if not ss:
            # A blank line is not end-of-file: skip it rather than stop, so
            # the sentences after it are still read.
            continue
        sss.append(ss.split(" "))
# No explicit f.close(): the `with` block has already closed the file.
# Train a skip-gram (sg=1) Word2Vec model on the tokenised sentences in `sss`,
# save it, reload it, and query it.
# NOTE(review): `size=` and `model.iter` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` and `model.epochs` — confirm the installed
# version before upgrading.
# NOTE(review): min_count is left at the library default; presumably words
# rarer than that are dropped from the vocabulary — verify for tiny corpora.
model = Word2Vec(size=50, window=2, sg=1)
# The vocabulary must be built before train(); otherwise gensim raises
# "RuntimeError: you must first build vocabulary before training the model".
model.build_vocab(sss)
model.train(sss, total_examples=model.corpus_count, epochs=model.iter)

# Use one path for both save and load so the reload reads the file that was
# just written (the load previously pointed at a different file name).
model_path = './gensim_w2v_sg0_model'
model.save(model_path)

# Reload the saved model through the already-imported Word2Vec class; the
# bare name `gensim` is not imported in this file.
new_model = Word2Vec.load(model_path)

# Word vectors are queried through `.wv`. Print the words most similar to
# '书本' together with their similarity scores.
sim_words = new_model.wv.most_similar(positive=['书本'])
for word, similarity in sim_words:
    print(word, similarity)

# Print the vector of '书桌'; raises KeyError if the word is not in the
# vocabulary.
print(model.wv['书桌'])
编码时遇到了一些错误,在这里记录一下:
① f = codecs.open('./sentence.txt', 'r', 'utf-8')
open(file) 在中文 Windows 上默认使用 GBK 编码(取决于系统区域设置),所以要显式注明用 UTF-8 来读文件。
RuntimeError: you must first build vocabulary before training the model
训练之前必须先调用 build_vocab() 为语料中的(中文)词建立词库。
② s1 = ss.split(" ".encode(encoding='utf-8'))
TypeError: must be str or None, not bytes
ss 是 str,split 的分隔符也必须是 str(或 None),不能传入 bytes。
③ g = open('D:\Download\code\w2v\sentence.txt', 'rb', 'utf-8')
TypeError: an integer is required (got type str)
open 的第三个位置参数是 buffering(整数),不是编码;而且以二进制模式读取时不能指定 utf-8 编码转换。
改为:
with codecs.open('./sentence.txt', 'r', 'utf-8') as f:
    sss = []
    while True:
        ss = f.readline().replace('\n', '').rstrip()  # 对str才能操作
        if ss == '':
            break
        s1 = ss.split(" ")
        sss.append(s1)
f.close()