from gensim.models import Word2Vec
import pickle
import multiprocessing
class MySentences(object):
    """Re-iterable stream of sentences backed by a pickled document set.

    The pickle file is loaded once, in the constructor, and kept in memory as
    ``docs_set``. Iterating the instance walks every element of ``docs_set``
    and yields each of that element's items in order. Because ``__iter__``
    starts a fresh pass every time, the object can be iterated repeatedly.

    NOTE(review): presumably ``docs_set`` is a list of documents, each a list
    of tokenised sentences (lists of word strings) -- confirm against the
    code that produces the pickle.
    """

    def __init__(self, filename):
        """Load the pickled document set stored at ``filename``.

        Args:
            filename: Path to the pickle file to read.
        """
        # SECURITY: unpickling can execute arbitrary code embedded in the
        # file. Only point this at pickle files you created or fully trust.
        with open(filename, 'rb') as f:
            self.docs_set = pickle.load(f)

    def __iter__(self):
        """Yield every sentence of every document, flattening one level."""
        for doc in self.docs_set:
            for sent in doc:
                yield sent
def training_workers(reserved_cores=4):
    """Return how many Word2Vec worker threads to start.

    Keeps ``reserved_cores`` CPU cores free for the rest of the system, but
    never returns fewer than one worker: on a machine with
    ``reserved_cores`` cores or fewer the plain subtraction would give zero
    or a negative thread count.

    Args:
        reserved_cores: Number of cores to leave unused.

    Returns:
        ``multiprocessing.cpu_count() - reserved_cores``, clamped to >= 1.
    """
    return max(1, multiprocessing.cpu_count() - reserved_cores)


if __name__ == "__main__":
    # Already word-segmented document collection, stored as a 3-level nested
    # list: document set -> document -> sentence.
    file_name = './data/docs_set1.pkl'
    print('start training!')
    sentences = MySentences(file_name)
    model = Word2Vec(
        sentences,
        # NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 names
        # it `vector_size`. Confirm which gensim version this runs against.
        size=50,
        window=6,
        min_count=2,
        workers=training_workers(),
        hs=1,
        sg=1,
        negative=10,
    )
    # Save the trained model. The target must be a file, not a directory:
    # the previous './models/' path could not be opened for writing.
    # The './models' directory is expected to exist already.
    model.save('./models/word2vec.model')
    # Write the trained word-embedding vectors in word2vec binary format.
    # To read them back from this file, use gensim's KeyedVectors.
    model.wv.save_word2vec_format('./data/docs_features.txt', binary=True)
    print("Done!")
参考: