def word2vec_train(text, voc_dim):
    """Train a Word2Vec model on *text* and save the vectors to disk.

    Parameters
    ----------
    text : iterable of list[str]
        Training corpus. gensim expects an iterable of tokenized
        sentences, i.e. each sentence as a list of word tokens.
    voc_dim : int
        Dimensionality of the learned word vectors.

    Side effects
    ------------
    Writes the vectors in plain-text word2vec format to
    '../0.model/myvector-7.vector'.
    """
    min_out = 2       # drop words that occur fewer than 2 times
    window_size = 4   # context window on each side of the target word
    cpu_count = multiprocessing.cpu_count()  # parallelize across all cores
    model = Word2Vec(vector_size=voc_dim,
                     min_count=min_out,
                     window=window_size,
                     workers=cpu_count,
                     epochs=70)
    model.build_vocab(text)  # build the vocabulary from the corpus
    # The constructor received no corpus, so training must be run explicitly.
    model.train(text, total_examples=model.corpus_count, epochs=70)
    # binary=False -> human-readable text format
    model.wv.save_word2vec_format('../0.model/myvector-7.vector', binary=False)
# Load both dataset splits so the word vectors cover every word that can
# appear at train or validation time.
data_path = "../0.data/train-7.csv"
data = pd.read_csv(data_path)
test_path = "../0.data/val-7.csv"
test_data = pd.read_csv(test_path)

# Build the Word2Vec corpus as a list of tokenized sentences.
# NOTE(review): the original flattened everything into a de-duplicated flat
# word list; gensim iterates each "sentence", so a bare string would be
# consumed character by character, training character vectors instead of
# word vectors. Passing token lists fixes that (and keeps real word
# frequencies, which min_count relies on).
text = [line.split() for line in data['text'].tolist()]
text += [line.split() for line in test_data['text'].tolist()]

embedding_dim = 128
word2vec_train(text, embedding_dim)