Convert the text into numerical feature vectors.
– Word Embedding(Word2Vec)
The word2vec algorithms include the skip-gram and CBOW models, trained with either hierarchical softmax or negative sampling. See Tomas Mikolov et al., "Efficient Estimation of Word Representations in Vector Space", and Tomas Mikolov et al., "Distributed Representations of Words and Phrases and their Compositionality".
上次我们使用 TF-IDF 进行 vectorization,这次使用 Word2Vec。
代码实现:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test CSV files, using the 'id' column as the index.
path = '~/new_data/'
train_all = pd.read_csv(path + 'train_set.csv', index_col='id')
test_data = pd.read_csv(path + 'test_set.csv', index_col='id')

# Hold out 10% of the training rows for validation (9:1 split, fixed seed).
features = train_all[['article', 'word_seg']]
labels = train_all['class']
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=0,
)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
#word2vec
from gensim.models import Word2Vec
# Dimensionality of the word vectors to train.
vector_size = 100


def sentence2list(sentence: str) -> list:
    """Split a whitespace-separated sentence into a list of tokens.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        The tokens in their original order; an empty list when the text is
        empty or contains only whitespace.
    """
    # split() with no separator already ignores leading/trailing whitespace,
    # so a preceding strip() is unnecessary.
    return sentence.split()
# Tokenise the 'word_seg' column of both splits; the embeddings are trained
# on the training and validation sentences together.
print("准备数据................ ")
sentences_train = [sentence2list(text) for text in X_train['word_seg']]
sentences_valid = [sentence2list(text) for text in X_valid['word_seg']]
sentences = sentences_train + sentences_valid
print("准备数据完成! ")

# Train the word2vec model (sg=1 selects skip-gram).
# NOTE: `size` and `iter` are the gensim 3.x keyword names.
print("开始训练................ ")
model = Word2Vec(
    sentences=sentences,
    size=vector_size,
    window=5,
    min_count=5,
    workers=8,
    sg=1,
    iter=5,
)
print("训练完成! ")
# Extract the vocabulary and its vectors from the trained model and pickle them.
import os
import pickle
import numpy as np

print(" 保存训练结果........... ")
wv = model.wv
# NOTE: `index2word` is the gensim 3.x attribute name (gensim 4 renamed it to
# `index_to_key`).
vocab_list = wv.index2word

# Row 0 of the saved matrix is reserved for the all-zero 'unk' vector, so the
# word at vocabulary position i lives at row i + 1. Word indices therefore
# start at 1 to stay aligned with the shifted matrix.
word_idx_dict = {word: idx for idx, word in enumerate(vocab_list, start=1)}
unk_vector = np.zeros((1, vector_size))
vectors_arr = np.concatenate((unk_vector, wv.vectors), axis=0)

# The built-in open() does not expand '~', so resolve the home directory
# explicitly and make sure the output directory exists before writing.
feature_path = os.path.expanduser('~/new_data/feature_file/')
os.makedirs(feature_path, exist_ok=True)

# Context managers close each file even if pickling raises.
with open(feature_path + 'word_seg_word_idx_dict.pkl', 'wb') as f_wordidx:
    pickle.dump(word_idx_dict, f_wordidx)
with open(feature_path + 'word_seg_vectors_arr.pkl', 'wb') as f_vectors:
    pickle.dump(vectors_arr, f_vectors)
print("训练结果已保存到该目录下! ")