# 自然语言处理的数据量通常很大。当数据集已经无法完整装进内存时,我们就需要分批(增量)构建 word2vec 模型。代码如下:
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
WORK_PATH = '.'
class Word2VecTrainingMaster():
    """Incrementally train a gensim Word2Vec model over a corpus too large for memory.

    The corpus file is streamed line by line and fed to the model in batches of
    ``batch_size`` tokenized sentences, so the whole file is never held in RAM.
    Each line of the corpus is expected to be one sentence of whitespace-separated
    tokens. The model is saved to ``work_path/base_name`` after construction and
    again after training completes (and optionally after every batch).
    """

    def __init__(self, new_work=True, corpus_path='', work_path='.', base_name='word2vec.model',
                 num_features=100, min_word_count=1, context=4, auto=True,
                 batch_size=100000, step_save=False):
        """
        Args:
            new_work: Unused; kept for interface compatibility with existing callers.
            corpus_path: Path to the corpus file (one whitespace-tokenized sentence per line).
            work_path: Directory in which the model file is saved.
            base_name: File name for the saved model.
            num_features: Dimensionality of the word vectors.
            min_word_count: Minimum token frequency to keep in the vocabulary.
            context: Context window size.
            auto: If True, train over the whole corpus immediately.
            batch_size: Number of corpus lines per training batch.
            step_save: If True, save the model after every batch.
        """
        self.work_path = work_path
        self.corpus_path = corpus_path
        self.base_name = base_name
        self.step_save = step_save
        self.word2vec_model = self.make_word2vec_model(num_features, min_word_count, context)
        # Save the fresh (untrained) model so work_path/base_name exists from the start.
        self.word2vec_model.save(self.work_path + '/' + self.base_name)
        if auto:
            init = True
            # Stream batches instead of readlines(): the original loaded the whole
            # corpus into memory, defeating the point of batched training.
            for batch in tqdm(self._iter_batches(batch_size)):
                self.update_model(batch, init=init)
                init = False
            self.word2vec_model.save(self.work_path + '/' + self.base_name)

    def _iter_batches(self, batch_size):
        """Yield lists of tokenized sentences, at most ``batch_size`` lines each.

        Reads the corpus lazily so memory usage is bounded by one batch.
        Lines are split on whitespace into token lists (gensim expects each
        sentence as a list of tokens; passing a raw string would make it
        iterate over individual characters). Blank lines are skipped.
        """
        batch = []
        with open(self.corpus_path) as f:
            for line in f:
                tokens = line.split()
                if tokens:
                    batch.append(tokens)
                if len(batch) >= batch_size:
                    yield batch
                    batch = []
        if batch:  # trailing partial batch
            yield batch

    def make_word2vec_model(self, num_features, min_word_count, context):
        """Build an empty Word2Vec model with the requested hyperparameters."""
        try:
            # gensim < 4 spelling.
            return Word2Vec(size=num_features, min_count=min_word_count, window=context)
        except TypeError:
            # gensim >= 4 renamed ``size`` to ``vector_size``.
            return Word2Vec(vector_size=num_features, min_count=min_word_count, window=context)

    def update_model(self, batch, init):
        """Grow the vocabulary with ``batch`` and run one training pass over it.

        Args:
            batch: List of sentences, each a list of tokens.
            init: True only for the first batch; subsequent batches must use
                ``update=True`` so the existing vocabulary is extended, not replaced.
        """
        if init:
            self.word2vec_model.build_vocab(batch)
        else:
            self.word2vec_model.build_vocab(batch, update=True)
        # ``model.iter`` is gensim < 4; gensim >= 4 uses ``model.epochs``.
        epochs = getattr(self.word2vec_model, 'epochs', None)
        if epochs is None:
            epochs = self.word2vec_model.iter
        # total_examples is the size of this batch, since we train on it alone.
        self.word2vec_model.train(batch, total_examples=len(batch), epochs=epochs)
        if self.step_save:
            self.word2vec_model.save(self.work_path + '/' + self.base_name)
if __name__ == '__main__':
    # Batch-train 300-dimensional vectors over lang.txt; the model is saved under WORK_PATH.
    trainer = Word2VecTrainingMaster(
        corpus_path='lang.txt',
        num_features=300,
        work_path=WORK_PATH,
    )