因为从中文维基百科中取得的语料库大约有 11 GB,在使用 gensim 进行训练时,训练进程经常因内存不足被系统 killed,最后想到把语料库文件切割成多个较小的文件,然后再训练。具体代码如下:
import gensim
import time
import os
import glob
# Split the corpus into multiple smaller files so gensim training
# does not get OOM-killed on the full ~11 GB corpus.
def split_corpus(corpus_path, out_dir, size_per_file=100 * 1024 * 1024):
    """Split the text file at *corpus_path* into chunks of roughly
    *size_per_file* bytes (default 100 MB; the original comment said
    500M but the constant was always 100 MB), written to *out_dir* as
    ``zhwiki_simplified_yh_<n>.txt``.

    Each chunk is extended to the next newline so no sentence is ever
    cut in half at a chunk boundary — the files are later consumed
    line-by-line by ``LineSentence``, and a split mid-line would
    corrupt tokens on both sides of the boundary.

    Returns the number of chunk files written.
    """
    count = 0
    with open(corpus_path, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(size_per_file)
            if not chunk:
                break
            # Finish the current line so a sentence is never split
            # across two chunk files (readline returns '' at EOF).
            chunk += f.readline()
            file_path = os.path.join(
                out_dir, 'zhwiki_simplified_yh_{}.txt'.format(count))
            with open(file_path, 'w', encoding='utf-8') as out:
                out.write(chunk)
            count += 1
            print('加载文件 {}...'.format(count))
    return count


if __name__ == '__main__':
    split_corpus(
        '/mnt/e/ProLearn/NLP/wiki_nlp_data/zhwiki_simplified_yh.txt',
        '/mnt/e/ProLearn/NLP/wiki_nlp_data/txt_part',
    )
# Stream the split corpus files and train a FastText model.
# BUG FIX: `gensim.utils.ConcatenatedCorpus` does not exist in gensim —
# the original line raised AttributeError.  PathLineSentences streams
# every file in the directory line by line, and — unlike a one-shot
# iterator — can be iterated multiple times, which is required because
# both build_vocab() and train() make a full pass over the corpus.
corpus_dir = '/mnt/e/ProLearn/NLP/wiki_nlp_data/txt_part'
sentences = gensim.models.word2vec.PathLineSentences(corpus_dir)
# Skip-gram (sg=1) FastText: 100-dim vectors, context window 3,
# discard words seen fewer than 5 times.
model = gensim.models.FastText(vector_size=100, window=3, min_count=5, sg=1)
model.build_vocab(sentences)
# Train the model and report elapsed wall-clock time.
print('开始训练...')
start_time = time.time()
model.train(sentences, total_examples=model.corpus_count, epochs=5,
            report_delay=30)
print('已经训练了 {} 秒.'.format(time.time() - start_time))
# Persist the trained model (full gensim format, reloadable with .load()).
print('保存模型...')
model.save('/mnt/e/ProLearn/NLP/wiki_nlp_data/fasttext_model.bin')
print('已保存')
# Reload the saved model and run a quick sanity check.
print('加载模型...')
model = gensim.models.FastText.load('/mnt/e/ProLearn/NLP/wiki_nlp_data/fasttext_model.bin')
print('模型已加载.')
# Vector for a single word (FastText can also compose vectors for
# out-of-vocabulary words from character n-grams).
vec = model.wv['单词']
# Nearest neighbours by cosine similarity: list of (word, score) pairs.
similar_words = model.wv.most_similar('单词')
# BUG FIX: the original did `"测试结果:" + similar_words`, which is
# str + list and raises TypeError; format the list into the string.
print("测试结果:{}".format(similar_words))