"""Incremental (continued) training of an existing word2vec model.

Usage:
    python word2vec_add.py patent.model new_patent.txt patent_new.model patent_new.vector
"""
import logging
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def parse_args(argv):
    """Extract the four path arguments from a command line.

    Args:
        argv: Full argument vector, with the program name at index 0.

    Returns:
        A ``(model_path, inp, output_model, output_txt)`` tuple, or ``None``
        when fewer than four arguments follow the program name. Any
        arguments beyond the fourth are ignored.

        * ``model_path``   -- existing model to load (e.g. patent.model)
        * ``inp``          -- additional, already tokenized text, one
          sentence per line (e.g. new_patent.txt)
        * ``output_model`` -- where the updated model is saved
          (e.g. patent_new.model)
        * ``output_txt``   -- where the word vectors are saved in text
          format (e.g. patent_new.vector)
    """
    if len(argv) < 5:
        return None
    # The slice must be [1:5]: [1:4] yields only three values and the
    # four-name unpack below would raise ValueError on every run.
    model_path, inp, output_model, output_txt = argv[1:5]
    return model_path, inp, output_model, output_txt


def main(argv):
    """Continue training a saved Word2Vec model on new sentences.

    Loads the model, extends its vocabulary with the new corpus, trains on
    that corpus for 5 epochs, then writes the updated model and its word
    vectors (text format) to the requested paths.

    Args:
        argv: Full argument vector, with the program name at index 0.

    Returns:
        Process exit status: 0 on success, 1 when arguments are missing.
    """
    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info('running %s', ' '.join(argv))

    paths = parse_args(argv)
    if paths is None:
        # Prefer the module docstring as the usage text, but fall back to a
        # built-in message: applying % to a missing (None) docstring raises
        # TypeError.
        doc = globals().get('__doc__')
        if doc:
            print(doc % locals())
        else:
            print('usage: python %s <model_path> <inp> <output_model> <output_txt>'
                  % program)
        # Stop here instead of falling through to the training steps.
        return 1

    model_path, inp, output_model, output_txt = paths

    sentences = LineSentence(inp)
    model = Word2Vec.load(model_path)
    # update=True adds the new corpus' words to the existing vocabulary.
    model.build_vocab(sentences, update=True)
    # Use the sentence count gensim recorded while scanning the new corpus
    # rather than a hard-coded guess, so learning-rate decay and progress
    # reporting match the data actually being trained on.
    model.train(sentences, total_examples=model.corpus_count, epochs=5)
    model.save(output_model)
    model.wv.save_word2vec_format(output_txt, binary=False)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))