#encoding:utf-8
import os
import logging
import re
import time
import codecs
import jieba
from gensim.models import word2vec
# Input corpus path; the sentence iterators in this file read it line by line.
file_corpus='data/file_sentence.txt'
# Output path where the trained vectors are saved in word2vec text format.
file_voc='data/voc_char.txt'
# Not referenced in the visible part of this script; presumably the output
# path for word-level (jieba-segmented) vectors -- TODO confirm.
file_voc2='data/word_char.txt'
class CharSentences(object):
    """Re-iterable stream of character-level sentences read from a text file.

    Every line of the corpus is treated as one sentence. The line is stripped
    of surrounding whitespace and split into its individual characters.
    The file is reopened on each iteration, so the object can be iterated
    several times (e.g. once per training pass).
    """

    def __init__(self, file_corpus):
        """Remember the corpus path; the file is not opened until iteration.

        Args:
            file_corpus: Path to a UTF-8 encoded text file, one sentence
                per line.
        """
        self.file_corpus = file_corpus

    def __iter__(self):
        """Yield each corpus line as a list of single-character strings.

        A line that is empty after stripping yields an empty list.
        """
        with codecs.open(self.file_corpus, 'r', encoding='utf-8') as f:
            # Iterate the file directly: the line index was never used.
            for line in f:
                yield list(line.strip())
class WordSentences(object):
    """Re-iterable stream of word-level sentences read from a text file.

    Every line of the corpus is treated as one sentence. The line is stripped
    of surrounding whitespace and segmented with ``jieba.cut``. The file is
    reopened on each iteration, so the object can be iterated several times.
    """

    def __init__(self, file_corpus):
        """Remember the corpus path; the file is not opened until iteration.

        Args:
            file_corpus: Path to a UTF-8 encoded text file, one sentence
                per line.
        """
        self.file_corpus = file_corpus

    def __iter__(self):
        """Yield each corpus line as the list of tokens produced by jieba."""
        with codecs.open(self.file_corpus, 'r', encoding='utf-8') as f:
            # Iterate the file directly: the line index was never used.
            for line in f:
                # jieba.cut returns a generator; materialize it as a list.
                yield list(jieba.cut(line.strip()))
if __name__ == '__main__':
    # Start the wall-clock timer before any setup so the reported duration
    # covers the whole run.
    started_at = time.time()

    # Character-level sentence stream over the corpus file.
    sentences = CharSentences(file_corpus)

    # Print INFO-level log records with a timestamp and level prefix.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )

    # Train 50-dimensional vectors on the sentence stream.
    model = word2vec.Word2Vec(
        sentences,
        vector_size=50,
        window=5,
        min_count=1,
        workers=6,
        max_vocab_size=5000,
    )

    # Persist the vectors in the plain-text word2vec format.
    model.wv.save_word2vec_format(file_voc, binary=False)

    print('-------------------------------------------')
    elapsed = time.time() - started_at
    print("Training word2vec model cost %.3f seconds...\n" % elapsed)
08-01
1890
12-01
3886
10-23
10-23