__author__ = 'su'
import os
import logging
import sys
import re
import jieba
import multiprocessing
import gensim
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def process_wiki(inp, outp):
    """Dump the articles of a Wikipedia corpus to a text file, one per line.

    Each article yielded by ``WikiCorpus.get_texts()`` is written to *outp*
    as a single line of space-separated tokens, encoded as UTF-8.

    Args:
        inp: Passed straight to ``WikiCorpus`` as the corpus source
            (presumably the path to a Wikipedia XML dump -- confirm against
            the caller).
        outp: Path of the text file to create; an existing file is
            overwritten.

    Side effects:
        Configures the root logger (format and INFO level) and logs a
        progress message every 10000 articles.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # The context manager guarantees the file is flushed and closed even if
    # parsing the dump raises part-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        # NOTE(review): the empty dictionary presumably stops gensim from
        # building a vocabulary on construction -- confirm against the
        # gensim version in use.
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for count, text in enumerate(wiki.get_texts(), start=1):
            # Tokens may be UTF-8 bytes or str depending on the gensim
            # release; normalise to str so the join works in both cases.
            tokens = [
                token.decode('utf-8') if isinstance(token, bytes) else token
                for token in text
            ]
            output.write(' '.join(tokens) + '\n')
            if count % 10000 == 0:
                logger.info('Saved %d articles', count)
# word2vec similarity computation (based on Wiki)
# Latest recommended article published on 2024-04-21 17:00:41