安装依赖工具包:
pip install -U gensim
安装过程中如果出现与已安装工具包的版本冲突，可以先 uninstall 冲突的包、删除 Lib 目录下的相关包，或改用:
pip install -U gensim --ignore-installed scipy
Word2Vec、Doc2Vec原理可以参考链接:
深度学习笔记——Word2vec和Doc2vec原理理解并结合代码分析_mpk_no1的博客-CSDN博客
简单操作代码如下:
# -*- coding:utf-8 -*-
import io
import logging
import os
import sys

import jieba
import pymongo
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models.word2vec import Text8Corpus, LineSentence, Word2Vec
default_encoding = 'utf-8'
# Python 2 compatibility shim: force the interpreter's default encoding to
# UTF-8 so implicit str/unicode conversions don't raise. On Python 3 the
# default encoding is already 'utf-8', so this branch never executes
# (reload()/setdefaultencoding() do not exist there).
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

# News articles previously crawled into MongoDB serve as the training corpus.
# NOTE(review): MongoClient connects lazily; the host is hard-coded — confirm
# it matches the deployment environment.
client = pymongo.MongoClient(host='192.168.0.1', port=27017)
db = client['news']

# Stop-word list lives at ../static/dic/chinese_stop_words.txt relative to
# the working directory.
chinese_stop_words_file = os.path.abspath(
    os.getcwd() + os.sep + '..' + os.sep + 'static' + os.sep + 'dic'
    + os.sep + 'chinese_stop_words.txt')
# Fixes vs. the original: the file handle is now closed (with-statement), the
# encoding is explicit instead of locale-dependent (io.open works on both
# Python 2 and 3), and the words go into a set so the per-token membership
# test in sentence_segment() is O(1) instead of O(n) over a list.
with io.open(chinese_stop_words_file, 'r', encoding='utf-8') as stop_words_fp:
    chinese_stop_words = {line.strip() for line in stop_words_fp}

# Running total of tokens kept (i.e. not filtered as stop words) across all
# calls to sentence_segment().
total_cut_word_count = 0
# 句子分割
def sentence_segment(sentence):
    """Tokenize *sentence* with jieba and drop Chinese stop words.

    Side effect: adds the number of kept tokens to the module-level
    ``total_cut_word_count`` counter.

    :param sentence: raw text to segment
    :return: list of tokens not present in ``chinese_stop_words``
    """
    global total_cut_word_count
    kept_words = [word for word in jieba.cut(sentence)
                  if word not in chinese_stop_words]
    # One bulk update instead of incrementing once per kept token.
    total_cut_word_count += len(kept_words)
    return kept_words
# 准备语料库
def prepare_word_