import jieba
import gensim
from gensim import corpora
from gensim.matutils import corpus2dense
# Read the source document (Zhu Ziqing's "背影") line by line.
# NOTE(review): no encoding is given, so the platform default is used —
# presumably the file is UTF-8; confirm before running on another OS.
with open("背影.txt", "r") as f:
    read = f.readlines()  # one string per line

# Read the stop-word list (one word per line; note the GBK encoding).
with open("stop_words.txt", "r", encoding="GBK") as f:
    stop_word = f.read().splitlines()
# Tokenize every line with jieba and drop stop words.
# A set gives O(1) membership tests instead of scanning the list per token.
stop_word_set = set(stop_word)
# jieba.cut only accepts a string (one line at a time), not a list.
text = [
    [seg for seg in jieba.cut(line) if seg not in stop_word_set]
    for line in read
]
# Build a dictionary mapping each token to an integer id.
dictionary = corpora.Dictionary(text)
# Bag-of-words corpus: one (token_id, count) list per document line.
word_count = [dictionary.doc2bow(doc) for doc in text]
# Dense document-term matrix. corpus2dense returns terms x documents;
# the original bare `dtm_matrix.T` discarded its result (notebook residue),
# so assign the transpose to get the conventional documents x terms layout.
dtm_matrix = corpus2dense(word_count, len(dictionary)).T
from gensim import models  # NOTE(review): mid-file import; should move to the top of the file

print(len(word_count))
# Fit a TF-IDF model over the bag-of-words corpus.
tfidf_model = models.TfidfModel(word_count)
# Applying the model re-weights every document's term counts by TF-IDF.
tfidf = tfidf_model[word_count]
print(tfidf)
# Dense TF-IDF matrix (terms x documents). The original file followed this
# with a bare `tfidf_matrix` expression — a no-op outside a REPL — removed here.
tfidf_matrix = corpus2dense(tfidf, len(dictionary))
# Train word vectors on the tokenized document.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call
# matches the gensim 3.x API this script was written against.
model = gensim.models.Word2Vec(text, size=100, window=5, min_count=2)
# Look up the learned vector for one word. The original bare expression only
# displayed a value in a notebook, so bind it to a name here.
vector = model.wv[u'月台']
# The last line of the original file was untagged Chinese prose (a SyntaxError);
# it belongs in a comment: "Library functions may be updated, so some calls may
# stop working — mind the date; written 2018/3/29."