调用 gensim 库,将训练集文本转换为 TF-IDF 表示
def CorpusAndDic(texts):
    """Build a gensim dictionary and a bag-of-words corpus from ``texts``.

    Args:
        texts: Iterable of documents handed to ``gensim.corpora.Dictionary``
            and then to ``doc2bow``; presumably each document is a list of
            tokens (confirm against the caller). It is iterated twice, so it
            must be re-iterable rather than a one-shot generator.

    Returns:
        A ``(dictionary, corpus)`` tuple: the ``Dictionary`` built from
        ``texts`` and a list with the ``doc2bow`` result of every document,
        in input order.
    """
    # Dictionary built over all documents.
    dictionary = gensim.corpora.Dictionary(texts)
    # Bag-of-words corpus; per the gensim docs each doc2bow result is a list
    # of (token_id, count) pairs, not (word, count).
    corpus = [dictionary.doc2bow(text) for text in texts]
    print("词典:", dictionary)
    print("词库:", corpus)
    return dictionary, corpus
dictionary, corpus = CorpusAndDic(out_sentences)
# Initialise the TF-IDF model, using the bag-of-words corpus as training data.
tfidf = gensim.models.TfidfModel(corpus)
# Apply the model to that same corpus to obtain its TF-IDF representation.
corpus_tfidf = tfidf[corpus]
# Print every transformed document, numbered from 1.
for index, doc in enumerate(corpus_tfidf, start=1):
    print('第', index, '个文档:', doc)
词向量部分代码参考:Gensim:word2vec(jieba分词,去停用词)