这是第一次接单遇到主题模型。最开始只是简单地用 Python 实现 LDA 模型功能，后面加了一个判断主题模型个数的步骤。问题在于 LDA 模型有两种调用方法：一种是原生的 gensim（pip install gensim），另一种包含在 scikit-learn 里面。我推荐使用 scikit-learn 包，因为下载方便、参数查看也方便。
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw documents: 1.xlsx has no header row and each document's text
# is spread across columns 0-5.
test = pd.read_excel('1.xlsx', header=None)
test.head()
# Merge the six text columns into a single 'test' column.
# NOTE(review): the original assigned a 6-column DataFrame to one column
# (test['test'] = test.loc[:, [0, 1, 2, 3, 4, 5]], duplicated on two lines),
# which raises ValueError in modern pandas; join the columns as strings instead.
test['test'] = test[[0, 1, 2, 3, 4, 5]].astype(str).agg(' '.join, axis=1)
# Drop the now-redundant source columns and any rows with missing text.
test = test.drop([0, 1, 2, 3, 4, 5], axis=1)
test.dropna(inplace=True)
# a: the Series of documents fed to the vectorizers below.
a = test['test']
# Vectorization settings.
n_features = 2000   # keep only the 2000 most frequent terms
n_top_words = 1000  # number of top words kept per topic when inspecting results
print("Extracting tf features for LDA...")
# Term-frequency vectorizer: keep English non-stopwords that appear in at
# least 2 documents and in at most 95% of documents, capped at the
# n_features most frequent terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=n_features,stop_words='english')### keep words seen at least twice, top 2000 by frequency, to build the document vectors
tf = tf_vectorizer.fit_transform(a)### transform the documents into the term-frequency matrix
# Model selection: sweep the number of topics and record the training
# perplexity (lower is better) for each candidate count.
grid = dict()
for n_topics in range(1, 100, 5):  # candidate topic counts 1, 6, 11, ..., 96
    # Fit an online-variational LDA with a fixed seed so the perplexity
    # curve is reproducible run to run.
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    )
    lda.fit(tf)  # train on the term-frequency matrix
    # Store as a one-element list so pd.DataFrame(grid) below yields one
    # row of perplexities indexed by topic count.
    # (The original also computed lda.transform(tf) into an unused local;
    # that dead work is removed here.)
    grid[n_topics] = [lda.perplexity(tf)]
# Plot the perplexity curve: one point per candidate topic count, so the
# "elbow" suggests a reasonable number of topics.
df = pd.DataFrame(grid)
topic_counts = df.columns.values
perplexities = df.iloc[0].values
plt.figure(figsize=(14, 8), dpi=120)
plt.plot(topic_counts, perplexities, '#007A99')
plt.xticks(topic_counts)
plt.ylabel('train Perplexity')
plt.show()
# Second approach: train an LDA model with gensim on the same documents.
# Tokenize each document by splitting on single spaces.
doc_clean=[i.split(' ') for i in a]
from gensim import corpora
# Build a dictionary for the corpus: each unique token gets an integer id.
dictionary = corpora.Dictionary(doc_clean)
# Using that dictionary, convert the tokenized documents into a
# document-term (bag-of-words) matrix.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
from gensim import models
Lda = models.ldamodel.LdaModel
# Run and train the LDA model on the document-term matrix (6 topics,
# 50 passes over the corpus).
ldamodel = Lda(doc_term_matrix, num_topics=6, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=6, num_words=10))