import pandas as pd#新增第六次
import jieba
# Load the pre-processed abstracts (UTF-8 CSV produced by an earlier step).
# A bare `df.shape` is notebook-style output and prints nothing when run as a
# script, so echo it explicitly.
df = pd.read_csv("abstract after pre-process.csv", encoding='UTF-8')
print(df.shape)
import jieba
def chinese_word_cut(mytext):
    """Segment Chinese text with jieba and join the tokens with spaces.

    Args:
        mytext: a string of (mostly Chinese) text to segment.

    Returns:
        A single space-separated string of jieba tokens, suitable as input
        for sklearn's whitespace-based vectorizers.
    """
    # The pasted source lost this line's indentation and the space after
    # `return`; restored so the function actually has a body.
    return " ".join(jieba.cut(mytext))
# Segment every abstract into a space-joined token string.
# Bare `df.content_cutted.head()` is a notebook-ism that shows nothing in a
# script, so print the preview explicitly.
df["content_cutted"] = df.content.apply(chinese_word_cut)
print(df.content_cutted.head())
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Build a bag-of-words term-count matrix over the segmented abstracts.
# NOTE(review): stop_words='english' removes only English stopwords, while the
# tokens here are mostly Chinese after jieba segmentation — confirm this is
# intended (a Chinese stopword list may be more appropriate).
n_features = 1000
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    max_features=n_features,  # keep only the most frequent terms
    stop_words='english',
    max_df=0.5,               # drop terms appearing in more than half the docs
    min_df=10,                # drop terms appearing in fewer than 10 docs
)
tf = tf_vectorizer.fit_transform(df.content_cutted)
from sklearn.decomposition import LatentDirichletAllocation

# Candidate topic counts to sweep: 10, 20, ..., 190.
n_components = range(10, 200, 10)
# Placeholder perplexity per candidate, filled in by the training loop below.
perplexityLst = [1.0] * len(n_components)

from time import time  # was `from time import *`; only `time()` is used
import matplotlib.pyplot as plt
import os      # NOTE(review): unused in this chunk — kept in case later code needs it
import codecs  # NOTE(review): unused in this chunk — kept in case later code needs it
# Train one LDA model per candidate topic count and record its perplexity
# along with the training time.  The pasted source lost the loop body's
# indentation; restored here.
# NOTE(review): perplexity is measured on the training matrix itself — a
# held-out split would give a less optimistic model-selection criterion.
lda_models = []
for idx, n_topic in enumerate(n_components):
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=20,
                                    learning_method='batch',
                                    evaluate_every=200,
                                    # perp_tol=0.1, #default
                                    # doc_topic_prior=1/n_topic, #default
                                    # topic_word_prior=1/n_topic, #default
                                    verbose=0)
    t0 = time()
    lda.fit(tf)
    perplexityLst[idx] = lda.perplexity(tf)
    lda_models.append(lda)  # keep every fitted model so the best can be reused
    print("# of Topic: %d, " % n_components[idx],)
    print("done in %0.3fs, N_iter %d, " % ((time() - t0), lda.n_iter_),)
    print("Perplexity Score %0.3f" % perplexityLst[idx])
# Pick the candidate with the lowest perplexity (first one on ties, matching
# list.index semantics) and report it.
best_index = min(range(len(perplexityLst)), key=perplexityLst.__getitem__)
best_n_topic = n_components[best_index]
best_model = lda_models[best_index]
print("Best # of Topic: ", best_n_topic)
# Plot perplexity versus the number of topics and save the figure to disk.
fig, ax = plt.subplots(1, 1)
ax.plot(n_components, perplexityLst)
ax.set_xlabel("# of topics")
ax.set_ylabel("Approximate Perplexity")
ax.grid(True)
plt.savefig("困惑度2.png")
plt.show()
# 计算困惑度 (compute perplexity) — stray page text from the blog source,
# commented out so the script parses.
# 最新推荐文章于 2023-04-24 19:46:54 发布 (blog metadata, not code)