# python下进行lda主题挖掘(三)——计算困惑度perplexity

1.LDA主题模型困惑度

perplexity是一种信息论的度量方法：b的perplexity值定义为以b的熵为指数的幂（即 2^H(b)，其中b可以是一个概率分布，或者一个概率模型），通常用于概率模型之间的比较
wiki上列举了三种perplexity的计算：
1.1 概率分布的perplexity

1.2 概率模型的perplexity

1.3单词的perplexity
perplexity经常用于语言模型的评估，物理意义是单词的编码大小。例如，如果在某个测试语句上，语言模型的perplexity值为2^190，说明该句子的编码需要190bits
2.困惑度perplexity公式
$perplexity = \exp\left(-\frac{\sum_{w} \log p(w)}{N}\right)$

3.计算困惑度的代码

PS：将语料经过TFIDF训练模型后计算得到的困惑度要远大于直接进行训练的困惑度（在我这边是这样），应该是正常情况，不必惊慌。

# -*- coding: utf-8 -*-
"""Script setup: imports and logging for LDA perplexity evaluation."""
import sys
import os
import math  # needed by perplexity() below (math.log / math.exp) -- was missing
import logging
from datetime import datetime

from gensim import corpora, models
from gensim.corpora import Dictionary

# NOTE: the original called sys.setdefaultencoding('utf-8') here. That
# attribute does not exist in Python 3, and even in Python 2 it was only
# reachable after reload(sys); the call has been removed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s : ',
                    level=logging.INFO)

def perplexity(ldamodel, testset, dictionary, size_dictionary, num_topics):
    """Calculate the perplexity of an LDA model on a held-out test set.

    perplexity = exp( -sum_w count(w) * log p(w) / N ), where
    p(w) = sum_z p(z|d) * p(w|z) and N is the total number of tokens.

    Args:
        ldamodel: trained gensim LdaModel; must provide show_topic() and
            get_document_topics().
        testset: list of documents; each document maps word_id -> count
            (this function calls doc.items(), so a dict-like BoW is expected).
        dictionary: mapping word_id -> word string (e.g. a gensim Dictionary).
        size_dictionary: vocabulary size; how many words to fetch per topic.
        num_topics: number of topics in the model.

    Returns:
        float: the perplexity (lower is better).
    """
    print('the info of this ldamodel: \n')
    print('num of testset: %s; size_dictionary: %s; num of topics: %s'
          % (len(testset), size_dictionary, num_topics))

    # topic_word_list[z] maps word -> p(w|z), e.g.
    # [{u'business': 0.0100, u'family': 0.0088, ...}, ...]
    topic_word_list = []
    for topic_id in range(num_topics):
        topic_word = ldamodel.show_topic(topic_id, size_dictionary)
        topic_word_list.append({word: probability for word, probability in topic_word})

    # doc_topics_list[i] holds the (topic_id, p(z|d)) pairs of testset[i],
    # e.g. [(0, 0.000621...), (1, 0.000621...), ...]
    doc_topics_list = []
    for doc in testset:
        doc_topics_list.append(ldamodel.get_document_topics(doc, minimum_probability=0))

    prob_doc_sum = 0.0  # sum of per-document log-likelihoods
    testset_word_num = 0  # N: total token count over the test set
    for i, doc in enumerate(testset):
        prob_doc = 0.0  # log-likelihood of this document
        doc_word_num = 0  # token count of this document
        for word_id, num in doc.items():
            doc_word_num += num
            word = dictionary[word_id]
            # p(w) = sum_z p(z|d) * p(w|z)
            prob_word = 0.0
            for topic_id in range(num_topics):
                prob_topic = doc_topics_list[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic * prob_topic_word
            # BUG FIX: the original added log(p(w)) once per word *type*,
            # while testset_word_num counts *tokens*; every occurrence of
            # the word must contribute, so weight the log term by its count.
            prob_doc += math.log(prob_word) * num
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num

    prep = math.exp(-prob_doc_sum / testset_word_num)  # perplexity = exp(-sum(log p(w)) / N)
    print("the perplexity of this ldamodel is : %s" % prep)
    return prep

if __name__ == '__main__':
    # Paths to the artifacts produced by the training step.
    middatafolder = r'E:\work\lda' + os.sep
    dictionary_path = middatafolder + 'dictionary.dictionary'
    corpus_path = middatafolder + 'corpus.mm'
    ldamodel_path = middatafolder + 'lda.model'

    # BUG FIX: the original referenced `dictionary` and `lda_multi` without
    # ever defining them; load them from the saved artifacts.
    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.MmCorpus(corpus_path)
    lda_multi = models.LdaModel.load(ldamodel_path)
    num_topics = 50

    # Sample roughly 1/300 of the corpus as the held-out test set.
    # BUG FIX: use integer division — range() requires an int in Python 3.
    # MmCorpus yields documents as lists of (word_id, count) tuples, while
    # perplexity() calls doc.items(), so convert each sampled doc to a dict.
    testset = []
    for i in range(corpus.num_docs // 300):
        testset.append(dict(corpus[i * 300]))

    prep = perplexity(lda_multi, testset, dictionary, len(dictionary.keys()), num_topics)


1.LDA主题模型评估方法
2.LDA perplexity计算 java写的代码，本博客中的代码是参照该博客的函数写的。
3.Perplexity(困惑度)