LDA代码:
class LDAModel(object):
def __init__(self, dpre):
self.dpre = dpre # 获取预处理参数
#
# 模型参数
# 聚类个数K,迭代次数iter_times,每个类特征词个数top_words_num,超参数α(alpha) β(beta)
#
self.K = K
self.beta = beta
self.alpha = alpha
self.iter_times = iter_times
self.top_words_num = top_words_num
#
# 文件变量
# 分好词的文件trainfile
# 词对应id文件wordidmapfile
# 文章-主题分布文件thetafile
# 词-主题分布文件phifile
# 每个主题topN词文件topNfile
# 最后分派结果文件tassginfile
# 模型训练选择的参数文件paramfile
#
self.wordidmapfile = wordidmapfile
self.trainfile = trainfile
self.thetafile = thetafile
self.phifile = phifile
self.topNfile = topNfile
self.tassginfile = tassginfile
self.paramfile = paramfile
# p,概率向量 double类型,存储采样的临时变量
# nw,词word在主题topic上的分布
# nwsum,每各topic的词的总数
# nd,每个doc中各个topic的词的总数
# ndsum,每各doc中词的总数
self.p = np.zeros(self.K)
self.nw = np.zeros((self.dpre.words_count, self.K), dtype="int")
self.nwsum = np.zeros(self.K, dtype="int")
self.nd = np.zeros((self.dpre.docs_count, self.K), dtype="int")
self.ndsum = np.zeros(dpre.docs_count, dtype="int")
self.Z = np.array(
[[0 for y in xrange(dpre.docs[x].length)] for x in xrange(dpre.docs_count)]) # M*doc.size(),文档中词的主题分布
# 随机先分配类型
for x in xrange(len(self.Z)):
self.ndsum[x] = self.dpre.docs[x].length
for y in xrange(self.dpre.docs[x].length):
topic = random.randint(0, self.K - 1)
self.Z[x][y] = topic
self.nw[self.dpre.docs[x].words[y]][topic] += 1
self.nd[x][topic] += 1
self.nwsum[topic] += 1
self.theta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(self.dpre.docs_count)])
self.phi = np.array([[0.0 for y in xrange(self.dpre.words_count)] for x in xrange(self.K)])
def sampling(self, i, j):
topic = self.Z[i][j]
word = self.dpre.docs[i].words[j]
self.nw[word][topic] -= 1
self.nd[i][topic] -= 1
self.nwsum[topic] -= 1
self.ndsum[i] -= 1
Vbeta = self.dpre.words_count * self.beta
Kalpha = self.K * self.alpha
self.p = (self.nw[word] + self.beta) / (self.nwsum + Vbeta) * \
(self.nd[i] + self.alpha) / (self.ndsum[i] + Kalpha)
for k in xrange(1, self.K):
self.p[k] += self.p[k - 1]
u = random.uniform(0, self.p[self.K - 1])
for topic in xrange(self.K):
if self.p[topic] > u:
break
self.nw[word][topic] += 1
self.nwsum[topic] += 1
self.nd[i][topic] += 1
self.ndsum[i] += 1
return topic
def est(self):
# Consolelogger.info(u"迭代次数为%s 次" % self.iter_times)
for x in xrange(self.iter_times):
for i in xrange(self.dpre.docs_count):
for j in xrange(self.dpre.docs[i].length):
topic = self.sampling(i, j)
self.Z[i][j] = topic
logger.info(u"迭代完成。")
logger.debug(u"计算文章-主题分布")
self._theta()
logger.debug(u"计算词-主题分布")
self._phi()
logger.debug(u"保存模型")
self.save()
def _theta(self):
for i in xrange(self.dpre.docs_count):
self.theta[i] = (self.nd[i] + self.alpha) / (self.ndsum[i] + self.K * self.alpha)
def _phi(self):
for i in xrange(self.K):
self.phi[i] = (self.nw.T[i] + self.beta) / (self.nwsum[i] + self.dpre.words_count * self.beta)
困惑度:
def f_perplexity(word_frequency,word_count): #计算困惑度
duishu=-math.log(word_frequency)
kuohaoli=duishu/word_count
perplexity=math.exp(kuohaoli)
return perplexity