前段时间研究了一下爬虫,正好用它从csdn主页“大数据”、“移动开发”、“软件开发”三个栏目下爬取了一些技术资讯文章,从每个栏目下各取20篇文档,看看能否用LDA主题模型从中提取一些有意义的关键词,三个栏目下的文章都放在文本文件里,部分内容如下图所示
参考《LDA漫游指南》以及论文《Parameter estimation for text analysis》,自己尝试实现了LDA模型参数估计的Gibbs抽样算法,代码如下:
lda.py
#coding=utf-8
from __future__ import division
import random
import copy
class ldaModel:
    # LDA topic model estimated via collapsed Gibbs sampling.
    def __init__(self, documents=None, V=0):
        """Initialize the model.

        documents: list of documents, each a list of integer word ids in [0, V).
        V: vocabulary size (total number of distinct words).
        """
        self.documents = documents
        self.V = V               # vocabulary size
        self.K = 0               # number of topics (set by gibbsSampling)
        self.iterations = 10000  # total Gibbs sampling iterations
        # NOTE: the original comments were misplaced — "burn-in period"
        # was attached to `interval`; corrected here.
        self.burnIn = 2000       # burn-in period: iterations discarded before sampling
        self.interval = 100      # thinning lag between collected samples
        self.theta = None        # doc-topic distribution matrix (M x K)
        self.phi = None          # topic-word distribution matrix (K x V)
        self.Z = []              # per-document topic assignment of each word
# 配置参数
def configure(self, iterations, burnIn, interval):
self.iterations = iterations
self.burnIn = burnIn
self.interval = interval
# Gibbs抽样
def gibbsSampling(self, K, alpha, beta):
self.K = K
M = len(self.documents)
numStats = 0
nw = [[0 for col in range(self.V)] for row in range(self.K)]
nwSum = [0 for row in range(self.K)]
nd = [[0 for col in range(self.K)] for row in range(M)]
ndSum = [0 for row in range(M)]
thetaSum = [[0 for col in range(self.K)] for row in range(M)]
phiSum = [[0 for col in range(self.V)] for row in range(self.K)]
self.initialState(nw, nwSum, nd, ndSum)
for i in range(self.iterations):
for m in range(M):
for n in range(len(self.documents[m])):
k = self.Z[m][n]
t = self.documents[m][n]
nw[k][t] = nw[k][t] - 1
nwSum[k] = nwSum[k] -1
nd[m][k] = nd[m][k] - 1
ndSum[m] = ndSum[m] -1
k1 = self.reSampling(m, t, alpha, beta, nw, nwSum, nd, ndSum)
self.Z[m][n] = k1
nw[k1][t] = nw[k1][t] + 1
nwSum[k1] = nwSum[k1] + 1
nd[m][k1] = nd[m][k1] + 1
ndSum[m] = ndSum[m] + 1
if ((i > self.burnIn) and ((i - self.burnIn) % self.interval == 0)):
for m in range(len(self.documents)):
for k in range(self.K):
# thetaSum[m][k] += (nd[m][k] + alpha) / (ndSum[m] + alpha * self.K)
thetaSum[m][k] = (nd[m][k] + alpha) / (ndSum[m] + alpha * self.K)
for k in range(self.K):
for t in range(self.V):
# phiSum[k][t] += (nw[k][t] + beta) / (nwSum[k] + beta * self.V)
phiSum[k][t] = (nw[k][t] + beta) / (nwSum[k] + beta * self.V)
# numStats = numStats + 1
numStats = numStats + 1
self.updatePara(thetaSum, phiSum, numStats)
return nw, nwSum # 返回值,在预测新文档的主题分布时需要用到这两个量
def initialState(self, nw, nwSum, nd, ndSum):
M = len(self.documents)
self.theta = [[0 for col in range(self.K)] for row in range(M)]
self.phi = [[0 for col in range(self.V)] for row in range(self.K)]
for m in range(M):
N = len(self.documents[m])
self.Z.append([])
for n in range(N):
k = random.choice(range(self.K)) # 随机生成第m篇文档中第n个词的主题编号
self.Z[m].append(k)
t = self.documents[m][n]
nw[k][t] = nw[k][t] + 1
nwSum[k] = nwSum[k] + 1
nd[m][k] = nd[m][k] + 1
ndSum[m] = ndSum[m] + 1
def reSampling(self, m, t, alpha, beta, nw, nwSum, nd, ndSum):
p = [0 for i in range(self.K)]
fo