/*
* (C) Copyright 2005, Gregor Heinrich (gregor :: arbylon : net) (This file is
* part of the org.knowceans experimental software packages.)
*/
/*
* LdaGibbsSampler is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option) any
* later version.
*/
/*
* LdaGibbsSampler is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*/
/*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 6, 2005
*/
package com.xh.lda;

import java.text.DecimalFormat;
import java.text.NumberFormat;
/**
* Gibbs sampler for estimating the best assignments of topics for words and
* documents in a corpus. The algorithm is introduced in Tom Griffiths' paper
* "Gibbs sampling in the generative model of Latent Dirichlet Allocation"
* (2002).
*
* @author heinrich
*/
publicclassLdaGibbsSampler {
/**
* document data (term lists)
*/
int[][] documents;
/**
* vocabulary size
*/
intV;
/**
* number of topics
*/
intK;
/**
* Dirichlet parameter (document--topic associations)
*/
doublealpha;
/**
* Dirichlet parameter (topic--term associations)
*/
doublebeta;
/**
* topic assignments for each word.
*/
intz[][];
/**
* cwt[i][j] number of instances of word i (term?) assigned to topic j.
*/
int[][] nw;
/**
* na[i][j] number of words in document i assigned to topic j.
*/
int[][] nd;
/**
* nwsum[j] total number of words assigned to topic j.
*/
int[] nwsum;
/**
* nasum[i] total number of words in document i.
*/
int[] ndsum;
/**
* cumulative statistics of theta
*/
double[][] thetasum;
/**
* cumulative statistics of phi
*/
double[][] phisum;
/**
* size of statistics
*/
intnumstats;
/**
* sampling lag (?)
*/
privatestaticintTHIN_INTERVAL =20;
/**
* burn-in period
*/
privatestaticintBURN_IN =100;
/**
* max iterations
*/
privatestaticintITERATIONS =1000;
/**
* sample lag (if -1 only one sample taken)
*/
privatestaticintSAMPLE_LAG;
privatestaticintdispcol =0;
/**
* Initialise the Gibbs sampler with data.
*
* @param V
* vocabulary size
* @param data
*/
publicLdaGibbsSampler(int[][] documents,intV) {
this.documents = documents;
this.V = V;
}
/**
* Initialisation: Must start with an assignment of observations to topics ?
* Many alternatives are possible, I chose to perform random assignments
* with equal probabilities
*
* @param K
* number of topics
* @return z assignment of topics to words
*/
publicvoidinitialState(intK) {
inti;
intM = documents.length;
// initialise count variables.
nw =newint[V][K];
nd =newint[M][K];
nwsum =newint[K];
ndsum =newint[M];
// The z_i are are initialised to values in [1,K] to determine the
// initial state of the Markov chain.
z =newint[M][];
for(intm =0; m
intN = documents[m].length;
z[m] =newint[N];
for(intn =0; n
inttopic = (int) (Math.random() * K);
z[m][n] = topic;
// number of instances of word i assigned to topic j
nw[documents[m][n]][topic]++;
// number of words in document i assigned to topic j.
nd[m][topic]++;
// total number of words assigned to topic j.
nwsum[topic]++;
}
// total number of words in document i
ndsum[m] = N;
}
}
/**
* Main method: Select initial state ? Repeat a large number of times: 1.
* Select an element 2. Update conditional on other elements. If
* appropriate, output summary for each run.
*
* @param K
* number of topics
* @param alpha
* symmetric prior parameter on document--topic associations
* @param beta
* symmetric prior parameter on topic--term associations
*/
publicvoidgibbs(intK,doublealpha,doublebeta) {
this.K = K;
this.alpha = alpha;
this.beta = beta;
// init sampler statistics
if(SAMPLE_LAG >0) {
thetasum =newdouble[documents.length][K];
phisum =newdouble[K][V];
numstats =0;
}
// initial state of the Markov chain:
initialState(K);
System.out.println("Sampling "+ ITERATIONS
+" iterations with burn-in of "+ BURN_IN +" (B/S="
+ THIN_INTERVAL +").");
for(inti =0; i
// for all z_i
for(intm =0; m
for(intn =0; n
// (z_i = z[m][n])
// sample from p(z_i|z_-i, w)
inttopic = sampleFullConditional(m, n);
z[m][n] = topic;
}
}
if((i
// System.out.print("B");
dispcol++;
}
// display progress
if((i > BURN_IN) && (i % THIN_INTERVAL ==0)) {
// System.out.print("S");
dispcol++;
}
// get statistics after burn-in
if((i > BURN_IN) && (SAMPLE_LAG >0) && (i % SAMPLE_LAG ==0)) {
updateParams();
// System.out.print("|");
if(i % THIN_INTERVAL !=0)
dispcol++;
}
if(dispcol >=100) {
// System.out.println();
dispcol =0;
}
}
}
/**
* Sample a topic z_i from the full conditional distribution: p(z_i = j |
* z_-i, w) = (n_-i,j(w_i) + beta)/(n_-i,j(.) + W * beta) * (n_-i,j(d_i) +
* alpha)/(n_-i,.(d_i) + K * alpha)
*
* @param m
* document
* @param n
* word
*/
privateintsampleFullConditional(intm,intn) {
// remove z_i from the count variables
inttopic = z[m][n];
nw[documents[m][n]][topic]--;
nd[m][topic]--;
nwsum[topic]--;
ndsum[m]--;
// do multinomial sampling via cumulative method:
double[] p =newdouble[K];
for(intk =0; k
p[k] = (nw[documents[m][n]][k] + beta) / (nwsum[k] + V * beta)
* (nd[m][k] + alpha) / (ndsum[m] + K * alpha);
}
// cumulate multinomial parameters
for(intk =1; k
p[k] += p[k -1];
}
// scaled sample because of unnormalised p[]
doubleu = Math.random() * p[K -1];
for(topic =0; topic
if(u
break;
}
// add newly estimated z_i to count variables
nw[documents[m][n]][topic]++;
nd[m][topic]++;
nwsum[topic]++;
ndsum[m]++;
returntopic;
}
/**
* Add to the statistics the values of theta and phi for the current state.
*/
privatevoidupdateParams() {
for(intm =0; m
for(intk =0; k
thetasum[m][k] += (nd[m][k] + alpha) / (ndsum[m] + K * alpha);
}
}
for(intk =0; k
for(intw =0; w
phisum[k][w] += (nw[w][k] + beta) / (nwsum[k] + V * beta);
}
}
numstats++;
}
/**
* Retrieve estimated document--topic associations. If sample lag > 0 then
* the mean value of all sampled statistics for theta[][] is taken.
*
* @return theta multinomial mixture of document topics (M x K)
*/
publicdouble[][] getTheta() {
double[][] theta =newdouble[documents.length][K];
if(SAMPLE_LAG >0) {
for(intm =0; m
for(intk =0; k
theta[m][k] = thetasum[m][k] / numstats;
}
}
}else{
for(intm =0; m
for(intk =0; k
theta[m][k] = (nd[m][k] + alpha) / (ndsum[m] + K * alpha);
}
}
}
returntheta;
}
/**
* Retrieve estimated topic--word associations. If sample lag > 0 then the
* mean value of all sampled statistics for phi[][] is taken.
*
* @return phi multinomial mixture of topic words (K x V)
*/
publicdouble[][] getPhi() {
System.out.println("K is:"+K+",V is:"+V);
double[][] phi =newdouble[K][V];
if(SAMPLE_LAG >0) {
for(intk =0; k
for(intw =0; w
phi[k][w] = phisum[k][w] / numstats;
}
}
}else{
for(intk =0; k
for(intw =0; w
phi[k][w] = (nw[w][k] + beta) / (nwsum[k] + V * beta);
}
}
}
returnphi;
}
/**
* Configure the gibbs sampler
*
* @param iterations
* number of total iterations
* @param burnIn
* number of burn-in iterations
* @param thinInterval
* update statistics interval
* @param sampleLag
* sample interval (-1 for just one sample at the end)
*/
publicvoidconfigure(intiterations,intburnIn,intthinInterval,
intsampleLag) {
ITERATIONS = iterations;
BURN_IN = burnIn;
THIN_INTERVAL = thinInterval;
SAMPLE_LAG = sampleLag;
}
/**
* Driver with example data.
*
* @param args
*/
publicstaticvoidmain(String[] args) {
// words in documents
int[][] documents = {
{1,4,3,2,3,1,4,3,2,3,1,4,3,2,3,6},
{2,2,4,2,4,2,2,2,2,4,2,2},
{1,6,5,6,0,1,6,5,6,0,1,6,5,6,0,0},
{5,6,6,2,3,3,6,5,6,2,2,6,5,6,6,6,0},
{2,2,4,4,4,4,1,5,5,5,5,5,5,1,1,1,1,0},
{5,4,2,3,4,5,6,6,5,4,3,2},
};
// vocabulary
intV =7;
intM = documents.length;
// # topics
intK =2;
// good values alpha = 2, beta = .5
doublealpha =2;
doublebeta = .5;
System.out.println("Latent Dirichlet Allocation using Gibbs Sampling.");
LdaGibbsSampler lda =newLdaGibbsSampler(documents, V);
lda.configure(10000,2000,100,10);
lda.gibbs(K, alpha, beta);//用gibbs抽样
double[][] theta = lda.getTheta();//Theta是我们所希望的一种分布可能
double[][] phi = lda.getPhi();
System.out.println();
System.out.println();
System.out.println("Document--Topic Associations, Theta[d][k] (alpha="
+ alpha +")");
System.out.print("d\\k\t");
for(intm =0; m
System.out.print(" "+ m %10+" ");
}
System.out.println();
for(intm =0; m
System.out.print(m +"\t");
for(intk =0; k
System.out.print(theta[m][k] +" ");
// System.out.print(shadeDouble(theta[m][k], 1) + " ");
}
System.out.println();
}
System.out.println();
System.out.println("Topic--Term Associations, Phi[k][w] (beta="+ beta
+")");
System.out.print("k\\w\t");
for(intw =0; w
System.out.print(" "+ w %10+" ");
}
System.out.println();
for(intk =0; k
System.out.print(k +"\t");
for(intw =0; w
System.out.print(phi[k][w] +" ");
// System.out.print(shadeDouble(phi[k][w], 1) + " ");
}
System.out.println();
}
}
}