Java Implementation of the LDA Topic Model (Gibbs Sampling)

public class LdaGibbsSampling {
	public static class modelparameters {  
        float alpha = 0.5f; //a common heuristic is 50 / K (0.5 for K = 100)  
        float beta = 0.1f;//usual value is 0.1  
        int topicNum = 100;  
        int iteration = 100;  
        int saveStep = 10;  
        int beginSaveIters = 50;  
    }  
      
    /**Read model parameters from the configuration file. If the  
     * file provides a value for a parameter, that value is used;  
     * otherwise the default defined in modelparameters is kept.  
     * @param ldaparameters  
     * @param parameterFile  
     */  
    private static void getParametersFromFile(modelparameters ldaparameters,  
            String parameterFile) {  
        ArrayList<String> paramLines = new ArrayList<String>();  
        paramLines = FileUtil.readList(parameterFile);  
        for(String line : paramLines){  
            String[] lineParts = line.split("\t");  
            switch(parameters.valueOf(lineParts[0])){  
            case alpha:  
                ldaparameters.alpha = Float.valueOf(lineParts[1]);  
                break;  
            case beta:  
                ldaparameters.beta = Float.valueOf(lineParts[1]);  
                break;  
            case topicNum:  
                ldaparameters.topicNum = Integer.valueOf(lineParts[1]);  
                break;  
            case iteration:  
                ldaparameters.iteration = Integer.valueOf(lineParts[1]);  
                break;  
            case saveStep:  
                ldaparameters.saveStep = Integer.valueOf(lineParts[1]);  
                break;  
            case beginSaveIters:  
                ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]);  
                break;  
            }  
        }  
    }  
      
    public enum parameters{  
        alpha, beta, topicNum, iteration, saveStep, beginSaveIters;  
    } 
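    /* getParametersFromFile above expects one tab-separated name/value pair
     * per line, with names taken from the parameters enum. A minimal
     * lda_parameters.txt might look like this (values are illustrative,
     * not from the original post; the separator is a tab character):
     *
     *   alpha	0.5
     *   beta	0.1
     *   topicNum	100
     *   iteration	100
     *   saveStep	10
     *   beginSaveIters	50
     */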
    
    /**
     * Train the LDA topic model, then predict a topic for each sample in the
     * given test set. For each sample's most probable topic, collect its top
     * 20 words; the union of these words serves as the keyword set
     * representing the test set.
     * @param trainPathDir
     * @param parameterFile
     * @param resultPath
     * @param testPath
     * @return
     * @throws IOException
     */
    public Set<Word> trainAndPredictLDA(String trainPathDir,String parameterFile,String resultPath,String testPath) throws IOException{
    	 modelparameters ldaparameters = new modelparameters();  
         getParametersFromFile(ldaparameters, parameterFile); 
         Documents docSet = new Documents();  
         docSet.readDocs(trainPathDir);
         System.out.println("wordMap size " + docSet.termToIndexMap.size());  
         FileUtil.mkdir(resultPath);
         LdaModel model = new LdaModel(ldaparameters);  
         System.out.println("1 Initialize the model ...");  
         model.initializeModel(docSet);  
         System.out.println("2 Learning and Saving the model ...");  
         model.inferenceModel(docSet);  
         System.out.println("3 Output the final model ...");  
//         model.saveIteratedModel(ldaparameters.iteration, docSet);  
//         System.out.println("Done!"); 
         
         //Predict topics for the new texts
         Documents testDocs = new Documents();
         List<Message> messages = FileUtil.readMessageFromFile(testPath);
         Set<Integer> topicIndexSet = new HashSet<Integer> ();
         for(Message message : messages){
        	 String content = message.getContent();
        	 Document doc = new Document(content);
        	 testDocs.docs.add(doc);
             topicIndexSet.add(model.predictNewSampleTopic(doc));
         }
         /**
          * Predict the most probable topic for each message, collect the top 20
          * words of each such topic, and weight them by TF-IDF.
          */
         Set<Word> wordSet = model.getWordByTopics(topicIndexSet, 20);
         LDAFeatureProcess.calTFIDFAsWeight(docSet, wordSet);
         return wordSet;
    }
    @Test
    public void test() throws IOException{
    	String resultPath = "ldaResult/";  
        String parameterFile= "source/lda_parameters.txt";
        String trainPathDir = "LDATrain/";
        String testPath = "train/train_messages.txt";
        Set<Word> wordSet = trainAndPredictLDA(trainPathDir,parameterFile,resultPath,testPath);
        FileUtil.writeKeyWordFile("ldaWords/keyWords.doc", new ArrayList<Word>(wordSet));
    }

      
    /** 
     * @param args 
     * @throws IOException  
     */  
    public static void main(String[] args) throws IOException {  
        String resultPath = "ldaResult/";  
        String parameterFile= "source/lda_parameters.txt";  
          
        modelparameters ldaparameters = new modelparameters();  
        getParametersFromFile(ldaparameters, parameterFile); 
        String dirPath = "LDATrain/";
        Documents docSet = new Documents();  
        docSet.readDocs(dirPath);
        System.out.println("wordMap size " + docSet.termToIndexMap.size());  
        FileUtil.mkdir(resultPath);
        LdaModel model = new LdaModel(ldaparameters);  
        System.out.println("1 Initialize the model ...");  
        model.initializeModel(docSet);  
        System.out.println("2 Learning and Saving the model ...");  
        model.inferenceModel(docSet);  
        System.out.println("3 Output the final model ...");  
        model.saveIteratedModel(ldaparameters.iteration, docSet);  
        System.out.println("Done!");  
        
        //Predict the topic of a new sample SMS (a Chinese promotional message)
        String messStr = "好消息!!薇町婚纱造型推出老带新活动啦!已在本店预定的新娘推荐新顾客来本店,定单后即赠送新、老顾客各一支价值58元定妆隔离水(在婚礼当";
        Document doc = new Document(messStr);
        int topicIndex = model.predictNewSampleTopic(doc);
        Set<Word> wordSet  = model.getWordByTopic(topicIndex);        
        FileUtil.writeKeyWordFile("ldaWords/comparedkeyWords.doc", new ArrayList<Word>(wordSet));        
    }  

}
public class LdaModel {
	
    int [][] doc;//word index array  
    int V, K, M;//vocabulary size, topic number, document number  
    int [][] z;//topic label array  
    float alpha; //doc-topic Dirichlet prior parameter   
    float beta; //topic-word Dirichlet prior parameter  
    int [][] nmk;//given document m, count times of topic k. M*K  
    int [][] nkt;//given topic k, count times of term t. K*V  
    int [] nmkSum;//Sum for each row in nmk  
    int [] nktSum;//Sum for each row in nkt  
    double [][] phi;//Parameters for topic-word distribution K*V  
    double [][] theta;//Parameters for doc-topic distribution M*K  
    int iterations;//Times of iterations  
    int saveStep;//The number of iterations between two saving  
    int beginSaveIters;//Begin save model at this iteration  
    Map<String, Integer> wordIndexMap;
    Documents docSet;
      
    public LdaModel(LdaGibbsSampling.modelparameters modelparam) {  
        alpha = modelparam.alpha;  
        beta = modelparam.beta;  
        iterations = modelparam.iteration;  
        K = modelparam.topicNum;  
        saveStep = modelparam.saveStep;  
        beginSaveIters = modelparam.beginSaveIters;  
    }  
  
    public void initializeModel(Documents docSet) { 
    	this.docSet = docSet;
        M = docSet.docs.size();  
        V = docSet.termToIndexMap.size();  
        nmk = new int [M][K];  
        nkt = new int[K][V];  
        nmkSum = new int[M];  
        nktSum = new int[K];  
        phi = new double[K][V];  
        theta = new double[M][K];  
        this.wordIndexMap = new HashMap<String, Integer> ();
          
        //initialize documents index array  
        doc = new int[M][];  
        for(int m = 0; m < M; m++){  
            //Note: mind memory usage for large corpora  
            int N = docSet.docs.get(m).docWords.length;  
            doc[m] = new int[N];  
            for(int n = 0; n < N; n++){  
                doc[m][n] = docSet.docs.get(m).docWords[n];  
            }  
        }  
          
        //initialize topic label z for each word  
        z = new int[M][];  
        for(int m = 0; m < M; m++){  
            int N = docSet.docs.get(m).docWords.length;  
            z[m] = new int[N];  
            for(int n = 0; n < N; n++){  
            	//Random initialization
                int initTopic = (int)(Math.random() * K);// From 0 to K - 1  
                z[m][n] = initTopic;  
                //number of words in doc m assigned to topic initTopic add 1  
                nmk[m][initTopic]++;  
                //number of terms doc[m][n] assigned to topic initTopic add 1  
                nkt[initTopic][doc[m][n]]++;  
                // total number of words assigned to topic initTopic add 1  
                nktSum[initTopic]++;  
            }  
             // total number of words in document m is N  
            nmkSum[m] = N;  
        }  
    }  
  
    public void inferenceModel(Documents docSet) throws IOException {  
        if(iterations < saveStep + beginSaveIters){  
            System.err.println("Error: the number of iterations should be at least " + (saveStep + beginSaveIters));  
            System.exit(1);  
        }  
        for(int i = 0; i < iterations; i++){  
            System.out.println("Iteration " + i);  
            if((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)){  
                //Saving the model  
                System.out.println("Saving model at iteration " + i +" ... ");  
                //Firstly update parameters  
                updateEstimatedParameters();  
                //Secondly print model variables  
                saveIteratedModel(i, docSet);  
            }  
              
            //Use Gibbs Sampling to update z[][]  
            for(int m = 0; m < M; m++){  
                int N = docSet.docs.get(m).docWords.length;  
                for(int n = 0; n < N; n++){  
                    // Sample from p(z_i|z_-i, w)  
                    int newTopic = sampleTopicZ(m, n);  
                    z[m][n] = newTopic;  
                }  
            }  
        }  
    }  
      
    private void updateEstimatedParameters() {  
        for(int k = 0; k < K; k++){  
            for(int t = 0; t < V; t++){  
                phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta);  
            }  
        }  
          
        for(int m = 0; m < M; m++){  
            for(int k = 0; k < K; k++){  
                theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);  
            }  
        }  
    }  
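    /* For reference, updateEstimatedParameters computes the standard
     * posterior mean estimates for LDA, i.e. smoothed relative frequencies:
     *
     *   phi[k][t]   = (nkt[k][t] + beta)  / (nktSum[k] + V * beta)
     *   theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha)
     *
     * phi is the topic-word distribution, theta the doc-topic distribution.
     */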
  
    private int sampleTopicZ(int m, int n) {  
        // Sample from p(z_i|z_-i, w) using the collapsed Gibbs update rule  
          
        //Remove topic label for w_{m,n}  
        int oldTopic = z[m][n];  
        nmk[m][oldTopic]--;  
        nkt[oldTopic][doc[m][n]]--;  
        nmkSum[m]--;  
        nktSum[oldTopic]--;  
          
        //Compute p(z_i = k|z_-i, w)  
        double [] p = new double[K];  
        for(int k = 0; k < K; k++){  
            p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);  
        }  
          
        //Sample a new topic label for w_{m, n} by roulette-wheel selection  
        //Compute the cumulative probabilities of p  
        for(int k = 1; k < K; k++){  
            p[k] += p[k - 1];  
        }  
        double u = Math.random() * p[K - 1]; //p[] is unnormalised  
        int newTopic;  
        for(newTopic = 0; newTopic < K; newTopic++){  
            if(u < p[newTopic]){  
                break;  
            }  
        }  
          
        //Add new topic label for w_{m, n}  
        nmk[m][newTopic]++;  
        nkt[newTopic][doc[m][n]]++;  
        nmkSum[m]++;  
        nktSum[newTopic]++;  
        return newTopic;  
    } 
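    /* sampleTopicZ implements the standard collapsed Gibbs update for LDA.
     * With the current token w_{m,n} = t removed from all counts (the
     * decrements at the top of the method), the sampling distribution is
     *
     *   p(z_i = k | z_-i, w)  proportional to
     *       (nkt[k][t] + beta)  / (nktSum[k] + V * beta)
     *     * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha)
     *
     * which is exactly the expression accumulated into p[k] above.
     */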
    /**
     * For a given text to predict, map each word from its segmentation result
     * to the corresponding document and word index in the training set.
     * @param predictWordSet
     * @return
     */
    public Map<String,String> matchTermIndex(Set<Word> predictWordSet){
    	/**
    	 * key: word content; value: "documentIndex-wordIndex", e.g. "1-2"
    	 */
    	Map<String,String> wordIndexMap = new HashMap<String, String> ();
    	for(Word word : predictWordSet){
    		String content = word.getContent();
    		String indexStr = getTermIndex(content);
    		wordIndexMap.put(content, indexStr);
    	}
    	return wordIndexMap;
    }
    /**
     * For a given word, find the document index and word index of its
     * occurrence in the training set; returns "none" if it is absent.
     * @param content
     * @return
     */
    public String getTermIndex(String content){
    	for(Integer m : docSet.getDocWordsList().keySet()){
    		LinkedList<String> list = docSet.getDocWordsList().get(m);
    		for(int i = 0; i < list.size(); i ++){
    			if(list.get(i).equals(content))
                   return m+"-"+i;
    		}
    	}
    	return "none";
    }
    /**
     * After the LDA model is trained, return the union of the top-topNum word
     * lists of the topics in the given topic index set.
     * @param topicIndexSet
     * @param topNum
     * @return
     */
    public Set<Word> getWordByTopics(Set<Integer> topicIndexSet, int topNum){
    	Set<Word> wordSet = new HashSet<Word> ();
    	for(Integer indexT : topicIndexSet){
    		List<Integer> tWordsIndexArray = new ArrayList<Integer>();   
            for(int j = 0; j < V; j++) 
                tWordsIndexArray.add(j);            
            Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[indexT]));
            for(int t = 0; t < topNum; t++){
            	String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t));
            	Word word = new Word(content);
            	if(SegmentWordsResult.getStopWordsSet().contains(content)||
            			ProcessKeyWords.remove(word) || ProcessKeyWords.isMeaninglessWord(content))
            		continue;
      		    wordSet.add(word);
            }
    	}
    	return wordSet;
    }
    
    public Set<Word> getWordByTopic(Integer topicIndex){
    	  Set<Word> wordSet = new HashSet<Word> ();
    	  List<Integer> tWordsIndexArray = new ArrayList<Integer>();   
	      for(int j = 0; j < V; j++){  
	          tWordsIndexArray.add(j);  
	      }  
	      Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[topicIndex]));    
	      for(int t = 0; t < V; t++){  
	    	  String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t));
	      	  Word word = new Word(content);
	      	  word.setWeight(phi[topicIndex][tWordsIndexArray.get(t)]);
	      	  if(SegmentWordsResult.getStopWordsSet().contains(content)||
	      			ProcessKeyWords.remove(word) || ProcessKeyWords.isMeaninglessWord(content))
	      		  continue;
	      	  if(phi[topicIndex][tWordsIndexArray.get(t)] <= 0.0)
	      		  continue;
	      	wordSet.add(word);
	      }               	
	    	return wordSet;
    }
    
    
    public int predictNewSampleTopic(Document doc){
    	double topicProb[] = new double[K];
    	Map<String,String> wordIndexMap = matchTermIndex(doc.getWordMap().keySet()); 
    	int predict_v = doc.getWordCount();
    	int [][] predict_nkt;//given topic k, count times of term t. K*V 
    	int [] predict_z;//topic label array
    	int [] predict_nk;//for each topic, the number of words in this document assigned to it
    	
    	predict_nkt = new int[K][predict_v];
    	predict_z = new int[predict_v];
    	predict_nk = new int[K];
    	for(int index = 0; index < predict_v; index++){
    		String content = doc.getWordsList().get(index);
    		String indexStr = wordIndexMap.get(content);
    		if(indexStr.indexOf("-") == -1)
    			continue;
    		int m = Integer.valueOf(indexStr.substring(0, indexStr.indexOf("-")));
    		int n = Integer.valueOf(indexStr.substring(indexStr.indexOf("-")+1));
            // Sample from p(z_i|z_-i, w)  
            int newTopic = predictSampleTopicZ(m, n);  
            predict_z[index] = newTopic;  
            predict_nkt[newTopic][index] ++;
            predict_nk[newTopic] ++;
        }
    	for(int k = 0; k < K; k++){  
    		topicProb[k] = (predict_nk[k] + alpha) / (predict_v + K * alpha);  
        }
    	return getTopic(topicProb);     	
    	
    }
    
    public int getTopic(double[] topicProb){
    	int maxIndex = 0;
    	double maxProb = topicProb[0];
    	for(int k = 1; k < K; k ++){
    		if(maxProb < topicProb[k]){
    			maxProb = topicProb[k];
    			maxIndex = k;
    		}
    	}
    	return maxIndex;
    }
    
    public int predictSampleTopicZ(int m, int n){
        // Sample from p(z_i|z_-i, w) using the collapsed Gibbs update rule;
        // unlike sampleTopicZ, the trained counts are left untouched
          
        //Compute p(z_i = k|z_-i, w)  
        double [] p = new double[K];  
        for(int k = 0; k < K; k++){  
            p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);  
        }  
          
        //Sample a new topic label for w_{m, n} by roulette-wheel selection  
        //Compute the cumulative probabilities of p  
        for(int k = 1; k < K; k++){  
            p[k] += p[k - 1];  
        }  
        double u = Math.random() * p[K - 1]; //p[] is unnormalised  
        int newTopic;  
        for(newTopic = 0; newTopic < K; newTopic++){  
            if(u < p[newTopic]){  
                break;  
            }  
        }  
          
        //Return the sampled topic label for w_{m, n}   
        return newTopic;  
    }
  
    public void saveIteratedModel(int iters, Documents docSet) throws IOException {  
        //lda.params lda.phi lda.theta lda.tassign lda.twords  
        //lda.params 
    	String resultPath = "ldaResult/"; 
        String modelName = "lda_" + iters;  
        ArrayList<String> lines = new ArrayList<String>();  
        lines.add("alpha = " + alpha);  
        lines.add("beta = " + beta);  
        lines.add("topicNum = " + K);  
        lines.add("docNum = " + M);  
        lines.add("termNum = " + V);  
        lines.add("iterations = " + iterations);  
        lines.add("saveStep = " + saveStep);  
        lines.add("beginSaveIters = " + beginSaveIters);  
        FileUtil.writeLines(resultPath + modelName + ".params", lines);  
          
        //lda.phi K*V  
        BufferedWriter writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".phi"));         
        for (int i = 0; i < K; i++){  
            for (int j = 0; j < V; j++){  
                writer.write(phi[i][j] + "\t");  
            }  
            writer.write("\n");  
        }  
        writer.close();  
          
        //lda.theta M*K  
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".theta"));  
        for(int i = 0; i < M; i++){  
            for(int j = 0; j < K; j++){  
                writer.write(theta[i][j] + "\t");  
            }  
            writer.write("\n");  
        }  
        writer.close();  
          
        //lda.tassign  
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".tassign"));  
        for(int m = 0; m < M; m++){  
            for(int n = 0; n < doc[m].length; n++){  
                writer.write(doc[m][n] + ":" + z[m][n] + "\t");  
            }  
            writer.write("\n");  
        }  
        writer.close();  
        List<Word> appendwords = new ArrayList<Word> ();  
        //lda.twords phi[][] K*V  
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".twords"));  
        int topNum = 10; //Write the top 10 topic words of each topic  
        for(int i = 0; i < K; i++){  
            List<Integer> tWordsIndexArray = new ArrayList<Integer>();   
            for(int j = 0; j < V; j++){  
                tWordsIndexArray.add(j);  
            }  
            Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[i]));  
            writer.write("topic " + i + "\t:\t");  
            for(int t = 0; t < topNum; t++){  
                writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + " " + phi[i][tWordsIndexArray.get(t)] + "\t");  
                Word word = new Word(docSet.indexToTermMap.get(tWordsIndexArray.get(t)));
                word.setWeight(phi[i][tWordsIndexArray.get(t)]);
                appendwords.add(word);
            }  
            writer.write("\n");  
        }        
        writer.close(); 
        //lda.words
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".words"));
        for(Word word : appendwords){
        	if(word.getContent().trim().equals(""))
        		continue;
        	writer.write(word.getContent()+"\t"+word.getWeight()+"\n");
        }
        writer.close();
    }  
      
    public class TwordsComparable implements Comparator<Integer> {  
          
        public double [] sortProb; // Store probability of each word in topic k  
          
        public TwordsComparable (double[] sortProb){  
            this.sortProb = sortProb;  
        }  
  
        @Override  
        public int compare(Integer o1, Integer o2) {  
            //Sort topic word indices by descending probability in topic k  
            return Double.compare(sortProb[o2], sortProb[o1]);  
        }  
    } 
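    /* On Java 8+, the same descending-by-probability ordering can be written
     * without this helper class; a sketch (prob stands for the phi[i] row
     * being sorted):
     *
     *   final double[] prob = phi[i];
     *   Collections.sort(tWordsIndexArray, (o1, o2) -> Double.compare(prob[o2], prob[o1]));
     */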
    

}
 
public class Documents {
    ArrayList<Document> docs;   
    Map<String, Integer> termToIndexMap;  
    ArrayList<String> indexToTermMap;  
    Map<String,Integer> termCountMap;
    private Map<Word,Integer> wordDocMap;
    private Map<Integer, LinkedList<String>> docWordsList;//key: index of the i-th document; value: its word list, kept so positions line up with doc[m][n] in the LDA model
    
      
    public Documents(){  
        docs = new ArrayList<Document>();  
        termToIndexMap = new HashMap<String, Integer>();  
        indexToTermMap = new ArrayList<String>();  
        termCountMap = new HashMap<String, Integer>();
        this.wordDocMap = new HashMap<Word, Integer> ();
        this.docWordsList = new HashMap<Integer, LinkedList<String>> ();
    }  
      
    public Map<String, Integer> getTermCountMap() {
        return termCountMap;
    }

    public void setTermCountMap(Map<String, Integer> termCountMap) {
        this.termCountMap = termCountMap;
    }

    public Map<Word, Integer> getWordDocMap() {
        return wordDocMap;
    }

    public void setWordDocMap(Map<Word, Integer> wordDocMap) {
        this.wordDocMap = wordDocMap;
    }

    public Map<Integer, LinkedList<String>> getDocWordsList() {
        return docWordsList;
    }

    public void setDocWordsList(Map<Integer, LinkedList<String>> docWordsList) {
        this.docWordsList = docWordsList;
    }


    public void readDocs(String docsPath){ 
        int index = 0;
        for(File docFile : new File(docsPath).listFiles()){ 
            Document doc = new Document(docFile.getAbsolutePath(), termToIndexMap, indexToTermMap, termCountMap);  
            docs.add(doc); 
            //Count, for each word, the number of documents it appears in
            for(Word word : doc.getWordMap().keySet()){
                if(this.wordDocMap.containsKey(word))
                    this.wordDocMap.put(word, this.wordDocMap.get(word) + 1);
                else
                    this.wordDocMap.put(word, 1);
            }
            this.docWordsList.put(index++, doc.getWordsList());
        } 
    }  


}
public class Document {
	private static NLPIRUtil npr = new NLPIRUtil();
	private static Set<String> stopWordsSet = SegmentWordsResult.getStopWordsSet();
	private String docName;  
    int[] docWords; 
    private int wordCount;
    private Map<Word, Integer> wordMap ;
    private LinkedList<String> wordsList;//word contents in index order, so positions line up with docWords
    
    public int getWordCount() {
		return wordCount;
	}

	public void setWordCount(int wordCount) {
		this.wordCount = wordCount;
	}

	public Map<Word, Integer> getWordMap() {
		return wordMap;
	}

	public void setWordMap(Map<Word, Integer> wordMap) {
		this.wordMap = wordMap;
	}

	public LinkedList<String> getWordsList() {
		return wordsList;
	}

	public void setWordsList(LinkedList<String> wordsList) {
		this.wordsList = wordsList;
	}
	
	public Document(String docContent){ 
		this.wordMap = new HashMap<Word, Integer> ();
    	this.wordsList = new LinkedList<String> ();
    	String splitResult = npr.NLPIR_ParagraphProcess(ProcessMessage.dealWithSentence(docContent), 0);
        String[] wordsArray = splitResult.split(" ");
        this.docWords = new int[wordsArray.length];
        //Local term index for this document (word content -> index)
        Map<String, Integer> termIndexMap = new HashMap<String, Integer> ();
        int index = 0;   
        //Transfer word to index
        for(String str : wordsArray){
        	String content = ProcessMessage.dealSpecialString(str);
        	Word word = new Word(content);
			if(ProcessKeyWords.remove(word) || stopWordsSet.contains(content))   	
				continue;
			else if(content.length() <= 1 || RegexMatch.specialMatch(content))
			    continue;
			this.wordCount ++;
			if(!termIndexMap.containsKey(content)){
				int newIndex = termIndexMap.size();  
				termIndexMap.put(content, newIndex);
				wordMap.put(word, 1);
                docWords[index++] = newIndex;
			}else{
				 wordMap.put(word, wordMap.get(word)+1);
				 docWords[index++] = termIndexMap.get(content);
			}
			this.wordsList.add(content);
        }
	}

	public Document(String filePath,Map<String, Integer> termToIndexMap, ArrayList<String> indexToTermMap, Map<String, Integer> termCountMap){  
		this.docName = filePath; 
    	this.wordMap = new HashMap<Word, Integer> ();
    	this.wordsList = new LinkedList<String> ();
        //Read file and initialize word index array    
        String docContent = FileUtil.readContent(docName); 
        String splitResult = npr.NLPIR_ParagraphProcess(docContent, 0);
        String[] wordsArray = splitResult.split(" ");
        this.docWords = new int[wordsArray.length];
        int index = 0;   
        //Transfer word to index
        for(String str : wordsArray){
        	String content = ProcessMessage.dealSpecialString(str);
        	Word word = new Word(content);
			if(ProcessKeyWords.remove(word) || stopWordsSet.contains(content))   	
				continue;
			else if(ProcessKeyWords.isMeaninglessWord(content))
				continue;
			this.wordCount ++;
			if(!termToIndexMap.containsKey(content)){
				int newIndex = termToIndexMap.size();  
				termToIndexMap.put(content, newIndex);  
                indexToTermMap.add(content);  
                termCountMap.put(content, 1); 
                docWords[index++] = newIndex;
			}else{
				 termCountMap.put(content, termCountMap.get(content) + 1); 
				 docWords[index++] = termToIndexMap.get(content);
			}
			this.wordsList.add(content);
		    if(wordMap.containsKey(word))
		    	wordMap.put(word, wordMap.get(word)+1);
		    else
		    	wordMap.put(word, 1);
        }
    }  
      
    public boolean isNoiseWord(String string) {  
        string = string.toLowerCase().trim();  
        // filter URL-like tokens  
        return string.matches(".*www\\..*") || string.matches(".*\\.com.*")  
                || string.matches(".*http:.*");  
    }  
      
}  

The LdaModel class above provides predictNewSampleTopic, which returns the index of the most probable topic for a new sample; LdaGibbsSampling implements the training pipeline for the LDA topic model.
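A minimal usage sketch, assuming the directory layout used in the test method above (training documents under LDATrain/, parameters in source/lda_parameters.txt, test messages in train/train_messages.txt):

LdaGibbsSampling lda = new LdaGibbsSampling();
Set<Word> keyWords = lda.trainAndPredictLDA("LDATrain/", "source/lda_parameters.txt",
        "ldaResult/", "train/train_messages.txt");
// each returned keyword carries a TF-IDF weight computed over the training corpus
for (Word w : keyWords) {
    System.out.println(w.getContent() + "\t" + w.getWeight());
}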
A partial view of the resulting topic-word distributions:

topic 0 : ⒐ 0.0029859442729502916 住宅 0.002257665153592825 制造 0.002257665153592825 行为 0.002257665153592825 收益 0.0015293860342353582 西北 0.0015293860342353582 红星 0.0015293860342353582 轻松 0.0015293860342353582 小商品 0.0015293860342353582 搜房网 0.0015293860342353582
topic 1 : 贵宾 0.0030435749795287848 商城 0.0023012396413832903 太平洋保险 0.0015589043032377958 建设 0.0015589043032377958 储蓄 0.0015589043032377958 周四 0.0015589043032377958 完成 0.0015589043032377958 区内 0.0015589043032377958 王志钢 0.0015589043032377958 872944 0.0015589043032377958
topic 2 : 油田 0.0017282527405768633 雀巢 0.0017282527405768633 金千 0.0017282527405768633 山腰 9.052753448486328E-4 代办 9.052753448486328E-4 洋房 9.052753448486328E-4 月饼 9.052753448486328E-4 三星 9.052753448486328E-4 集成 9.052753448486328E-4 大桥 9.052753448486328E-4
topic 3 : 美容 0.0016053818399086595 疯狂 0.0016053818399086595 获取 0.0016053818399086595 名牌 0.0016053818399086595 风神 0.0016053818399086595 小额 0.0016053818399086595 璀璨 0.0016053818399086595 一千 0.0016053818399086595 专注 0.0016053818399086595 发放 0.0016053818399086595
topic 4 : 焦点 0.002957939635962248 搜狐 0.002236490836367011 房屋 0.002236490836367011 玉兰 0.002236490836367011 短期 0.002236490836367011 理疗 0.002236490836367011 4001080000 0.0015150421531870961 命题 0.0015150421531870961 公开 0.0015150421531870961 乐器 0.0015150421531870961
topic 5 : 实验 0.0023698494769632816 每块 0.0023698494769632816 收费 0.0023698494769632816 博览 0.0016053818399086595 重新 0.0016053818399086595 任意 0.0016053818399086595 借款 0.0016053818399086595 保底 0.0016053818399086595 预期 0.0016053818399086595 初二 0.0016053818399086595
topic 6 : 宗旨 0.0016625761054456234 陈勇军 0.0016625761054456234 拨打 0.0016625761054456234 家人 0.0016625761054456234 工业 0.0016625761054456234 百货店 0.0016625761054456234 实业 0.0016625761054456234 6222024000068818521 0.0016625761054456234 18692297994 0.0016625761054456234 13300 0.0016625761054456234
topic 7 : → 0.005167018622159958 餐厅 0.00298377126455307 保修 0.00298377126455307 英语 0.0022560220677405596 红 0.0022560220677405596 普通 0.0022560220677405596 学习 0.001528272987343371 龙湖 0.001528272987343371 电大 0.001528272987343371 任意 0.001528272987343371
topic 8 : 登陆 0.0025078877806663513 食宿 0.001698891632258892 急需 0.001698891632258892 建行 0.001698891632258892 葡萄酒 0.001698891632258892 新版 0.001698891632258892 富豪 0.001698891632258892 对比 0.001698891632258892 泥工 0.001698891632258892 相信 8.898956584744155E-4
topic 9 : 体育 0.7940398454666138 活动 0.005577780772000551 优惠 0.0038460372015833855 欢迎 0.003806901630014181 银行 0.0032981408294290304 电话 0.003268789267167449 联系 0.0031611667945981026 公司 0.002769812010228634 地址 0.0024860799312591553 】 0.002339322119951248
topic 10 : 年级 0.0023899467196315527 车主 0.0023899467196315527 过程 0.0016189961461350322 华联 0.0016189961461350322 家电 0.0016189961461350322 大业 0.0016189961461350322 时代 0.0016189961461350322 迪赛尼斯 0.0016189961461350322 稀缺 0.0016189961461350322 稳定 0.0016189961461350322
topic 11 : 利率 0.002570267766714096 知名 0.002570267766714096 南湖 0.0017411491135135293 实现 0.0017411491135135293 立秋 0.0017411491135135293 就读 0.0017411491135135293 罗马 0.0017411491135135293 广电局 0.0017411491135135293 独具 0.0017411491135135293 静候 0.0017411491135135293
topic 12 : 哥哥 0.0029536776710301638 家里 0.0029536776710301638 化妆 0.0029536776710301638 名品 0.0022332684602588415 一 0.0022332684602588415 四川 0.0015128592494875193 二手车 0.0015128592494875193 订购 0.0015128592494875193 多种 0.0015128592494875193 潜力 0.0015128592494875193
topic 13 : 建行 0.002435001078993082 开发商 0.0016495168674737215 美容 0.0016495168674737215 奔驰 0.0016495168674737215 比例 0.0016495168674737215 英伦 0.0016495168674737215 开通 0.0016495168674737215 开班 0.0016495168674737215 打开 0.0016495168674737215 英国 0.0016495168674737215
topic 14 : 增值 0.002355444012209773 [验] 0.002355444012209773 公开 0.0015956234419718385 打印机 0.0015956234419718385 家中 0.0015956234419718385 宾馆 0.0015956234419718385 12000 0.0015956234419718385 渠道 0.0015956234419718385 租赁 0.0015956234419718385 无效 0.0015956234419718385
topic 15 : 自由 0.0024857670068740845 巴拉巴 0.0024857670068740845 丰 0.0024857670068740845 朝阳 0.001683906652033329 家人 0.001683906652033329 84725588 0.001683906652033329 老弟 0.001683906652033329 商住 0.001683906652033329 县委 0.001683906652033329 德国 8.820463554002345E-4
topic 16 : ¥10亿 0.002975110663101077 楼下 0.002249473938718438 感恩 0.002249473938718438 独栋 0.002249473938718438 前来 0.0015238370979204774 手机 0.0015238370979204774 申请 0.0015238370979204774 乐 0.0015238370979204774 考点 0.0015238370979204774 3008300 0.0015238370979204774
topic 17 : 批发 0.00239548715762794 总监 0.0016227493761107326 车子 0.0016227493761107326 饭店 0.0016227493761107326 伙伴 0.0016227493761107326 直属 0.0016227493761107326 事后 0.0016227493761107326 翰林 0.0016227493761107326 专题片 0.0016227493761107326 装修 8.500116528011858E-4
topic 18 : 期待 0.0024758405052125454 价 0.0016771822702139616 你好 0.0016771822702139616 决定 0.0016771822702139616 助剂 0.0016771822702139616 人员 0.0016771822702139616 雄伟 0.0016771822702139616 只用 0.0016771822702139616 享受 8.785240934230387E-4 四川 8.785240934230387E-4
topic 19 : 房价 0.003103474387899041 底价 0.0023465293925255537 湖南 0.0015895843971520662 凡 0.0015895843971520662 送礼 0.0015895843971520662 恒大 0.0015895843971520662 一生 0.0015895843971520662 代言人 0.0015895843971520662 专车 0.0015895843971520662 大唐 0.0015895843971520662
topic 20 : 企业主 0.0023483068216592073 讲师 0.0023483068216592073 6222021001055293358 0.0023483068216592073 首发 0.0015907884808257222 认购 0.0015907884808257222 请问 0.0015907884808257222 发布 0.0015907884808257222 中午 0.0015907884808257222 开幕 0.0015907884808257222 ⒍ 0.0015907884808257222
topic 21 : 重新 0.002323663793504238 帮忙 0.002323663793504238 85654475 0.002323663793504238 宾 0.002323663793504238 中国 0.0015740948729217052 学历 0.0015740948729217052 " 0.0015740948729217052 温州 0.0015740948729217052 好久 0.0015740948729217052 钢板 0.0015740948729217052
topic 22 : 可口 0.0024103878531605005 形象 0.0024103878531605005 减轻 0.0024103878531605005 高层 0.0016328433994203806 爸爸 0.0016328433994203806 基金 0.0016328433994203806 营业额 0.0016328433994203806 意大利 0.0016328433994203806 正常 0.0016328433994203806 吉智 0.0016328433994203806
topic 23 : 关系 0.0024738647043704987 经营 0.0016758438432589173 美容 0.0016758438432589173 梦想 0.0016758438432589173 喷漆 0.0016758438432589173 肌肤 0.0016758438432589173 刘汉琳 0.0016758438432589173 索菲 0.0016758438432589173 依依 0.0016758438432589173 欢迎 8.778230403549969E-4
topic 24 : 考试 0.0016652129124850035 上班 0.0016652129124850035 金条 0.0016652129124850035 宝 0.0016652129124850035 澳门 0.0016652129124850035 粘贴 0.0016652129124850035 收缩 0.0016652129124850035 18800574923 0.0016652129124850035 豪华 8.722544298507273E-4 老师 8.722544298507273E-4
topic 25 : 长期 0.0030594731215387583 开发区 0.0023132602218538523 低价 0.0023132602218538523 ⑥ 0.0023132602218538523 转告 0.0023132602218538523 新 0.0015670472057536244 得到 0.0015670472057536244 [通] 0.0015670472057536244 融资 0.0015670472057536244 万科 0.0015670472057536244
topic 26 : 开发区 0.002339445985853672 石油 0.0015847859904170036 宁波 0.0015847859904170036 更换 0.0015847859904170036 不用 0.0015847859904170036 会议 0.0015847859904170036 初三 0.0015847859904170036 汽车站 0.0015847859904170036 抽空 0.0015847859904170036 实用 0.0015847859904170036
topic 27 : 代办 0.0016745076281949878 代表 0.0016745076281949878 女性 0.0016745076281949878 13825139678 0.0016745076281949878 承担 0.0016745076281949878 影响力 0.0016745076281949878 13934141989 0.0016745076281949878 槐花 0.0016745076281949878 沐 0.0016745076281949878 过敏 0.0016745076281949878
topic 28 : 婚礼 0.00862991251051426 海尔 0.002210969338193536 电影 0.002210969338193536 小乔 0.002210969338193536 15953174009 0.002210969338193536 茶店 0.002210969338193536 7627292. 0.002210969338193536 15985917304 0.002210969338193536 新余 0.001497753313742578 资料 0.001497753313742578
topic 29 : 【 0.021667908877134323 你 0.015670640394091606 您好 0.01555958017706871 光临 0.014560035429894924 尊敬 0.014337914064526558 现在 0.013005186803638935 】 0.012338823638856411 享受 0.010783976875245571 信用 0.009451250545680523 详情 0.007896402850747108
topic 30 : 西吉 0.0024778195656836033 封顶 0.0016785229090601206 押金 0.0016785229090601206 海外 0.0016785229090601206 澜庭 0.0016785229090601206 账户 0.0016785229090601206 原因 0.0016785229090601206 6222021001036927348 0.0016785229090601206 欧莱雅 0.0016785229090601206 推荐 8.792263106442988E-4
 