说一下Train函数的实现。
在上文中,我提到过,朴素贝叶斯也有两种模型:贝努力模型和多项式模型。小弟第一次实现朴素贝叶斯,就老老实实按照基本原理做了一个贝努力模型;多项式模型也不难,变通一下就行。不废话了,直接上代码了,有点长,不过很容易看懂:
bool NaiveBayes::Train (const char * sFileSample, int iClassNum, int iFeaTypeNum, string & sSegmenter, int iFeaExtractNum, const char * sFileModel, bool bCompactModel) { // 防御性代码 if (iClassNum <= 0 || iFeaTypeNum <= 0 || iFeaExtractNum <= 0) return false; ifstream in (sFileSample, ios_base::binary); ofstream out (sFileModel); if (!in || !out) { cerr << "Can not open the file" << endl; return false; } // 这些都是临时数据结构,用来临时存储模型参数,特征选择需要的参数等等 // 1. the temp data structure for model parameters // 1.1 the total number of document in training samples int iTotalDocNum = 0; // 1.2 the prior probability of class, temparaly it store the doc number in this class double * pClassPriorProb = new double [iClassNum]; memset (pClassPriorProb, 0, iClassNum*sizeof(double)); // 1.3 the prior probability of feature type, temparaly it stores the doc number in this feature (这个主要用于特征选择,bayes模型本身并不需要这个参数) double * pFeaItemPriorProb = new double [iFeaTypeNum]; memset (pFeaItemPriorProb, 0, iFeaTypeNum*sizeof(double)); // 1.4 the chi-square value that feature falls into class, temparaly it stores the doc number for this class and feature (可以看到,特征选择算法主要用卡方选择) double ** ppChiMatrix = new double * [iClassNum]; for (int i=0; i<iClassNum; i++) { ppChiMatrix[i] = new double [iFeaTypeNum]; memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double)); } // 1.5 the post-probability for class and feature double ** ppPProbMatrix = new double * [iClassNum]; for (int i=0; i<iClassNum; i++) { ppPProbMatrix[i] = new double [iFeaTypeNum]; memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double)); } // 1.6 for the feature selection (表示哪些特征被选中了) int * pFeaSelected = new int [iFeaTypeNum]; memset (pFeaSelected, 0, iFeaTypeNum*sizeof(int)); // 2. iterate the training samples and fill count into the temp data structure string sLine; int i = 0; while (getline (in, sLine)) { // show some information on screen if (0 == i%10000) cout << i << "\n"; i++; // 2.1 the total number of doc iTotalDocNum++; // 2.2 split the sample into class and feature items string::size_type iSeg = sLine.find_first_of (sSegmenter); string sTmp = sLine.substr (0, iSeg); int iClassId = atoi (sTmp.c_str()); if (iClassId >= iClassNum) continue; pClassPriorProb [iClassId]++; // 2.3 count the rest feature items iSeg += sTmp.length(); sTmp = sLine.substr (iSeg); istringstream isLine (sTmp); string sTmpItem; while (isLine >> sTmpItem) { int iFeaItemId = atoi (sTmpItem.c_str()); if (iFeaItemId >= iFeaTypeNum) continue; // add the count pFeaItemPriorProb [iFeaItemId]++; ppChiMatrix [iClassId][iFeaItemId]++; } } // 3. calculate the model parameters // 3.1 the chi-square value as well as the post-probabilty for (int i=0; i<iClassNum; i++) { for (int j=0; j<iFeaTypeNum; j++) { double dA = ppChiMatrix[i][j]; double dB = pFeaItemPriorProb[j] - dA; // currently pFeaItemPriorProb[i] == sum_i (ppChiMatrix[i][j]) double dC = pClassPriorProb [i] - dA; // currently pClassPriorProb[i] == sum_j (ppChiMatrix[i][j]) double dD = (double)iTotalDocNum - dA - dB - dC; // the chi value double dNumerator = dA * dD; dNumerator -= dB * dC; dNumerator = pow (dNumerator, 2.0); double dDenominator = dA + dB; dDenominator *= (dC + dD); dDenominator += DBL_MIN; // for smoothing ppChiMatrix[i][j] = dNumerator / dDenominator; // the post-probability: p(feature|class) ppPProbMatrix[i][j] = dA / pClassPriorProb [i]; } } // 3.2 the prior probability of class for (int i=0; i<iClassNum; i++) pClassPriorProb [i] /= iTotalDocNum; // 3.3 the prior probability of feature for (int i=0; i<iFeaTypeNum; i++) pFeaItemPriorProb [i] /= iTotalDocNum; // 4. feature selection (这个函数下一篇文章再详细讲) FeaSelByChiSquare (ppChiMatrix, ppPProbMatrix, iClassNum, iFeaTypeNum, iFeaExtractNum, pFeaSelected); // 5. dump the model into txt file if (bCompactModel) // output the parameters only for predicting { // 5.1 the prior probability of class out << iClassNum << endl; for (int i=0; i<iClassNum; i++) { out << pClassPriorProb [i] << "\n"; } // 5.2 the actual selected feature type number int iActualFeaNum = 0; for (int j=0; j<iFeaTypeNum; j++) { if (1 == pFeaSelected[j]) iActualFeaNum ++; } out << iActualFeaNum << endl; // 5.3 the post probability for (int i=0; i<iClassNum; i++) { for (int j=0; j<iFeaTypeNum; j++) { if (1 == pFeaSelected[j]) { out << j << ":" << ppPProbMatrix[i][j] << "\n"; } } } } else // output the full information { // 5.1 the total number of document out << iTotalDocNum << endl; // 5.2 the prior probability of class out << iClassNum << endl; for (int i=0; i<iClassNum; i++) // classindex:priorprob { out << i << ":" << pClassPriorProb [i] << "\n"; } // 5.3 the prior probability of feature type: this is NO used in bayes model, record this for more info // and whether this feature is selected or not by any class out << iFeaTypeNum << "\n"; for (int i=0; i<iFeaTypeNum; i++) // featureId:priorprob:selected or not { out << i << ":" << pFeaItemPriorProb[i] << ":" << pFeaSelected << "\n"; } // 5.4 the chi-square value for class-feature pair for (int i=0; i<iClassNum; i++) { for (int j=0; j<iFeaTypeNum; j++) { out << ppChiMatrix[i][j] << "\n"; } } // 5.5 the post probability for (int i=0; i<iClassNum; i++) { for (int j=0; j<iFeaTypeNum; j++) { out << ppPProbMatrix[i][j] << "\n"; } } } // last, release the memory delete [] pClassPriorProb; delete [] pFeaItemPriorProb; for (int i=0; i<iClassNum; i++) { delete [] ppChiMatrix[i]; } delete [] ppChiMatrix; for (int i=0; i<iClassNum; i++) { delete [] ppPProbMatrix[i]; } delete [] ppPProbMatrix; delete [] pFeaSelected; return true; }
在这个函数的主要功能就是统计,一方面统计后验概率,另一方面统计卡方特征选择所需要的参数,他们分别存储在ppChiMaxtrix和ppProbMatrix。这两个都是二维数组,数组的维度由类别数目和特征类型总数决定。数据结构的设计风格都是c的风格,自己搭建数据,并负责释放;并没有用c++ stl中的复杂数据结构,如:set、map等。中间经过特征选择,最后模型存储在文本文件当中。
接下来,讲特征选择算法。