把卡方计算的代码贴出来,供大家重现实验结果。
下面这段程序对 LROne project 的数据样本进行处理。输入特征为 0-1 特征,每行一个样本,格式为:类别 特征索引1 特征索引2 ...(每行只列出该样本中出现的特征的索引)。
/**
 * Compute a per-class chi-square style score for every binary (0/1) feature
 * of a two-class sample file and write the scores to a text file.
 *
 * Input format, one sample per line:  <class> <feature> <feature> ...
 * Every feature token is parsed as an integer and used directly as a
 * zero-based index into the per-feature arrays; each occurrence counts 1.0.
 * Lines whose class label is missing or outside [0, 2) are skipped, and so
 * are feature tokens that are not numeric or fall outside [0, iFeaNum).
 *
 * Output format, one line per feature:  <index + 1> <score class 0> <score class 1>
 *
 * With A = count(class, feature), B = count(feature) - A,
 * C = count(class) - A and D = total - A - B - C, the score is
 *     (A*D - B*C)^2 / ((A+B) * (C+D) + DBL_MIN).
 * This leaves out the total count and the (A+C)*(B+D) factor of the textbook
 * chi-square statistic; for a fixed class those factors do not depend on the
 * feature.  Note that count(class) and total are accumulated per feature
 * occurrence, not per sample line.
 *
 * @param sFileSample     path of the sample file to read
 * @param iFeaNum         number of features (size of the index space), > 0
 * @param iSampleNum      must be > 0; not otherwise used by this function
 * @param sFileChiSquare  path of the score file to write
 * @return true if the arguments were valid, both files could be opened and
 *         the scores were written without a stream error; false otherwise
 */
bool CalChiSquare::CalculateChiSquareBinaryFrmFile (const char * sFileSample,
        int iFeaNum, int iSampleNum, const char * sFileChiSquare)
{
    const int iClassNum = 2;

    if (!sFileSample || !sFileChiSquare || iFeaNum <= 0 || iSampleNum <= 0)
        return false;

    // Open the input first so that a missing sample file does not truncate
    // an existing output file.
    std::ifstream in (sFileSample);
    if (!in)
    {
        std::cerr << "Can not open the files" << std::endl;
        return false;
    }
    std::ofstream out (sFileChiSquare);
    if (!out)
    {
        std::cerr << "Can not open the files" << std::endl;
        return false;
    }

    // Zero-initialised counters: [class][feature], per-feature, per-class.
    double ** ppClassFea = new double * [iClassNum];
    for (int i = 0; i < iClassNum; i++)
        ppClassFea[i] = new double [iFeaNum] ();
    double * pFeaPrior = new double [iFeaNum] ();
    double * pClassPrior = new double [iClassNum] ();

    // Accumulate the occurrence counts from the sample file.
    std::string sLine;
    double dTotalVal = 0.0;
    while (std::getline (in, sLine))
    {
        std::istringstream isLine (sLine);
        int iClassIndex = -1;
        // A negative or unparsable label must not be used as an array index.
        if (!(isLine >> iClassIndex) || iClassIndex < 0 || iClassIndex >= iClassNum)
            continue;

        std::string sTmp;
        while (isLine >> sTmp)
        {
            const char * pTok = sTmp.c_str();
            char * pEnd = 0;
            const long lFeaIndex = strtol (pTok, &pEnd, 10);
            // Like atoi, a numeric prefix is enough; a token with no digits
            // at all is ignored instead of being counted as feature 0.
            if (pEnd == pTok)
                continue;
            // Out-of-range indices would write outside the arrays.
            if (lFeaIndex < 0 || lFeaIndex >= iFeaNum)
                continue;

            const int iFeaIndex = static_cast<int> (lFeaIndex);
            ppClassFea [iClassIndex][iFeaIndex] += 1.0;
            pClassPrior [iClassIndex] += 1.0;
            pFeaPrior [iFeaIndex] += 1.0;
            dTotalVal += 1.0;
        }
    }

    // Compute the score of every (feature, class) cell and dump it.
    for (int j = 0; j < iFeaNum; j++)
    {
        out << j+1 << " ";
        for (int i = 0; i < iClassNum; i++)
        {
            const double dA = ppClassFea[i][j];
            const double dB = pFeaPrior[j] - dA;
            const double dC = pClassPrior[i] - dA;
            const double dD = dTotalVal - dA - dB - dC;

            const double dDiff = dA * dD - dB * dC;
            // DBL_MIN keeps the division defined when a marginal is zero.
            const double dDenominator = (dA + dB) * (dC + dD) + DBL_MIN;
            out << (dDiff * dDiff) / dDenominator << " ";
        }
        out << "\n";
    }
    out.flush ();
    const bool bWritten = !out.fail ();

    // clean up
    for (int i = 0; i < iClassNum; i++)
        delete [] ppClassFea[i];
    delete [] ppClassFea;
    delete [] pFeaPrior;
    delete [] pClassPrior;

    return bWritten;
}
卡方计算方法参考很久以前的博文 《 【朴素贝叶斯】实战朴素贝叶斯_代码实现_特征选择1 》。
下面这段程序对 LRTwo project 的输入样本进行处理。输入特征为实值特征(计算时取绝对值),每行一个样本,格式为:类别 特征索引1:特征值1 特征索引2:特征值2 ...
/**
 * Compute a per-class chi-square style score for every real-valued feature
 * of a multi-class sample file and write the scores to a text file.
 *
 * Input format, one sample per line:  <class> <index>:<value> <index>:<value> ...
 * The index is used directly as a zero-based index into the per-feature
 * arrays and the absolute value of <value> is accumulated as its weight.
 * Lines whose class label is missing or outside [0, iClassNum) are skipped.
 * Tokens are skipped when they have no "<digits>:" prefix, when the index is
 * outside [0, iFeaNum), or when the value is NaN or infinite.
 *
 * Output format, one line per feature:  <index + 1> <score class 0> ... <score class K-1>
 *
 * With A = weight(class, feature), B = weight(feature) - A,
 * C = weight(class) - A and D = total - A - B - C, the score is
 *     (A*D - B*C)^2 / ((A+B) * (C+D) + DBL_MIN).
 * This leaves out the total and the (A+C)*(B+D) factor of the textbook
 * chi-square statistic; for a fixed class those factors do not depend on the
 * feature.
 *
 * @param sFileSample     path of the sample file to read
 * @param iClassNum       number of classes, > 0
 * @param iFeaNum         number of features (size of the index space), > 0
 * @param iSampleNum      must be > 0; not otherwise used by this function
 * @param sFileChiSquare  path of the score file to write
 * @return true if the arguments were valid, both files could be opened and
 *         the scores were written without a stream error; false otherwise
 */
bool CalChiSquare::CalculateChiSquareFrmFile (const char * sFileSample,
        int iClassNum, int iFeaNum, int iSampleNum, const char * sFileChiSquare)
{
    if (!sFileSample || !sFileChiSquare
        || iClassNum <= 0 || iFeaNum <= 0 || iSampleNum <= 0)
        return false;

    // Open the input first so that a missing sample file does not truncate
    // an existing output file.
    std::ifstream in (sFileSample);
    if (!in)
    {
        std::cerr << "Can not open the files" << std::endl;
        return false;
    }
    std::ofstream out (sFileChiSquare);
    if (!out)
    {
        std::cerr << "Can not open the files" << std::endl;
        return false;
    }

    // Zero-initialised accumulators: [class][feature], per-feature, per-class.
    double ** ppClassFea = new double * [iClassNum];
    for (int i = 0; i < iClassNum; i++)
        ppClassFea[i] = new double [iFeaNum] ();
    double * pFeaPrior = new double [iFeaNum] ();
    double * pClassPrior = new double [iClassNum] ();

    // Accumulate the feature weights from the sample file.
    std::string sLine;
    double dTotalVal = 0.0;
    while (std::getline (in, sLine))
    {
        std::istringstream isLine (sLine);
        int iClassIndex = -1;
        // A negative or unparsable label must not be used as an array index.
        if (!(isLine >> iClassIndex) || iClassIndex < 0 || iClassIndex >= iClassNum)
            continue;

        std::string sTmp;
        while (isLine >> sTmp)
        {
            const char * pTok = sTmp.c_str();
            char * pEnd = 0;
            const long lFeaIndex = strtol (pTok, &pEnd, 10);
            // Require "<digits>:"; without the colon there is no value part
            // and the index would otherwise be read a second time as the value.
            if (pEnd == pTok || *pEnd != ':')
                continue;
            // Out-of-range indices would write outside the arrays.
            if (lFeaIndex < 0 || lFeaIndex >= iFeaNum)
                continue;

            // fabs, not abs: abs may bind to the int overload and truncate.
            const double dFeaVal = fabs (atof (pEnd + 1));
            // Rejects NaN and infinity, which would poison every total.
            if (!(dFeaVal <= DBL_MAX))
                continue;

            const int iFeaIndex = static_cast<int> (lFeaIndex);
            ppClassFea [iClassIndex][iFeaIndex] += dFeaVal;
            pClassPrior [iClassIndex] += dFeaVal;
            pFeaPrior [iFeaIndex] += dFeaVal;
            dTotalVal += dFeaVal;
        }
    }

    // Compute the score of every (feature, class) cell and dump it.
    for (int j = 0; j < iFeaNum; j++)
    {
        out << j+1 << " ";
        for (int i = 0; i < iClassNum; i++)
        {
            const double dA = ppClassFea[i][j];
            const double dB = pFeaPrior[j] - dA;
            const double dC = pClassPrior[i] - dA;
            const double dD = dTotalVal - dA - dB - dC;

            const double dDiff = dA * dD - dB * dC;
            // DBL_MIN keeps the division defined when a marginal is zero.
            const double dDenominator = (dA + dB) * (dC + dD) + DBL_MIN;
            out << (dDiff * dDiff) / dDenominator << " ";
        }
        out << "\n";
    }
    out.flush ();
    const bool bWritten = !out.fail ();

    // clean up
    for (int i = 0; i < iClassNum; i++)
        delete [] ppClassFea[i];
    delete [] ppClassFea;
    delete [] pFeaPrior;
    delete [] pClassPrior;

    return bWritten;
}
完。
转载请注明出处:http://blog.csdn.net/xceman1997/article/details/19348575