卡方选择(chi-square)能用到logistic regression当中么?(三)

把卡方计算的代码贴出来,供大家重现实验结果。


对于LROne project的数据样本进行处理,输入特征为0-1特征,样本输入格式为:分类 特征1 特征2 ...

/**
 * Compute a chi-square style score for every (class, feature) pair of a
 * two-class sample file with binary (0/1) features, and dump the scores to a
 * text file.
 *
 * Input format, one sample per line:  <class> <feaIndex> <feaIndex> ...
 * Each listed feature index counts as one occurrence of that feature in the
 * sample's class. Feature indices are used directly as 0-based array offsets.
 *
 * Output format, one line per feature:  <feaIndex+1> <score class 0> <score class 1>
 *
 * With A = count(class, feature), B = count(feature) - A,
 * C = count(class) - A and D = total - A - B - C, the score written is
 *     (A*D - B*C)^2 / ((A+B)*(C+D) + DBL_MIN).
 * Here count(class) and total are sums of feature occurrences, not numbers of
 * sample lines.
 *
 * Lines whose class label is missing or outside [0, 2) and feature tokens whose
 * index is outside [0, iFeaNum) are skipped instead of being written out of
 * bounds.
 *
 * @param sFileSample     path of the sample file to read
 * @param iFeaNum         number of features; valid indices are 0 .. iFeaNum-1
 * @param iSampleNum      only validated to be positive, otherwise unused
 * @param sFileChiSquare  path of the score file to write
 * @return false on invalid arguments or if either file cannot be opened,
 *         true otherwise
 */
bool CalChiSquare::CalculateChiSquareBinaryFrmFile (const char * sFileSample, 
		 int iFeaNum, int iSampleNum, const char * sFileChiSquare)
{
	if (!sFileSample || !sFileChiSquare)
		return false;
	if (iFeaNum <= 0 || iSampleNum <= 0)
		return false;

	const int iClassNum = 2;

	std::ifstream in (sFileSample);
	std::ofstream out (sFileChiSquare);
	if (!in || !out)
	{
		std::cerr << "Can not open the files" << std::endl;
		return false;
	}

	// co-occurrence counts per (class, feature) plus the marginal counts;
	// the trailing () value-initializes every element to 0.0
	double ** ppClassFea = new double * [iClassNum];
	for (int i = 0; i < iClassNum; i++)
		ppClassFea[i] = new double [iFeaNum] ();
	double * pFeaPrior = new double [iFeaNum] ();
	double * pClassPrior = new double [iClassNum] ();

	// load the samples from txt file
	std::string sLine;
	double dTotalVal = 0.0;
	while (std::getline (in, sLine))
	{
		std::istringstream isLine (sLine);
		int iClassIndex = -1;
		// reject unparsable labels and labels outside [0, iClassNum);
		// a negative label would otherwise index before the arrays
		if (!(isLine >> iClassIndex) || iClassIndex < 0 || iClassIndex >= iClassNum)
			continue;
		std::string sTmp;
		while (isLine >> sTmp)
		{
			const int iFeaIndex = atoi (sTmp.c_str());
			// guard against indices that would overrun the feature arrays
			if (iFeaIndex < 0 || iFeaIndex >= iFeaNum)
				continue;
			ppClassFea [iClassIndex][iFeaIndex] += 1.0;
			pClassPrior [iClassIndex] += 1.0;
			pFeaPrior [iFeaIndex] += 1.0;
			dTotalVal += 1.0;
		}
	}

	// calculate the chi-square, overwriting the counts in place
	// (safe: each cell is read into dA before it is overwritten and is not
	// read again by any other cell)
	for (int i = 0; i < iClassNum; i++)
	{
		for (int j = 0; j < iFeaNum; j++)
		{
			const double dA = ppClassFea[i][j];
			const double dB = pFeaPrior[j] - dA;
			const double dC = pClassPrior[i] - dA;
			const double dD = dTotalVal - dA - dB - dC;

			const double dDiff = dA * dD - dB * dC;
			// DBL_MIN keeps the division defined when a marginal is zero;
			// the numerator is 0 in that case, so the score becomes 0
			const double dDenominator = (dA + dB) * (dC + dD) + DBL_MIN;
			ppClassFea[i][j] = (dDiff * dDiff) / dDenominator;
		}
	}

	// dump the chi-square: feature id is written 1-based
	for (int i = 0; i < iFeaNum; i++)
	{
		out << i+1 << " ";
		for (int j = 0; j < iClassNum; j++)
		{
			out << ppClassFea[j][i] << " ";
		}
		out << "\n";
	}

	// clean up
	for (int i = 0; i < iClassNum; i++)
		delete [] ppClassFea[i];
	delete [] ppClassFea;
	delete [] pFeaPrior;
	delete [] pClassPrior;

	return true;
}

卡方计算方法参考很久以前的博文《【朴素贝叶斯】实战朴素贝叶斯_代码实现_特征选择1》。

下面这段程序,是对 LRTwo project 的输入样本进行处理,输入特征为实值特征,样本格式:类别 特征索引1:特征值1 特征索引2:特征值2......

/**
 * Compute a chi-square style score for every (class, feature) pair of a
 * multi-class sample file with real-valued features, and dump the scores to a
 * text file.
 *
 * Input format, one sample per line:
 *     <class> <feaIndex>:<feaValue> <feaIndex>:<feaValue> ...
 * The absolute value of each feature value is accumulated as the weight of
 * that feature in the sample's class. Feature indices are used directly as
 * 0-based array offsets.
 *
 * Output format, one line per feature:
 *     <feaIndex+1> <score class 0> ... <score class iClassNum-1>
 *
 * With A = weight(class, feature), B = weight(feature) - A,
 * C = weight(class) - A and D = total - A - B - C, the score written is
 *     (A*D - B*C)^2 / ((A+B)*(C+D) + DBL_MIN).
 *
 * Skipped silently: lines whose class label is missing or outside
 * [0, iClassNum), tokens without a ':' separator, and tokens whose feature
 * index is outside [0, iFeaNum).
 *
 * @param sFileSample     path of the sample file to read
 * @param iClassNum       number of classes; valid labels are 0 .. iClassNum-1
 * @param iFeaNum         number of features; valid indices are 0 .. iFeaNum-1
 * @param iSampleNum      only validated to be positive, otherwise unused
 * @param sFileChiSquare  path of the score file to write
 * @return false on invalid arguments or if either file cannot be opened,
 *         true otherwise
 */
bool CalChiSquare::CalculateChiSquareFrmFile (const char * sFileSample, 
		int iClassNum, int iFeaNum, int iSampleNum, const char * sFileChiSquare)
{
	if (!sFileSample || !sFileChiSquare)
		return false;
	if (iClassNum <= 0 || iFeaNum <= 0 || iSampleNum <= 0)
		return false;

	std::ifstream in (sFileSample);
	std::ofstream out (sFileChiSquare);
	if (!in || !out)
	{
		std::cerr << "Can not open the files" << std::endl;
		return false;
	}

	// accumulated weights per (class, feature) plus the marginal weights;
	// the trailing () value-initializes every element to 0.0
	double ** ppClassFea = new double * [iClassNum];
	for (int i = 0; i < iClassNum; i++)
		ppClassFea[i] = new double [iFeaNum] ();
	double * pFeaPrior = new double [iFeaNum] ();
	double * pClassPrior = new double [iClassNum] ();

	// load the samples from txt file
	std::string sLine;
	double dTotalVal = 0.0;
	while (std::getline (in, sLine))
	{
		std::istringstream isLine (sLine);
		int iClassIndex = -1;
		// reject unparsable labels and labels outside [0, iClassNum);
		// a negative label would otherwise index before the arrays
		if (!(isLine >> iClassIndex) || iClassIndex < 0 || iClassIndex >= iClassNum)
			continue;
		std::string sTmp;
		while (isLine >> sTmp)
		{
			// a token without ':' is malformed; without this check
			// npos + 1 wraps to 0 and the whole token is read as the value
			const std::string::size_type iColon = sTmp.find (':');
			if (iColon == std::string::npos)
				continue;
			const int iFeaIndex = atoi (sTmp.substr (0, iColon).c_str());
			if (iFeaIndex < 0 || iFeaIndex >= iFeaNum)
				continue;
			double dFeaVal = atof (sTmp.substr (iColon + 1).c_str());
			// explicit sign flip instead of abs(): an unqualified abs() may
			// bind to the integer overload and truncate the value
			if (dFeaVal < 0.0)
				dFeaVal = -dFeaVal;
			ppClassFea [iClassIndex][iFeaIndex] += dFeaVal;
			pClassPrior [iClassIndex] += dFeaVal;
			pFeaPrior [iFeaIndex] += dFeaVal;
			dTotalVal += dFeaVal;
		}
	}

	// calculate the chi-square, overwriting the weights in place
	// (safe: each cell is read into dA before it is overwritten and is not
	// read again by any other cell)
	for (int i = 0; i < iClassNum; i++)
	{
		for (int j = 0; j < iFeaNum; j++)
		{
			const double dA = ppClassFea[i][j];
			const double dB = pFeaPrior[j] - dA;
			const double dC = pClassPrior[i] - dA;
			const double dD = dTotalVal - dA - dB - dC;

			const double dDiff = dA * dD - dB * dC;
			// DBL_MIN keeps the division defined when a marginal is zero;
			// the numerator is 0 in that case, so the score becomes 0
			const double dDenominator = (dA + dB) * (dC + dD) + DBL_MIN;
			ppClassFea[i][j] = (dDiff * dDiff) / dDenominator;
		}
	}

	// dump the chi-square: feature id is written 1-based
	for (int i = 0; i < iFeaNum; i++)
	{
		out << i+1 << " ";
		for (int j = 0; j < iClassNum; j++)
		{
			out << ppClassFea[j][i] << " ";
		}
		out << "\n";
	}

	// clean up
	for (int i = 0; i < iClassNum; i++)
		delete [] ppClassFea[i];
	delete [] ppClassFea;
	delete [] pFeaPrior;
	delete [] pClassPrior;

	return true;
}

完。

转载请注明出处:http://blog.csdn.net/xceman1997/article/details/19348575

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值