Bayes分类算法

// Two-class ("yes" / "no") Bayes classifier.
//
// Usage order: init_data() loads the attribute declarations and the training
// tuples, statistics() derives the per-attribute figures from them, and
// calculate() then labels a single test tuple.
class Bayes {
private:
    int CiD_num = 0;  // number of training tuples labelled "yes"
    int D_num = 0;    // total number of training tuples

    std::vector<double> P_Ci_vector;  // class priors: [0] for "yes", [1] for "no"

    std::vector<std::vector<std::string>> datas;  // training tuples, one token list per tuple

    // Counters for one value a_i of a discrete attribute A.
    struct attribute_item
    {
        std::string ai_name;  // name of the value a_i
        int Dai_num = 0;      // tuples whose attribute A equals a_i
        int aici_num = 0;     // tuples with A == a_i that are labelled "yes"
        double P_xk_Ci = 0;   // aici_num / Dai_num, filled in by statistics()
    };

    // Description of one attribute A (one column of the data).
    struct attribute
    {
        std::string Ai_name;       // name of the attribute
        bool isStraggling = true;  // true: discrete attribute, false: continuous ("real")
        double P_xk_Ci = 0;        // P(xk|Ci) slot for a continuous attribute
        // value name -> counters for that value (discrete attributes only)
        std::map<std::string, attribute_item> attributes_info;

        // Per-class mean of a continuous attribute.
        class average
        {
        public:
            double yes_average = 0;  // mean over the "yes" tuples
            double no_average = 0;   // mean over the other tuples
        } average;

        // Per-class spread of a continuous attribute, as produced by
        // calculate_variance().
        class variance
        {
        public:
            double yes_variance = 0;
            double no_variance = 0;
        } variance;
    };

    std::vector<attribute> attr_vector;  // attributes in declaration order; the last one is the class label

public:
    // Reads the attribute declarations and the training tuples from a file.
    void init_data( std::string filename );
    // Computes the counts, probabilities, means and spreads used by calculate().
    void statistics( );
    // Predicts the class of one tuple; returns "Yes" or "No".
    std::string calculate( std::vector<std::string> tuple );
    friend void print( std::vector<std::vector<std::string>> data, int n, int m );
    // Returns a copy of the loaded training tuples.
    std::vector<std::vector<std::string>> get_datas( ) const
    {
        return datas;
    }
};
/*
 * function calculate: predict the class of one test tuple.
 *
 * parameter tuple: attribute values of the test tuple, in the order in which
 *                  the attributes were declared (a trailing class label is
 *                  ignored).
 * return: "Yes" when the score accumulated for the yes class is strictly
 *         greater than the score for the no class, otherwise "No".
 *
 * A discrete value that was never recorded contributes 0 to the yes score and
 * 1 to the no score; the lookup does not add the value to the model.
 * Attributes for which the tuple has no value are skipped.
 */
std::string Bayes::calculate( std::vector<std::string> tuple )
{
    double P_yes = 1, P_no = 1;

    // The last attribute is the class label, not a feature. Guarding the
    // empty case keeps size() - 1 from wrapping around.
    const std::size_t feature_count = attr_vector.empty( ) ? 0 : attr_vector.size( ) - 1;
    // Never index past the end of a tuple that is shorter than the model.
    const std::size_t usable = tuple.size( ) < feature_count ? tuple.size( ) : feature_count;

    for (std::size_t i = 0; i < usable; ++i)
    {
        const auto &attr = attr_vector[i];
        if (attr.isStraggling)
        {
            // find() instead of operator[]: a lookup must not insert into the model.
            const auto found = attr.attributes_info.find( tuple[i] );
            const double p = found != attr.attributes_info.end( ) ? found->second.P_xk_Ci : 0.0;
            P_yes *= p;
            P_no *= 1 - p;
        }
        else
        {
            const double x = string2double( tuple[i] );
            P_yes *= calculate_P( x, attr.average.yes_average, attr.variance.yes_variance );
            P_no *= calculate_P( x, attr.average.no_average, attr.variance.no_variance );
        }
    }
    return P_yes > P_no ? "Yes" : "No";
}



/*
 * function statistics: derive everything calculate() needs from the loaded
 * training tuples:
 *   - CiD_num / D_num and the two class priors in P_Ci_vector,
 *   - for every value of a discrete attribute: how many tuples carry it, how
 *     many of those are labelled "yes", and the resulting ratio P_xk_Ci,
 *   - for every continuous attribute: per-class mean and spread.
 *
 * The function starts from a clean slate, so calling it again recomputes the
 * figures instead of accumulating them. Tuples that are too short to contain
 * the class label column are ignored.
 */
void Bayes::statistics( )
{
    D_num = 0;
    CiD_num = 0;
    P_Ci_vector.clear( );

    // Index of the class label column; -1 when no attribute has been declared.
    const int attr_num = static_cast<int>( attr_vector.size( ) ) - 1;

    // A row can only be counted when it reaches the label column.
    const auto usable = [attr_num]( const std::vector<std::string> &row )
    {
        return attr_num >= 0 && row.size( ) > static_cast<std::size_t>( attr_num );
    };
    const auto is_yes = [attr_num]( const std::vector<std::string> &row )
    {
        return row[attr_num] == "yes";
    };

    // Forget the counts of a previous run.
    for (int j = 0; j < attr_num; ++j)
    {
        for (auto &entry : attr_vector[j].attributes_info)
        {
            entry.second.Dai_num = 0;
            entry.second.aici_num = 0;
        }
    }

    // Discrete attributes: count tuples per class and per attribute value.
    for (const auto &row : datas)
    {
        if (!usable( row ))
            continue;
        ++D_num;
        const bool yes = is_yes( row );
        if (yes)
            ++CiD_num;
        for (int j = 0; j < attr_num; ++j)
        {
            if (!attr_vector[j].isStraggling)
                continue;
            auto &item = attr_vector[j].attributes_info[row[j]];
            ++item.Dai_num;
            if (yes)
                ++item.aici_num;
        }
    }

    // Discrete attributes: Laplace correction. It is needed as soon as one
    // value is seen only with "yes" or only with the other class, because its
    // ratio would otherwise be exactly 0 or 1.
    bool needs_correction = false;
    for (int j = 0; j < attr_num && !needs_correction; ++j)
    {
        if (!attr_vector[j].isStraggling)
            continue;
        for (const auto &entry : attr_vector[j].attributes_info)
        {
            if (entry.second.aici_num == 0 || entry.second.aici_num == entry.second.Dai_num)
            {
                needs_correction = true;
                break;
            }
        }
    }
    if (needs_correction)
    {
        // NOTE(review): the class totals are bumped once per attribute value,
        // which also shifts the priors below - confirm that this is intended.
        for (int j = 0; j < attr_num; ++j)
        {
            for (auto &entry : attr_vector[j].attributes_info)
            {
                entry.second.aici_num++;
                entry.second.Dai_num += 2;
                CiD_num++;
                D_num += 2;
            }
        }
    }

    // Class priors; without any usable tuple there is nothing to divide by.
    const double prior_yes = D_num > 0 ? CiD_num * 1.0 / ( 1.0 * D_num ) : 0.0;
    P_Ci_vector.push_back( prior_yes );
    P_Ci_vector.push_back( 1 - prior_yes );

    for (int j = 0; j < attr_num; ++j)
    {
        if (!attr_vector[j].isStraggling)
            continue;
        for (auto &entry : attr_vector[j].attributes_info)
        {
            entry.second.P_xk_Ci = entry.second.aici_num * 1.0 / ( entry.second.Dai_num * 1.0 );
        }
    }

    // Continuous attributes are assumed to follow a normal distribution, so
    // only the per-class mean and spread are stored.
    for (int j = 0; j < attr_num; ++j)
    {
        if (attr_vector[j].isStraggling)
            continue;
        std::vector<double> yes_value;
        std::vector<double> no_value;
        for (const auto &row : datas)
        {
            if (!usable( row ))
                continue;
            ( is_yes( row ) ? yes_value : no_value ).push_back( string2double( row[j] ) );
        }
        attr_vector[j].average.yes_average = calculate_average( yes_value );
        attr_vector[j].average.no_average = calculate_average( no_value );
        attr_vector[j].variance.yes_variance = calculate_variance( yes_value );
        attr_vector[j].variance.no_variance = calculate_variance( no_value );
    }
}


/*
*function  init_data;从文件读取数据
包括属性信息、所有数据元组
*parameter  string filename ;文件名
*/

void Bayes::init_data( string filename )
{
	int attribute_num = 0;
	string data_line;
	vector<string>works;
	ifstream read( filename );
	getline( read, data_line );
	getline( read, data_line );

	while (true)
	{
		getline( read, data_line );
		works = get_word( data_line );
		if (works.empty( ))
		{
			break;
		}
		if (works[0] == "attribute")
		{
			attribute attr_temp;
			attribute_item attr_item_temp;
			attr_temp.Ai_name = works[1];
			for (int i = 2; i < works.size( ); ++i)
			{
				if (works[i] == "real")
				{
					attr_temp.isStraggling = false;
					break;
				}
				attr_temp.isStraggling = true;
				attr_item_temp.ai_name = works[i];
				attr_temp.attributes_info[works[i]] = attr_item_temp;
			}
			attr_vector.push_back( attr_temp );
		}
	}
	works.clear( );
	while (getline( read, data_line ) && !data_line.empty( ))
	{
		works = get_word( data_line );
		if (works[0] == "data")
		{
			continue;
		}
		datas.push_back( works );
	}
}
/*
 * function string2double: convert a string to a double.
 *
 * parameter str: text to convert
 * return: the number parsed from the start of str, or 0.0 when str does not
 *         start with a number (including an empty or blank string)
 */
double string2double( const std::string str )
{
    std::istringstream iss( str );
    // Initialised on purpose: when the stream fails before any parsing starts
    // (empty or whitespace-only input) the extraction does not touch num.
    double num = 0.0;
    iss >> num;
    return num;
}

/*
 * function calculate_average: arithmetic mean of a list of numbers.
 *
 * parameter value: the numbers to average
 * return: sum / count, or 0.0 for an empty list (instead of the NaN that
 *         0 / 0 would produce)
 */
double calculate_average( std::vector<double> value )
{
    if (value.empty( ))
    {
        return 0.0;
    }
    double sum = 0.0;
    for (const double v : value)
    {
        sum += v;
    }
    return sum / static_cast<double>( value.size( ) );
}

/*
 * function calculate_variance: spread of a list of numbers around its mean.
 *
 * Despite the name, the result is the square root of the population variance
 * (the squared deviations are divided by the element count, then the root is
 * taken), i.e. the standard deviation.
 *
 * parameter value: the numbers to examine
 * return: the standard deviation, or 0.0 for an empty list
 */
double calculate_variance( std::vector<double> value )
{
    if (value.empty( ))
    {
        return 0.0;
    }
    const double mean = calculate_average( value );
    double squared_sum = 0.0;
    for (const double v : value)
    {
        const double deviation = mean - v;
        squared_sum += deviation * deviation;
    }
    return std::sqrt( squared_sum / static_cast<double>( value.size( ) ) );
}

/*
 * function calculate_P: value of the normal probability density at x.
 *
 * parameter x: point at which the density is evaluated
 * parameter average: mean of the distribution
 * parameter variance: spread of the distribution; the formula divides by it
 *                     once and by its square in the exponent, so it is used
 *                     as the standard deviation
 * return: 1 / (sqrt(2*pi) * s) * exp(-(x - average)^2 / (2 * s^2)), where s is
 *         the spread after clamping to a tiny positive minimum
 *
 * The clamp keeps the result finite when all training values of a class were
 * identical (spread 0); without it the division yields inf or NaN.
 */
double calculate_P( double x, double average, double variance )
{
    // 1 / sqrt(2 * pi), written out so the code does not depend on the
    // non-standard M_PI macro.
    const double inv_sqrt_2pi = 0.3989422804014327;
    const double min_sigma = 1e-9;
    // The comparison is false for NaN as well, so NaN also falls back to the minimum.
    const double sigma = variance > min_sigma ? variance : min_sigma;
    const double z = ( x - average ) / sigma;
    return inv_sqrt_2pi / sigma * std::exp( -0.5 * z * z );
}
/*
 * function get_word: split a string into its words.
 *
 * Words are the maximal runs of characters that are not one of the separator
 * characters space, ',', '@', '{' and '}'. Separators are dropped and never
 * produce empty words.
 *
 * parameter str: the string to split
 * return: the words in order of appearance; empty when str contains no word
 */
std::vector<std::string> get_word( std::string str )
{
    const char step[] = " ,@{}";  // separator characters
    std::vector<std::string> words;

    // Searching the string directly avoids writing through c_str() and the
    // platform-specific strtok_s.
    std::string::size_type begin = str.find_first_not_of( step );
    while (begin != std::string::npos)
    {
        const std::string::size_type end = str.find_first_of( step, begin );
        // When end is npos, substr() simply takes the rest of the string.
        words.push_back( str.substr( begin, end - begin ) );
        begin = end == std::string::npos ? end : str.find_first_not_of( step, end );
    }
    return words;
}

#include "Bayes.h"


// Trains the classifier on "weather.txt", prints the training tuples, then
// prints the predicted class for every line of "test.txt".
int main( )
{
    const std::string filename1 = "weather.txt";

    Bayes bayes;
    bayes.init_data( filename1 );
    bayes.statistics( );
    std::cout << "trainning data :\n";

    // Print what was actually loaded instead of assuming a 14 x 5 table, so a
    // different or missing data file cannot cause an out-of-range read.
    const std::vector<std::vector<std::string>> training = bayes.get_datas( );
    const int rows = static_cast<int>( training.size( ) );
    // Use the shortest row as the column count so every printed cell exists.
    int cols = 0;
    for (std::size_t i = 0; i < training.size( ); ++i)
    {
        const int width = static_cast<int>( training[i].size( ) );
        if (i == 0 || width < cols)
            cols = width;
    }
    print( training, rows, cols );

    const std::string filename2 = "test.txt";
    std::ifstream read( filename2 );
    std::string data_line;
    std::cout << "test data :\n";
    while (std::getline( read, data_line ))
    {
        std::cout << "\t" << data_line << "\t\tanswer: ";
        std::cout << bayes.calculate( get_word( data_line ) ) << std::endl;
    }

    // Keep the console window open until the user enters something.
    std::string str;
    std::cout << "press any key to exit!" << std::endl;
    std::cin >> str;
    return 0;
}

/*
 * function print: write the first n rows and first m columns of a table to
 * standard output, one row per line, each line starting with a tab and each
 * cell followed by " , ".
 *
 * parameter data: the table, one vector of cells per row
 * parameter n: number of rows requested
 * parameter m: number of columns requested
 *
 * Requests that exceed the table are clamped: missing rows are not printed
 * and a short row ends after its last cell.
 */
void print( std::vector<std::vector<std::string>> data, int n, int m )
{
    for (int i = 0; i < n && static_cast<std::size_t>( i ) < data.size( ); ++i)
    {
        const std::vector<std::string> &row = data[i];
        std::cout << "\t";
        for (int j = 0; j < m && static_cast<std::size_t>( j ) < row.size( ); ++j)
        {
            std::cout << row[j] << " , ";
        }
        std::cout << '\n';
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值