C++ 朴素贝叶斯模型(Naive Bayesian Model,NBM)实现, 西瓜实验数据集 基于周志华老师机器学习
版权声明:本文为博主原创文章,未经博主允许不得转载。
标注
学习朴素贝叶斯算法需要先了解一些基本知识,比如全概率公式和贝叶斯公式。这些内容大学课程中基本都学过,这里不再赘述。
数据样本
编号 | 色泽 | 根蒂 | 敲声 | 纹理 | 脐部 | 触感 | 密度 | 含糖率 | 好瓜 |
---|---|---|---|---|---|---|---|---|---|
1 | 2 | 2 | 2 | 1 | 3 | 1 | 0.697 | 0.46 | 1 |
2 | 3 | 2 | 3 | 1 | 3 | 1 | 0.744 | 0.376 | 1 |
3 | 3 | 2 | 2 | 1 | 3 | 1 | 0.634 | 0.264 | 1 |
4 | 2 | 2 | 3 | 1 | 3 | 1 | 0.608 | 0.318 | 1 |
5 | 1 | 2 | 2 | 1 | 3 | 1 | 0.556 | 0.215 | 1 |
6 | 2 | 1 | 2 | 1 | 2 | 2 | 0.403 | 0.237 | 1 |
7 | 3 | 1 | 2 | 2 | 2 | 2 | 0.481 | 0.149 | 1 |
8 | 3 | 1 | 2 | 1 | 2 | 1 | 0.437 | 0.211 | 1 |
9 | 3 | 1 | 3 | 2 | 2 | 1 | 0.666 | 0.091 | 0 |
10 | 2 | 3 | 1 | 1 | 1 | 2 | 0.243 | 0.267 | 0 |
11 | 1 | 3 | 1 | 3 | 1 | 1 | 0.245 | 0.057 | 0 |
12 | 1 | 2 | 2 | 3 | 1 | 2 | 0.343 | 0.099 | 0 |
13 | 2 | 1 | 2 | 2 | 3 | 1 | 0.639 | 0.161 | 0 |
14 | 1 | 1 | 3 | 2 | 3 | 1 | 0.657 | 0.198 | 0 |
15 | 3 | 1 | 2 | 1 | 2 | 2 | 0.36 | 0.37 | 0 |
16 | 1 | 2 | 2 | 3 | 1 | 1 | 0.593 | 0.042 | 0 |
17 | 2 | 2 | 3 | 2 | 2 | 1 | 0.719 | 0.103 | 0 |
表格含义
色泽 1-3代表 浅白 青绿 乌黑
根蒂 1-3代表 稍蜷 蜷缩 硬挺
敲声 1-3代表 清脆 浊响 沉闷
纹理 1-3代表 清晰 稍糊 模糊
脐部 1-3代表 平坦 稍凹 凹陷
触感 1-2代表 硬滑 软粘
好瓜 1代表 是 0 代表 不是
算法定义
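朴素贝叶斯算法定义如下:假设在给定类别的条件下各属性相互独立,对样本 $x=(x_1,\dots,x_d)$,分类器选择使下式最大的类别:

$$h_{nb}(x)=\arg\max_{c\in\mathcal{Y}} P(c)\prod_{i=1}^{d}P(x_i\mid c)$$

其中先验概率 $P(c)=\dfrac{|D_c|}{|D|}$;对离散属性,$P(x_i\mid c)=\dfrac{|D_{c,x_i}|}{|D_c|}$;对连续属性(密度、含糖率),假设其服从正态分布:

$$p(x_i\mid c)=\frac{1}{\sqrt{2\pi}\,\sigma_{c,i}}\exp\left(-\frac{(x_i-\mu_{c,i})^2}{2\sigma_{c,i}^2}\right)$$

其中 $\mu_{c,i}$ 和 $\sigma_{c,i}$ 分别是第 $c$ 类样本在第 $i$ 个属性上的均值和标准差。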
代码块
//bayesian.h
#pragma once
// Training data dimensions: M sample rows, N columns per row
// (8 attribute columns followed by the class label in the last column).
#define M 17
#define N 9
/*
 * Column layout of A and the integer encoding of the discrete attributes:
 *   column 0  colour:        1-3 = light white, green, dark black
 *   column 1  root:          1-3 = slightly curled, curled, stiff
 *   column 2  knock sound:   1-3 = crisp, muffled, dull
 *   column 3  texture:       1-3 = clear, slightly blurry, blurry
 *   column 4  umbilicus:     1-3 = flat, slightly hollow, hollow
 *   column 5  touch:         1-2 = hard and smooth, soft and sticky
 *   column 6  density        (continuous)
 *   column 7  sugar content  (continuous)
 *   column 8  good melon:    1 = yes, 0 = no
 */
double A[M][N]= {
{2,2,2,1,3,1,0.697,0.460,1},// 1
{3,2,3,1,3,1,0.744,0.376,1},// 2
{3,2,2,1,3,1,0.634,0.264,1},// 3
{2,2,3,1,3,1,0.608,0.318,1},// 4
{1,2,2,1,3,1,0.556,0.215,1},// 5
{2,1,2,1,2,2,0.403,0.237,1},// 6
{3,1,2,2,2,2,0.481,0.149,1},// 7
{3,1,2,1,2,1,0.437,0.211,1},// 8
{3,1,3,2,2,1,0.666,0.091,0},// 9
{2,3,1,1,1,2,0.243,0.267,0},// 10
{1,3,1,3,1,1,0.245,0.057,0},// 11
{1,2,2,3,1,2,0.343,0.099,0},// 12
{2,1,2,2,3,1,0.639,0.161,0},// 13
{1,1,3,2,3,1,0.657,0.198,0},// 14
{3,1,2,1,2,2,0.360,0.370,0},// 15
{1,2,2,3,1,1,0.593,0.042,0},// 16
{2,2,3,2,2,1,0.719,0.103,0} // 17
};
// One entry of the conditional probability table p(x1 | y) for attribute x1
// (column 0 of A, colour).
struct Px1
{
double x1; // attribute value, taken from column 0 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x1y; // estimated p(x1 | y), filled in by calP()
};
// One entry of the conditional probability table p(x2 | y) for attribute x2
// (column 1 of A, root).
struct Px2
{
double x2; // attribute value, taken from column 1 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x2y; // estimated p(x2 | y), filled in by calP()
};
// One entry of the conditional probability table p(x3 | y) for attribute x3
// (column 2 of A, knock sound).
struct Px3
{
double x3; // attribute value, taken from column 2 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x3y; // estimated p(x3 | y), filled in by calP()
};
// One entry of the conditional probability table p(x4 | y) for attribute x4
// (column 3 of A, texture).
struct Px4
{
double x4; // attribute value, taken from column 3 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x4y; // estimated p(x4 | y), filled in by calP()
};
// One entry of the conditional probability table p(x5 | y) for attribute x5
// (column 4 of A, umbilicus).
struct Px5
{
double x5; // attribute value, taken from column 4 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x5y; // estimated p(x5 | y), filled in by calP()
};
// One entry of the conditional probability table p(x6 | y) for attribute x6
// (column 5 of A, touch).
struct Px6
{
double x6; // attribute value, taken from column 5 of A
double y; // class label: 1 = good melon, 0 = bad melon
double p_x6y; // estimated p(x6 | y), filled in by calP()
};
// Holder for the class-conditional density of the continuous attribute x7
// (column 6 of A, density).
struct Px7
{
double x7; // not written by the code visible in this file
double y; // not written by the code visible in this file
double p_x7y; // Gaussian density p(x7 | y); set by m_MeansAndAver / m_w_MeansAndAver
};
// Holder for the class-conditional density of the continuous attribute x8
// (column 7 of A, sugar content).
struct Px8
{
double x8; // not written by the code visible in this file
double y; // not written by the code visible in this file
double p_x8y; // Gaussian density p(x8 | y); set by h_MeansAndAver / h_w_MeansAndAver
};
// struct MeAVa
// {
// double mean;
// double stdev;
// };
// Class priors written by calP(): p[0] = P(good melon = 1), p[1] = P(good melon = 0).
double p[2];
// Conditional probability tables for the six discrete attributes, filled by calP().
// Six slots each: room for up to 3 attribute values x 2 class labels.
Px1 px1[6];
Px2 px2[6];
Px3 px3[6];
Px4 px4[6];
Px5 px5[6];
Px6 px6[6];
// Gaussian densities of the two continuous attributes for the sample being
// classified: index 0 is written by the good-melon functions, index 1 by the
// bad-melon functions.
Px7 px7[2];
Px8 px8[2];
//bayesian .cpp
#include "bayesian.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <set>
#include <vector>
//#include <math.h>
using namespace std;
// Gaussian class-conditional density p(x | class) of one continuous attribute.
//
// The mean and the sample standard deviation (n - 1 denominator) are estimated
// from the rows of A whose class label (column 8) equals `label`; the normal
// probability density with those parameters is then evaluated at `x`.
// Rows are selected by their label rather than by their position in A, so the
// result does not depend on the training rows being sorted by class.
//
// column: index of the attribute in A (6 = density, 7 = sugar content).
// label:  class to condition on (1 = good melon, 0 = bad melon).
// x:      attribute value of the sample being classified.
// Returns the density, or 0.0 when fewer than two rows carry the label or the
// attribute has no spread within the class (no Gaussian can be fitted then).
static double gaussianClassDensity(int column, double label, double x)
{
    const int labelColumn = 8; // last column of A holds the class label

    // Mean of the attribute over the rows of the requested class.
    double sum = 0.0;
    int count = 0;
    for (int i = 0; i < M; i++)
    {
        if (A[i][labelColumn] == label)
        {
            sum += A[i][column];
            count++;
        }
    }
    if (count < 2)
    {
        return 0.0; // a sample standard deviation needs at least two rows
    }
    const double mean = sum / count;

    // Unbiased sample variance (n - 1 denominator).
    double squaredDeviation = 0.0;
    for (int i = 0; i < M; i++)
    {
        if (A[i][labelColumn] == label)
        {
            const double deviation = A[i][column] - mean;
            squaredDeviation += deviation * deviation;
        }
    }
    const double variance = squaredDeviation / (count - 1);
    if (variance <= 0.0)
    {
        return 0.0; // degenerate class: avoid dividing by zero
    }

    // Normal pdf: exp(-(x - mean)^2 / (2 * var)) / sqrt(2 * pi * var).
    // pi is computed exactly instead of being truncated to 3.14.
    const double pi = std::acos(-1.0);
    const double offset = x - mean;
    return std::exp(-(offset * offset) / (2.0 * variance)) / std::sqrt(2.0 * pi * variance);
}

// Density of `x` (attribute "density", column 6) given a good melon.
// The result is also stored in px7[0].p_x7y.
double m_MeansAndAver(double x)
{
    const double density = gaussianClassDensity(6, 1.0, x);
    px7[0].p_x7y = density;
    return density;
}

// Density of `x` (attribute "density", column 6) given a bad melon.
// The result is also stored in px7[1].p_x7y.
double m_w_MeansAndAver(double x)
{
    const double density = gaussianClassDensity(6, 0.0, x);
    px7[1].p_x7y = density;
    return density;
}

// Density of `x` (attribute "sugar content", column 7) given a good melon.
// The result is also stored in px8[0].p_x8y.
double h_MeansAndAver(double x)
{
    const double density = gaussianClassDensity(7, 1.0, x);
    px8[0].p_x8y = density;
    return density;
}

// Density of `x` (attribute "sugar content", column 7) given a bad melon.
// The result is also stored in px8[1].p_x8y.
double h_w_MeansAndAver(double x)
{
    const double density = gaussianClassDensity(7, 0.0, x);
    px8[1].p_x8y = density;
    return density;
}
//计算先验概率和条件概率
void calP()
{
//计算先验
//double p[2];
int i, j, k;
multiset<double> m_x1, m_x2,m_x3, m_x4,m_x5, m_x6,m_x7, m_x8, m_y;//多重集容器
multiset<double>::iterator pos1;
set<double> x1, x2,x3, x4,x5, x6,x7, x8, y;//集合容器
set<double>::iterator pos2, pos3;
//运用多重集容器和集合容器
for(i = 0; i < M; i++)
{
m_x1.insert(A[i][0]);
m_x2.insert(A[i][1]);
m_x3.insert(A[i][2]);
m_x4.insert(A[i][3]);
m_x5.insert(A[i][4]);
m_x6.insert(A[i][5]);
m_x7.insert(A[i][6]);
m_x8.insert(A[i][7]);
m_y.insert(A[i][8]);
x1.insert(A[i][0]);
x2.insert(A[i][1]);
x3.insert(A[i][2]);
x4.insert(A[i][3]);
x5.insert(A[i][4]);
x6.insert(A[i][5]);
x7.insert(A[i][6]);
x8.insert(A[i][7]);
y.insert(A[i][8]);
}
p[0] = m_y.count(1) / (double)M; //p(Y = 1)
p[1] = m_y.count(0) / (double)M; //p(Y = 2)
cout << endl << "************先验***********" << endl;
//p[0]代表好瓜所占的比例 p[1]代表坏瓜所占的比例
cout << "p(Y = 1) = " << p[0] << endl;
cout << "p(Y = 0) = " << p[1] << endl;
//计算条件概率
cout << endl;
cout << "*********条件概率********" << endl;
// int px1_num = 3 * 2;
// int px2_num = 3 * 2;
//p(x1 | y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x1.begin(); pos3 != x1.end(); pos3++)
{
px1[j].y = *pos2;
px1[j].x1 = *pos3;
int count_x1y = 0;
for(k = 0; k < M; k++)
{
if(A[k][0] == px1[j].x1 && A[k][8] == px1[j].y)
count_x1y++;
}
px1[j].p_x1y = count_x1y / (double)m_y.count(px1[j].y);//计算p(x1 | y)的概率
j++;
}
}
cout << "p(x1 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px1[j].x1 << " " << px1[j].y << " " << px1[j].p_x1y << endl;
}
//p(x2|y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x2.begin(); pos3 != x2.end(); pos3++)
{
px2[j].y = *pos2;
px2[j].x2 = *pos3;
int count_x2y = 0;
for(k = 0; k < M; k++)
{
if(A[k][1] == px2[j].x2 && A[k][8] == px2[j].y)
count_x2y++;
}
px2[j].p_x2y = count_x2y / (double)m_y.count(px2[j].y);//计算p(x2 | y)的概率
j++;
}
}
cout << "p(x2 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px2[j].x2 << " " << px2[j].y << " " << px2[j].p_x2y << endl;
}
//p(x3|y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x3.begin(); pos3 != x3.end(); pos3++)
{
px3[j].y = *pos2;
px3[j].x3 = *pos3;
int count_x3y = 0;
for(k = 0; k < M; k++)
{
if(A[k][2] == px3[j].x3 && A[k][8] == px3[j].y)
count_x3y++;
}
px3[j].p_x3y = count_x3y / (double)m_y.count(px3[j].y);//计算p(x2 | y)的概率
j++;
}
}
cout << "p(x3 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px3[j].x3 << " " << px3[j].y << " " << px3[j].p_x3y << endl;
}
//p(x4|y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x4.begin(); pos3 != x4.end(); pos3++)
{
px4[j].y = *pos2;
px4[j].x4 = *pos3;
int count_x4y = 0;
for(k = 0; k < M; k++)
{
if(A[k][3] == px4[j].x4 && A[k][8] == px4[j].y)
count_x4y++;
}
px4[j].p_x4y = count_x4y / (double)m_y.count(px4[j].y);//计算p(x4 | y)的概率
j++;
}
}
cout << "p(x4 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px4[j].x4 << " " << px4[j].y << " " << px4[j].p_x4y << endl;
}
//p(x5|y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x5.begin(); pos3 != x5.end(); pos3++)
{
px5[j].y = *pos2;
px5[j].x5 = *pos3;
int count_x5y = 0;
for(k = 0; k < M; k++)
{
if(A[k][4] == px5[j].x5 && A[k][8] == px5[j].y)
count_x5y++;
}
px5[j].p_x5y = count_x5y / (double)m_y.count(px5[j].y);//计算p(x5 | y)的概率
j++;
}
}
cout << "p(x5 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px5[j].x5 << " " << px5[j].y << " " << px5[j].p_x5y << endl;
}
//p(x6|y)概率
j=0;
for(pos2 = y.begin(); pos2 != y.end(); pos2++)
{
for(pos3 = x6.begin(); pos3 != x6.end(); pos3++)
{
px6[j].y = *pos2;
px6[j].x6 = *pos3;
int count_x6y = 0;
for(k = 0; k < M; k++)
{
if(A[k][5] == px6[j].x6 && A[k][8] == px6[j].y)
count_x6y++;
}
px6[j].p_x6y = count_x6y / (double)m_y.count(px6[j].y);//计算p(x6 | y)的概率
j++;
}
}
cout << "p(x6 | y):" << endl;
for(j = 0; j < 6; j++)
{
cout << px6[j].x6 << " " << px6[j].y << " " << px6[j].p_x6y << endl;
}
//p(x7|y)概率
}
// Interactive driver: prints the training data, trains the classifier via
// calP(), reads one sample (six integer-coded discrete attributes followed by
// density and sugar content) from standard input and prints the score of each
// class together with the predicted class.
//
// Returns 0 on success and 1 when the input cannot be parsed or a discrete
// attribute is outside the range announced in the prompt.
int main()
{
    // Echo the training data.
    std::cout << "***********训练数据************" << std::endl;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            std::cout << " " << A[i][j];
        }
        std::cout << std::endl;
    }

    calP(); // priors and conditional probability tables

    // Input legend.
    std::cout << "##########################< 提 示 >##########################" << std::endl;
    std::cout << std::setw(10) << "色泽" << std::setw(10) << "1-3代表" << std::setw(10) << "浅白" << std::setw(10) << "青绿" << std::setw(10) << "乌黑" << std::endl;
    std::cout << std::setw(10) << "根蒂" << std::setw(10) << "1-3代表" << std::setw(10) << "稍蜷" << std::setw(10) << "蜷缩" << std::setw(10) << "硬挺" << std::endl;
    std::cout << std::setw(10) << "敲声" << std::setw(10) << "1-3代表" << std::setw(10) << "清脆" << std::setw(10) << "浊响" << std::setw(10) << "沉闷" << std::endl;
    std::cout << std::setw(10) << "纹理" << std::setw(10) << "1-3代表" << std::setw(10) << "清晰" << std::setw(10) << "稍糊" << std::setw(10) << "模糊" << std::endl;
    std::cout << std::setw(10) << "脐部" << std::setw(10) << "1-3代表" << std::setw(10) << "平坦" << std::setw(10) << "稍凹" << std::setw(10) << "凹陷" << std::endl;
    std::cout << std::setw(10) << "触感" << std::setw(10) << "1-2代表" << std::setw(10) << "硬滑" << std::setw(10) << "软粘" << std::endl;
    std::cout << " 密度以及含糖量 0<Xi<1 " << std::endl;
    std::cout << " 请按照以上范围输入" << std::endl;
    std::cout << "###############################################################" << std::endl;

    std::cout << std::endl << "##########################< 预 测 >##########################" << std::endl;
    std::cout << std::endl << "Input:";

    int s_x1 = 0, s_x2 = 0, s_x3 = 0, s_x4 = 0, s_x5 = 0, s_x6 = 0;
    double s_x7 = 0.0, s_x8 = 0.0;
    if (!(std::cin >> s_x1 >> s_x2 >> s_x3 >> s_x4 >> s_x5 >> s_x6 >> s_x7 >> s_x8))
    {
        std::cout << "输入无效:请输入 6 个整数属性和 2 个小数属性。" << std::endl;
        system("pause");
        return 1;
    }

    // Reject discrete codes outside the announced ranges; such a value has no
    // row in the probability tables.
    const bool discreteInRange =
        s_x1 >= 1 && s_x1 <= 3 &&
        s_x2 >= 1 && s_x2 <= 3 &&
        s_x3 >= 1 && s_x3 <= 3 &&
        s_x4 >= 1 && s_x4 <= 3 &&
        s_x5 >= 1 && s_x5 <= 3 &&
        s_x6 >= 1 && s_x6 <= 2;
    if (!discreteInRange)
    {
        std::cout << "输入超出范围:离散属性取值应为 1-3(触感为 1-2)。" << std::endl;
        system("pause");
        return 1;
    }

    // Gaussian densities of the continuous attributes; index 0 = good melon,
    // index 1 = bad melon (same indexing as p[]).
    double densityGivenClass[2];
    double sugarGivenClass[2];
    std::cout << "##########<连续属性X7与x8的 p(x7|y)、<p(x8|y)计算结果>##########" << std::endl << std::endl;
    densityGivenClass[0] = m_MeansAndAver(s_x7);
    std::cout << "好瓜密度其概率为:" << densityGivenClass[0] << std::endl;
    densityGivenClass[1] = m_w_MeansAndAver(s_x7);
    std::cout << "坏瓜密度的概率" << densityGivenClass[1] << std::endl;
    sugarGivenClass[0] = h_MeansAndAver(s_x8);
    std::cout << "好瓜含糖率其概率为:" << sugarGivenClass[0] << std::endl;
    sugarGivenClass[1] = h_w_MeansAndAver(s_x8);
    std::cout << "坏瓜含糖率其概率为:" << sugarGivenClass[1] << std::endl << std::endl;

    // Unnormalised posterior score of each class:
    // prior * product of the eight class-conditional likelihoods.
    double result[2];
    for (int i = 0; i < 2; i++)
    {
        const int label = 1 - i; // i = 0 -> good melon (1), i = 1 -> bad melon (0)

        // A value without a table row for this class keeps likelihood 0 instead
        // of being read uninitialised.
        double s_px_1 = 0.0, s_px_2 = 0.0, s_px_3 = 0.0;
        double s_px_4 = 0.0, s_px_5 = 0.0, s_px_6 = 0.0;
        for (int j = 0; j < 6; j++)
        {
            if (s_x1 == px1[j].x1 && px1[j].y == label)
                s_px_1 = px1[j].p_x1y;
            if (s_x2 == px2[j].x2 && px2[j].y == label)
                s_px_2 = px2[j].p_x2y;
            if (s_x3 == px3[j].x3 && px3[j].y == label)
                s_px_3 = px3[j].p_x3y;
            if (s_x4 == px4[j].x4 && px4[j].y == label)
                s_px_4 = px4[j].p_x4y;
            if (s_x5 == px5[j].x5 && px5[j].y == label)
                s_px_5 = px5[j].p_x5y;
            if (s_x6 == px6[j].x6 && px6[j].y == label)
                s_px_6 = px6[j].p_x6y;
        }
        result[i] = p[i] * s_px_1 * s_px_2 * s_px_3 * s_px_4 * s_px_5 * s_px_6
                    * densityGivenClass[i] * sugarGivenClass[i];
    }

    std::cout << "###########################<分类结果>###########################" << std::endl;
    std::cout << std::endl << "all results:";
    std::cout << "可能为好瓜的概率" << result[0] << " " << "可能为坏瓜的概率" << result[1] << std::endl << std::endl;
    std::cout << "###########################<预测结果>###########################" << std::endl << std::endl;

    // Bad melon only when its score is strictly larger; ties go to good melon.
    const bool isBad = result[0] < result[1];
    const int class_y = isBad ? 0 : 1;
    std::cout << "属性为:(" << s_x1 << "," << s_x2 << "," << s_x3 << "," << s_x4 << ","
              << s_x5 << "," << s_x6 << "," << s_x7 << "," << s_x8 << ")所属的类是:" << class_y
              << (isBad ? "-----------坏瓜" : "-----------好瓜") << std::endl << std::endl;

    system("pause"); // keeps the console window open on Windows
    return 0;
}
“`