OpenCV的机器学习类定义在ml.hpp文件中,基础类是CvStatModel,其他各种分类器从这里继承而来。
今天研究CvNormalBayesClassifier分类器。
1.类定义
在ml.hpp中有以下类定义:
- class CV_EXPORTS_W CvNormalBayesClassifier : public CvStatModel // Normal Bayes classifier model, derived from CvStatModel
- {
- public:
- CV_WRAP CvNormalBayesClassifier(); // creates an empty, untrained model
- virtual ~CvNormalBayesClassifier();
- CvNormalBayesClassifier( const CvMat* trainData, const CvMat* responses,
- const CvMat* varIdx=0, const CvMat* sampleIdx=0 ); // constructs the model and trains it immediately on the given data
- virtual bool train( const CvMat* trainData, const CvMat* responses,
- const CvMat* varIdx = 0, const CvMat* sampleIdx=0, bool update=false ); // update=false: train from scratch; update=true: add the samples to the stored statistics
- virtual float predict( const CvMat* samples, CV_OUT CvMat* results=0 ) const; // returns the label of the first sample; per-sample labels go to results when it is given
- CV_WRAP virtual void clear();
- CV_WRAP CvNormalBayesClassifier( const cv::Mat& trainData, const cv::Mat& responses,
- const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat() ); // cv::Mat counterpart of the training constructor
- CV_WRAP virtual bool train( const cv::Mat& trainData, const cv::Mat& responses,
- const cv::Mat& varIdx = cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
- bool update=false ); // cv::Mat counterpart of train()
- CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0 ) const; // cv::Mat counterpart of predict()
- virtual void write( CvFileStorage* storage, const char* name ) const; // NOTE(review): presumably serializes the model to file storage - implementation not shown here
- virtual void read( CvFileStorage* storage, CvFileNode* node ); // NOTE(review): presumably restores the model from file storage - implementation not shown here
- protected:
- int var_count, var_all; // var_count: number of variables used by the model; var_all: number of columns predict() requires in its input samples
- CvMat* var_idx; // indices of the selected variables, or 0 when no subset is selected
- CvMat* cls_labels; // row of class labels; its column count is the number of classes
- CvMat** count; // count[cls]: per-variable number of training samples of class cls
- CvMat** sum; // sum[cls]: per-variable sum of the training values of class cls
- CvMat** productsum; // productsum[cls]: sums of pairwise products of the variables of class cls
- CvMat** avg; // avg[cls]: per-variable mean of class cls
- CvMat** inv_eigen_values; // inv_eigen_values[cls]: reciprocals of the singular values of the covariance matrix of class cls
- CvMat** cov_rotate_mats; // cov_rotate_mats[cls]: rotation matrix obtained from the SVD of the covariance matrix of class cls
- CvMat* c; // c[cls]: log of the covariance determinant of class cls, used as the additive term of the class score
- };
2.示例
此类的使用方法如下(示例代码引自他人,出处已无法查证,在此向原作者致歉):
- // Usage example for the OpenCV normal Bayes classifier API
- // Environment: Win7 + VS2005 + OpenCV 2.4.5
- #include "global_include.h"
- using namespace std;
- using namespace cv;
- // Training set: 10 samples with 12-dimensional feature vectors; the first column of each row is the class label of that sample
- double inputArr[10][13] =
- {
- 1,0.708333,1,1,-0.320755,-0.105023,-1,1,-0.419847,-1,-0.225806,0,1,
- -1,0.583333,-1,0.333333,-0.603774,1,-1,1,0.358779,-1,-0.483871,0,-1,
- 1,0.166667,1,-0.333333,-0.433962,-0.383562,-1,-1,0.0687023,-1,-0.903226,-1,-1,
- -1,0.458333,1,1,-0.358491,-0.374429,-1,-1,-0.480916,1,-0.935484,0,-0.333333,
- -1,0.875,-1,-0.333333,-0.509434,-0.347032,-1,1,-0.236641,1,-0.935484,-1,-0.333333,
- -1,0.5,1,1,-0.509434,-0.767123,-1,-1,0.0534351,-1,-0.870968,-1,-1,
- 1,0.125,1,0.333333,-0.320755,-0.406393,1,1,0.0839695,1,-0.806452,0,-0.333333,
- 1,0.25,1,1,-0.698113,-0.484018,-1,1,0.0839695,1,-0.612903,0,-0.333333,
- 1,0.291667,1,1,-0.132075,-0.237443,-1,1,0.51145,-1,-0.612903,0,0.333333,
- 1,0.416667,-1,1,0.0566038,0.283105,-1,1,0.267176,-1,0.290323,0,1
- };
- // Feature vector of a single test sample (12 values, no label column)
- double testArr[]=
- {
- 0.25,1,1,-0.226415,-0.506849,-1,-1,0.374046,-1,-0.83871,0,-1
- };
- // Demo entry point: trains a normal Bayes classifier on inputArr, saves it to
- // "normalBayes.txt", reloads it into a second classifier and prints the label
- // predicted for testArr.
- // Returns 0 on success and -1 when training fails.
- int _tmain(int argc, _TCHAR* argv[])
- {
-     const int kSampleCount = 10;   // rows of inputArr
-     const int kFeatureCount = 12;  // feature columns of inputArr (column 0 holds the label)
-     // Split the table into a feature matrix and a label column. Both matrices are
-     // CV_32FC1, so the double table values are narrowed to float explicitly.
-     Mat trainData(kSampleCount, kFeatureCount, CV_32FC1);
-     Mat trainResponse(kSampleCount, 1, CV_32FC1);
-     for (int i = 0; i < kSampleCount; i++)
-     {
-         trainResponse.at<float>(i, 0) = static_cast<float>(inputArr[i][0]);
-         for (int j = 0; j < kFeatureCount; j++)
-         {
-             trainData.at<float>(i, j) = static_cast<float>(inputArr[i][j + 1]);
-         }
-     }
-     // Train the classifier and persist it.
-     CvNormalBayesClassifier nbc;
-     if (!nbc.train(trainData, trainResponse))
-     {
-         cout<<"train error..."<<endl;
-         system("pause");
-         // Return rather than exit(): exit() would terminate without running the
-         // destructors of the matrices and the classifier declared above.
-         return -1;
-     }
-     cout<<"train over..."<<endl;
-     nbc.save("normalBayes.txt");
-     // Reload the model into a fresh object to demonstrate the save()/load() round trip.
-     CvNormalBayesClassifier testNbc;
-     testNbc.load("normalBayes.txt");
-     // predict() requires a CV_32FC1 matrix with one sample per row.
-     Mat testSample(1, kFeatureCount, CV_32FC1);
-     for (int i = 0; i < kFeatureCount; i++)
-     {
-         testSample.at<float>(0, i) = static_cast<float>(testArr[i]);
-     }
-     const float flag = testNbc.predict(testSample);
-     cout<<"flag = "<<flag<<endl;
-     system("pause");
-     return 0;
- }
3.步骤
两步走:
1.调用train函数训练分类器;
2.调用predict函数,判定测试样本的类别。
以上示例代码还演示了怎样使用save和load函数,使得训练好的分类器可以保存到文本文件中,并在需要时重新加载。
4.初始化
接下来,看CvNormalBayesClassifier类的无参数初始化:
- // Builds an empty, untrained classifier: the variable counts are zero and every
- // model pointer is null.
- CvNormalBayesClassifier::CvNormalBayesClassifier()
-     : var_count(0), var_all(0),
-       var_idx(0), cls_labels(0),
-       count(0), sum(0), productsum(0), avg(0),
-       inv_eigen_values(0), cov_rotate_mats(0),
-       c(0)
- {
-     // default_model_name is not declared in this class (presumably inherited from
-     // CvStatModel), so it cannot appear in the initializer list above.
-     default_model_name = "my_nb";
- }
- // Builds the classifier and trains it right away on the supplied data.
- // The members are first put into the same empty state as the default
- // constructor, then train() is called with update left at its default.
- CvNormalBayesClassifier::CvNormalBayesClassifier(
-     const CvMat* _train_data, const CvMat* _responses,
-     const CvMat* _var_idx, const CvMat* _sample_idx )
-     : var_count(0), var_all(0),
-       var_idx(0), cls_labels(0),
-       count(0), sum(0), productsum(0), avg(0),
-       inv_eigen_values(0), cov_rotate_mats(0),
-       c(0)
- {
-     // default_model_name is not declared in this class (presumably inherited from
-     // CvStatModel), so it is assigned in the body.
-     default_model_name = "my_nb";
-     train( _train_data, _responses, _var_idx, _sample_idx );
- }
另外,以Mat参数形式的对应函数版本,功能是一致的,只不过为了体现2.0以后版本的C++特性罢了。如下:
- CV_WRAP CvNormalBayesClassifier( const cv::Mat& trainData, const cv::Mat& responses,
- const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat() ); // cv::Mat counterpart of the CvMat* training constructor; empty varIdx/sampleIdx are the defaults
- CV_WRAP virtual bool train( const cv::Mat& trainData, const cv::Mat& responses,
- const cv::Mat& varIdx = cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
- bool update=false ); // cv::Mat counterpart of the CvMat* train()
- CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0 ) const; // cv::Mat counterpart of the CvMat* predict()
5.训练
下面开始分析train函数,分析CvMat格式参数的train函数,即:
- bool train( const CvMat* trainData, const CvMat* responses,const CvMat* varIdx = 0, const CvMat* sampleIdx=0, bool update=false ); // update=false: clear the model and train from scratch; update=true: add the new samples to the stored statistics
在进入该函数之前,还要先回头看看CvNormalBayesClassifier类有哪些数据成员:
- protected:
- int var_count, var_all; // var_count: number of variables used by the model (size of every per-class vector); var_all: number of columns predict() requires in its input samples
- CvMat* var_idx; // indices of the selected variable subset; 0 when no subset is selected, in which case variable j is read from column j
- CvMat* cls_labels; // row of integer class labels (not a count); its column count is the number of classes
- CvMat** count; // count[0...(nclasses-1)], each a 1 x var_count CV_32SC1 matrix: per-variable number of training samples of that class
- CvMat** sum; // sum[0...(nclasses-1)], each a 1 x var_count CV_64FC1 matrix: per-variable sum of the training values of that class
- CvMat** productsum; // productsum[0...(nclasses-1)], each a var_count x var_count CV_64FC1 matrix: sums of the pairwise products of the variables of that class
- CvMat** avg; // avg[0...(nclasses-1)], each a 1 x var_count CV_64FC1 matrix: per-variable mean of that class
- CvMat** inv_eigen_values;// inv_eigen_values[0...(nclasses-1)], each a 1 x var_count CV_64FC1 matrix: reciprocals of the singular values of the class covariance matrix (values are clamped to at least FLT_EPSILON before inversion)
- CvMat** cov_rotate_mats; // cov_rotate_mats[0...(nclasses-1)], each a var_count x var_count CV_64FC1 matrix: rotation matrix produced by the SVD of the class covariance matrix
- CvMat* c; // 1 x nclasses CV_64FC1 matrix: log of the covariance determinant of each class (-700 when the determinant is not positive)
这些数据成员,怎样使用呢?在train函数中见分晓:
- // Trains the normal Bayes model, or adds new samples to an already trained one.
- //
- // _train_data  - training samples, one sample per row (CV_ROW_SAMPLE layout).
- // _responses   - class label of every sample, treated as categorical.
- // _var_idx     - optional subset of variables to use (0 = all variables).
- // _sample_idx  - optional subset of samples to use (0 = all samples).
- // update       - false: discard the current model and train from scratch;
- //                true: accumulate the new samples into the stored statistics. The
- //                new data must have the same variable layout and exactly the same
- //                set of class labels as the original training data.
- //
- // Returns true on success. When control reaches the cleanup section without
- // success (assuming the error handler returns instead of throwing), the model is
- // cleared and false is returned.
- bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _responses,
-                                      const CvMat* _var_idx, const CvMat* _sample_idx, bool update )
- {
-     // Lower bound applied to the singular values of each covariance matrix so
-     // that they can be inverted safely.
-     const float min_variation = FLT_EPSILON;
-     bool result = false;
-     CvMat* responses = 0;
-     const float** train_data = 0;
-     CvMat* __cls_labels = 0;
-     CvMat* __var_idx = 0;
-     CvMat* cov = 0;
-     CV_FUNCNAME( "CvNormalBayesClassifier::train" );
-     __BEGIN__;
-     int cls, nsamples = 0, _var_count = 0, _var_all = 0, nclasses = 0;
-     int s, c1, c2;
-     const int* responses_data;
-     // 1. Normalize the input: per-sample row pointers, class index of every
-     //    sample, the list of class labels and the selected variable indices.
-     CV_CALL( cvPrepareTrainData( 0,
-         _train_data, CV_ROW_SAMPLE, _responses, CV_VAR_CATEGORICAL,
-         _var_idx, _sample_idx, false, &train_data,
-         &nsamples, &_var_count, &_var_all, &responses,
-         &__cls_labels, &__var_idx ));
-     if( !update ) // initial training: rebuild the model from scratch
-     {
-         const size_t mat_size = sizeof(CvMat*);
-         size_t data_size;
-         clear();
-         // The model takes ownership of the prepared label and index matrices.
-         var_idx = __var_idx;
-         cls_labels = __cls_labels;
-         __var_idx = __cls_labels = 0;
-         var_count = _var_count;
-         var_all = _var_all;
-         nclasses = cls_labels->cols;
-         // One allocation holds all six per-class pointer tables back to back.
-         data_size = nclasses*6*mat_size;
-         CV_CALL( count = (CvMat**)cvAlloc( data_size ));
-         memset( count, 0, data_size );          // count[cls]: per-variable sample counts
-         sum = count + nclasses;                 // sum[cls]: per-variable sums
-         productsum = sum + nclasses;            // productsum[cls]: sums of pairwise products x_i*x_j
-         avg = productsum + nclasses;            // avg[cls]: per-variable means
-         inv_eigen_values= avg + nclasses;       // inv_eigen_values[cls]: inverted singular values of the covariance
-         cov_rotate_mats = inv_eigen_values + nclasses; // cov_rotate_mats[cls]: rotation matrix from the SVD
-         CV_CALL( c = cvCreateMat( 1, nclasses, CV_64FC1 ));
-         for( cls = 0; cls < nclasses; cls++ ) // allocate and zero the statistics of every class
-         {
-             CV_CALL(count[cls] = cvCreateMat( 1, var_count, CV_32SC1 ));
-             CV_CALL(sum[cls] = cvCreateMat( 1, var_count, CV_64FC1 ));
-             CV_CALL(productsum[cls] = cvCreateMat( var_count, var_count, CV_64FC1 ));
-             CV_CALL(avg[cls] = cvCreateMat( 1, var_count, CV_64FC1 ));
-             CV_CALL(inv_eigen_values[cls] = cvCreateMat( 1, var_count, CV_64FC1 ));
-             CV_CALL(cov_rotate_mats[cls] = cvCreateMat( var_count, var_count, CV_64FC1 ));
-             CV_CALL(cvZero( count[cls] ));
-             CV_CALL(cvZero( sum[cls] ));
-             CV_CALL(cvZero( productsum[cls] ));
-             CV_CALL(cvZero( avg[cls] ));
-             CV_CALL(cvZero( inv_eigen_values[cls] ));
-             CV_CALL(cvZero( cov_rotate_mats[cls] ));
-         }
-     }
-     else // incremental update of an existing model
-     {
-         // check that the new training data has the same dimensionality etc.
-         if( _var_count != var_count || _var_all != var_all || !((!_var_idx && !var_idx) ||
-             (_var_idx && var_idx && cvNorm(_var_idx,var_idx,CV_C) < DBL_EPSILON)) )
-             CV_ERROR( CV_StsBadArg,
-             "The new training data is inconsistent with the original training data" );
-         if( cls_labels->cols != __cls_labels->cols ||
-             cvNorm(cls_labels, __cls_labels, CV_C) > DBL_EPSILON )
-             CV_ERROR( CV_StsNotImplemented,
-             "In the current implementation the new training data must have absolutely "
-             "the same set of class labels as used in the original training data" );
-         nclasses = cls_labels->cols;
-     }
-     responses_data = responses->data.i;
-     CV_CALL( cov = cvCreateMat( _var_count, _var_count, CV_64FC1 ));
-     // 2. Accumulate the per-class statistics of the training data.
-     // process train data (count, sum , productsum)
-     for( s = 0; s < nsamples; s++ )
-     {
-         cls = responses_data[s]; // class index of sample s
-         int* count_data = count[cls]->data.i;
-         double* sum_data = sum[cls]->data.db;
-         double* prod_data = productsum[cls]->data.db;
-         const float* train_vec = train_data[s];
-         for( c1 = 0; c1 < _var_count; c1++, prod_data += _var_count )
-         {
-             double val1 = train_vec[c1];
-             sum_data[c1] += val1;
-             count_data[c1]++;
-             // Only the upper triangle is accumulated; it is mirrored later.
-             for( c2 = c1; c2 < _var_count; c2++ )
-                 prod_data[c2] += train_vec[c2]*val1;
-         }
-     }
-     // 3. Derive the mean, the covariance matrix and the c term of every class.
-     // calculate avg, covariance matrix, c
-     for( cls = 0; cls < nclasses; cls++ )
-     {
-         double det = 1;
-         int i, j;
-         CvMat* w = inv_eigen_values[cls];
-         int* count_data = count[cls]->data.i;
-         double* avg_data = avg[cls]->data.db;
-         double* sum1 = sum[cls]->data.db;
-         // Mirror the accumulated upper triangle into the lower one.
-         cvCompleteSymm( productsum[cls], 0 );
-         for( j = 0; j < _var_count; j++ ) // per-variable mean; 0 when the class has no samples
-         {
-             int n = count_data[j];
-             avg_data[j] = n ? sum1[j] / n : 0.;
-         }
-         count_data = count[cls]->data.i;
-         avg_data = avg[cls]->data.db;
-         sum1 = sum[cls]->data.db;
-         // Fill the lower triangle of the _var_count x _var_count covariance matrix;
-         // the matrix is symmetric, so the upper triangle is mirrored afterwards.
-         for( i = 0; i < _var_count; i++ )
-         {
-             double* avg2_data = avg[cls]->data.db;
-             double* sum2 = sum[cls]->data.db;
-             double* prod_data = productsum[cls]->data.db + i*_var_count;
-             double* cov_data = cov->data.db + i*_var_count;
-             double s1val = sum1[i];
-             double avg1 = avg_data[i];
-             int _count = count_data[i];
-             for( j = 0; j <= i; j++ )
-             {
-                 double avg2 = avg2_data[j];
-                 double cov_val = prod_data[j] - avg1 * sum2[j] - avg2 * s1val + avg1 * avg2 * _count;
-                 // Divide by (n - 1) only when more than one sample is available.
-                 cov_val = (_count > 1) ? cov_val / (_count - 1) : cov_val;
-                 cov_data[j] = cov_val;
-             }
-         }
-         CV_CALL( cvCompleteSymm( cov, 1 ));
-         CV_CALL( cvSVD( cov, w, cov_rotate_mats[cls], 0, CV_SVD_U_T ));
-         // Clamp the singular values from below so the inversion is well defined.
-         CV_CALL( cvMaxS( w, min_variation, w ));
-         for( j = 0; j < _var_count; j++ )
-             det *= w->data.db[j];
-         // w <- 1 / w: predict() needs the inverted values.
-         CV_CALL( cvDiv( NULL, w, w ));
-         c->data.db[cls] = det > 0 ? log(det) : -700;
-     }
-     result = true;
-     __END__;
-     if( !result || cvGetErrStatus() < 0 )
-         clear();
-     cvReleaseMat( &cov );
-     // The response matrix returned by cvPrepareTrainData is owned by this function;
-     // release it here so that it is freed on the success and the error paths alike.
-     cvReleaseMat( &responses );
-     cvReleaseMat( &__cls_labels );
-     cvReleaseMat( &__var_idx );
-     cvFree( &train_data );
-     return result;
- }
6.预测
下面看用于预测的predict函数的实现代码:
- // Classifies every row of `samples`.
- //
- // samples - CV_32FC1 matrix with var_all columns, one sample per row.
- // results - optional output vector (CV_32FC1 or CV_32SC1) with one element per
- //           sample; mandatory when more than one sample is passed.
- //
- // Returns the label predicted for the first sample (row 0).
- float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) const
- {
-     // Input check: a valid 32-bit float matrix with the expected column count.
-     const bool samples_valid = CV_IS_MAT(samples) &&
-                                CV_MAT_TYPE(samples->type) == CV_32FC1 &&
-                                samples->cols == var_all;
-     if( !samples_valid )
-         CV_Error( CV_StsBadArg,
-         "The input samples must be 32f matrix with the number of columns = var_all" );
-     // The return value can carry one label only, so several samples need `results`.
-     const bool many_samples = samples->rows > 1;
-     if( many_samples && !results )
-         CV_Error( CV_StsNullPtr,
-         "When the number of input samples is >1, the output vector of results must be passed" );
-     if( results )
-     {
-         // Output check: an int/float row or column vector sized to the sample count.
-         bool results_valid = CV_IS_MAT(results);
-         if( results_valid )
-         {
-             const int out_type = CV_MAT_TYPE(results->type);
-             const bool type_ok = out_type == CV_32FC1 || out_type == CV_32SC1;
-             const bool is_vector = results->cols == 1 || results->rows == 1;
-             const int out_len = results->cols + results->rows - 1;
-             results_valid = type_ok && is_vector && out_len == samples->rows;
-         }
-         if( !results_valid )
-             CV_Error( CV_StsBadArg, "The output array must be integer or floating-point vector "
-             "with the number of elements = number of rows in the input matrix" );
-     }
-     // Null when the model uses every variable in its natural order.
-     const int* selected_vars = 0;
-     if( var_idx )
-         selected_vars = var_idx->data.i;
-     // The functor writes the label of sample 0 through the pointer it is given.
-     float first_label = 0;
-     predict_body body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
-                       selected_vars, cls_labels, results, &first_label, var_count);
-     cv::parallel_for(cv::BlockedRange(0, samples->rows), body);
-     return first_label;
- }
- cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
- vidx, cls_labels, results, &value, var_count));
- //predict函数调用predict_body结构体的()符号重载函数,实现基于贝叶斯的分类
- // Functor passed to cv::parallel_for by CvNormalBayesClassifier::predict().
- // Each invocation classifies the sample rows in [range.begin(), range.end()):
- // for every class it evaluates
- //     score = c[cls] + sum_j (rotated_diff[j]^2 * inv_eigen_values[cls][j])
- // and picks the class with the smallest score.
- struct predict_body
- {
-     predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
-                  const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
-                  CvMat* _results, float* _value, int _var_count1)
-         : c(_c),
-           cov_rotate_mats(_cov_rotate_mats),
-           inv_eigen_values(_inv_eigen_values),
-           avg(_avg),
-           samples(_samples),
-           vidx(_vidx),
-           cls_labels(_cls_labels),
-           results(_results),
-           value(_value),
-           var_count1(_var_count1)
-     {
-     }
-     CvMat* c;                  // per-class additive term of the score
-     CvMat** cov_rotate_mats;   // per-class rotation matrix applied to (mean - sample)
-     CvMat** inv_eigen_values;  // per-class weights of the squared rotated differences
-     CvMat** avg;               // per-class mean vector
-     const CvMat* samples;      // input samples, one per row
-     const int* vidx;           // column index of each model variable, or 0 for the identity mapping
-     CvMat* cls_labels;         // label of each class index
-     CvMat* results;            // optional per-sample output vector (may be 0)
-     float* value;              // receives the label of sample 0
-     int var_count1;            // number of model variables
-     void operator()( const cv::BlockedRange& range ) const
-     {
-         const int nclasses = cls_labels->cols;
-         const int _var_count = avg[0]->cols;
-         int rtype = 0, rstep = 0;
-         if (results)
-         {
-             rtype = CV_MAT_TYPE(results->type);
-             // Element stride between consecutive outputs: 1 for a continuous
-             // vector, otherwise the row step expressed in elements.
-             rstep = CV_IS_MAT_CONT(results->type) ? 1 : results->step/CV_ELEM_SIZE(rtype);
-         }
-         // allocate memory and initializing headers for calculating
-         cv::AutoBuffer<double> buffer(nclasses + var_count1);
-         CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
-         for(int k = range.begin(); k < range.end(); k += 1 ) // for every input sample
-         {
-             // The sample row does not depend on the class, so fetch it once.
-             const float* x = (const float*)(samples->data.ptr + samples->step*k);
-             // The search state is reset for every sample. Starting from class 0
-             // and DBL_MAX guarantees a valid class index even when every score is
-             // huge or NaN; previously the index could stay at -1 (out-of-bounds
-             // label read) or keep the class chosen for the previous sample.
-             int cls = 0;
-             double opt = DBL_MAX;
-             for(int i = 0; i < nclasses; i++ ) // score every class
-             {
-                 double cur = c->data.db[i];
-                 CvMat* u = cov_rotate_mats[i];
-                 CvMat* w = inv_eigen_values[i];
-                 const double* avg_data = avg[i]->data.db;
-                 // cov = u w u' --> cov^(-1) = u w^(-1) u'
-                 for(int j = 0; j < _var_count; j++ ) // offset of the sample from the class mean
-                     diff.data.db[j] = avg_data[j] - x[vidx ? vidx[j] : j];
-                 // Rotate the offset into the basis stored for this class.
-                 cvGEMM( &diff, u, 1, 0, 0, &diff, CV_GEMM_B_T );
-                 for(int j = 0; j < _var_count; j++ ) // weighted sum of squared rotated offsets
-                 {
-                     double d = diff.data.db[j];
-                     cur += d*d*w->data.db[j];
-                 }
-                 if( cur < opt ) // smaller score = more probable class
-                 {
-                     cls = i;
-                     opt = cur;
-                 }
-                 // probability = exp( -0.5 * cur )
-             }
-             const int ival = cls_labels->data.i[cls];
-             if( results )
-             {
-                 if( rtype == CV_32SC1 )
-                     results->data.i[k*rstep] = ival;
-                 else
-                     results->data.fl[k*rstep] = (float)ival;
-             }
-             // Only the label of the very first sample is reported via `value`.
-             if( k == 0 )
-                 *value = (float)ival;
-         }
-     }
- };