这一题把第 16 题中的 decision stump 拓展到多维:先对每一维分别求出 E-in 最小的假设,再选出 E-in 最小的那一维,并在测试数据上计算该维假设对应的 E-out。代码如下:
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;
#define DEMENSION 9 // number of input features per record

// Paths of the training and test data files opened in main().
// String literals are arrays of const char, so they must be bound to
// pointer-to-const; binding them to plain char* is ill-formed since C++11.
const char *file = "training.txt";
const char *file_test = "testing.txt";
// One labelled sample as read from a data file:
// DEMENSION feature values followed by an integer label.
struct record
{
    double input[DEMENSION]; // feature values, indexed by (dimension - 1)
    int output;              // label; compared against coef*sign(...) in calErr
};
// Projection of a record onto a single feature: that feature's value
// together with the record's label.
struct singleDemensionRecord
{
    double input; // value of the selected feature
    int output;   // label copied from the source record
};
// A one-dimensional decision stump: prediction = coef * sign(x - threshold).
struct Hypothesis
{
    int coef;         // direction multiplier; decisionStump assigns +1 or -1
    double threshold; // cut point subtracted from the feature value
};
// Sign of x: +1 for strictly positive values, -1 for everything else
// (negative values and zero both map to -1).
int sign(double x)
{
    return (x > 0) ? 1 : -1;
}
// Read whitespace-separated records from dataFile and append them to data.
// Each record is DEMENSION floating-point feature values followed by one
// integer label.
//
// Reading stops at the first record that cannot be extracted completely
// (end of file, truncated final row, or a malformed token); such a partial
// record is discarded. Testing the stream state after each extraction,
// rather than polling eof() before it, avoids appending a bogus record when
// the file ends with trailing whitespace and avoids looping forever on a
// token that cannot be parsed.
//
// The stream is closed before returning.
void getData(ifstream & dataFile, vector<record> &data)
{
    for (;;) {
        record curRecord;
        bool complete = true;
        for (int i = 0; i < DEMENSION && complete; ++i) {
            if (!(dataFile >> curRecord.input[i])) {
                complete = false;
            }
        }
        if (!complete || !(dataFile >> curRecord.output)) {
            break; // no further complete record available
        }
        data.push_back(curRecord);
    }
    dataFile.close();
}
//计算指定维度的样本错误率
double calErr(vector<singleDemensionRecord>& singleDemensionVec, vector<Hypothesis>& hypo, int demension)
{
int errCount = 0;
int length = singleDemensionVec.size();
for(int i=0;i<length;++i){
if(singleDemensionVec[i].output != hypo[demension-1].coef*sign(singleDemensionVec[i].input-hypo[demension-1].threshold)){
errCount++;
}
}
return double(errCount)/double(length);
}
//single demension record的比较函数
bool recCompare(singleDemensionRecord & a, singleDemensionRecord & b)
{
return a.input<b.input;
}
//将指定维度的数据提取出来并升序排列
void getInputByDemension(vector<record>& dataSet, vector<singleDemensionRecord>& singleDemensionVec, int demension)
{
int recordSize = dataSet.size();
singleDemensionRecord curRec;
for(int i=0;i<recordSize;++i){
curRec.input = dataSet[i].input[demension-1];
curRec.output = dataSet[i].output;
singleDemensionVec.push_back(curRec);
}
sort(singleDemensionVec.begin(),singleDemensionVec.end(),recCompare);
}
// Try every candidate threshold for dimension `demension` (1-based) and
// return the smallest in-sample error found.
//
// singleDemensionVec must already be sorted by input; the candidates are the
// midpoints between consecutive samples. The direction hypo[demension-1].coef
// is taken as set by the caller. hypo[demension-1].threshold is overwritten
// with each candidate in turn and is left at the last candidate tried.
//
// bestThres receives the first candidate that achieves the returned error.
// It is always assigned: when every candidate has error 1.0 the first
// candidate is reported, and when there are fewer than two samples (so no
// candidate exists) it falls back to the single sample's value, or 0.0 for an
// empty vector. In that last case the returned 1.0 is a placeholder rather
// than a measured error.
double getMinErrIn(vector<singleDemensionRecord> & singleDemensionVec, vector<Hypothesis>& hypo, int demension, double & bestThres)
{
    double minErrIn = 1.0;
    bool haveCandidate = false;
    const int recordSize = singleDemensionVec.size();
    for (int i = 0; i < recordSize - 1; ++i) {
        const double candidate = double(singleDemensionVec[i].input + singleDemensionVec[i+1].input) / 2.0;
        hypo[demension-1].threshold = candidate;
        const double curErrIn = calErr(singleDemensionVec, hypo, demension);
        // Accept the first candidate unconditionally so bestThres is never
        // left unassigned; afterwards only strict improvements replace it.
        if (!haveCandidate || curErrIn < minErrIn) {
            minErrIn = curErrIn;
            bestThres = candidate;
            haveCandidate = true;
        }
    }
    if (!haveCandidate) {
        bestThres = (recordSize == 1) ? singleDemensionVec[0].input : 0.0;
    }
    return minErrIn;
}
//Decision Stump 算法, 确定s和θ
void decisionStump(vector<record>& trainingSet, vector<record>& testSet, vector<Hypothesis>& hypo)
{
int recordSize = trainingSet.size();
int minErrInDem;
double minErrIn = 1.1;
for(int dem=0;dem<DEMENSION;++dem){
vector<singleDemensionRecord> singleDemensionVec;
double curMinErrIn;
double bestThresPositive;
double bestThresNegtive;
double minErrInPositive;
double minErrInNegtive;
getInputByDemension(trainingSet,singleDemensionVec,dem+1);
hypo[dem].coef = 1;
minErrInPositive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresPositive);
hypo[dem].coef = -1;
minErrInNegtive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresNegtive);
if(minErrInPositive<minErrInNegtive){
hypo[dem].coef = 1;
curMinErrIn = minErrInPositive;
hypo[dem].threshold = bestThresPositive;
}else{
hypo[dem].coef = -1;
curMinErrIn = minErrInNegtive;
hypo[dem].threshold = bestThresNegtive;
}
if(minErrIn>curMinErrIn){
minErrIn = curMinErrIn;
minErrInDem = dem+1;
}
}
cout<<"The demension with min error is : "<<minErrInDem<<endl;
cout<<"min E-in = "<<minErrIn<<endl;
vector<singleDemensionRecord> singleDemensionTestVec;
getInputByDemension(testSet,singleDemensionTestVec,minErrInDem);
cout<<"min E-out = "<<calErr(singleDemensionTestVec,hypo,minErrInDem)<<endl<<endl;
}
// Program entry point: load the training and test sets from the paths in
// `file` and `file_test`, then run the multi-dimensional decision stump.
// Returns 1 if either file cannot be opened, 0 otherwise.
// Standard C++ requires main to return int; `void main()` is a compiler
// extension that conforming compilers reject.
int main()
{
    // Seed kept from the original program; no rand() call is visible in this file.
    srand((unsigned)time(NULL));

    vector<record> trainingSet;            // training data
    vector<record> testSet;                // test data
    vector<Hypothesis> hypoVec(DEMENSION); // one hypothesis per dimension

    ifstream dataFile(file);
    ifstream testDataFile(file_test);
    if (!(dataFile.is_open() && testDataFile.is_open())) {
        cerr<<"ERROR ---> 文件打开失败"<<endl;
        return 1;
    }

    getData(dataFile, trainingSet);
    getData(testDataFile, testSet);
    decisionStump(trainingSet, testSet, hypoVec);
    return 0;
}