8.2 - 《机器学习基石》Home Work 2 Q.19-20

最新推荐文章于 2024-05-29 14:23:25 发布

weixin_34409703

最新推荐文章于 2024-05-29 14:23:25 发布

阅读量275

点赞数

文章标签：人工智能 python

原文链接：https://my.oschina.net/findbill/blog/220074

版权

2019独角兽企业重金招聘Python工程师标准>>>

这一题把第16题中的 decision stump 拓展到多维：对每一维分别找出使 E-in 最小的 (s, θ)，再选出 E-in 最小的那一维，并用该维的假设在测试数据上计算 E-out：

#include <fstream>
#include <iostream>
#include <ctime>
#include <cmath>
#include <vector>
#include <algorithm>

using namespace std;

#define DEMENSION 9			// number of input features (dimensions) per sample

// Paths of the training and test data files.
// Declared as pointers to const: binding a string literal to a plain
// `char *` is ill-formed since C++11.
const char *file = "training.txt";
const char *file_test = "testing.txt";

struct record {
	double input[DEMENSION];	
	int output;						
};

// A sample reduced to a single feature: the value of that one
// dimension together with the sample's label.
struct singleDemensionRecord
{
	double input;	// value of the selected dimension
	int    output;	// label of the sample
};

// Decision-stump hypothesis for one dimension; calErr evaluates it as
// coef * sign(x - threshold).
struct Hypothesis
{
	int    coef;		// multiplier applied to the sign (set to +1 or -1 by decisionStump)
	double threshold;	// cut point subtracted from the feature value
};

// Sign of x as +1 or -1. Only strictly positive values yield +1;
// zero (and anything else that is not greater than zero) yields -1.
int sign(double x)
{
	return (x>0) ? 1 : -1;
}

//从文件读取数据
void getData(ifstream & dataFile, vector<record> &data)
{
	while(!dataFile.eof()){
		record curRecord;   
		for(int i=0;i<DEMENSION;++i){ dataFile>>curRecord.input[i]; }
		dataFile>>curRecord.output;
		data.push_back(curRecord);
	}
	dataFile.close();	
}

//计算指定维度的样本错误率
double calErr(vector<singleDemensionRecord>& singleDemensionVec, vector<Hypothesis>& hypo, int demension)
{
	int errCount = 0;
	int length = singleDemensionVec.size();

	for(int i=0;i<length;++i){
		if(singleDemensionVec[i].output != hypo[demension-1].coef*sign(singleDemensionVec[i].input-hypo[demension-1].threshold)){
			errCount++;
		}	
	}

	return double(errCount)/double(length);
}

//single demension record的比较函数
bool recCompare(singleDemensionRecord & a, singleDemensionRecord & b)
{
	return a.input<b.input;	
}

//将指定维度的数据提取出来并升序排列
void getInputByDemension(vector<record>& dataSet, vector<singleDemensionRecord>& singleDemensionVec, int demension)
{
	int recordSize = dataSet.size(); 
	singleDemensionRecord curRec;

	for(int i=0;i<recordSize;++i){
		curRec.input = dataSet[i].input[demension-1];	
		curRec.output = dataSet[i].output; 
		singleDemensionVec.push_back(curRec);
	}

	sort(singleDemensionVec.begin(),singleDemensionVec.end(),recCompare);
}

//遍历所有θ，找到最小的E-in并返回
double getMinErrIn(vector<singleDemensionRecord> & singleDemensionVec, vector<Hypothesis>& hypo, int demension, double & bestThres)
{
	double minErrIn = 1.0;
	double curErrIn;
	int recordSize = singleDemensionVec.size();

	for(int i=0;i<recordSize-1;++i){
		hypo[demension-1].threshold = double(singleDemensionVec[i].input+singleDemensionVec[i+1].input)/2.0;
		curErrIn = calErr(singleDemensionVec,hypo,demension);
		if(curErrIn<minErrIn){
			minErrIn = curErrIn;
			bestThres = hypo[demension-1].threshold;
		}
	}

	return minErrIn;
}

//Decision Stump 算法, 确定s和θ
void decisionStump(vector<record>& trainingSet, vector<record>& testSet, vector<Hypothesis>& hypo)
{
	int recordSize = trainingSet.size();
	int minErrInDem;
	double minErrIn = 1.1;	

	for(int dem=0;dem<DEMENSION;++dem){

		vector<singleDemensionRecord> singleDemensionVec; 
		double curMinErrIn;
		double bestThresPositive;
		double bestThresNegtive;
		double minErrInPositive;
		double minErrInNegtive;

		getInputByDemension(trainingSet,singleDemensionVec,dem+1);

		hypo[dem].coef = 1;
		minErrInPositive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresPositive);
	
		hypo[dem].coef = -1;
		minErrInNegtive = getMinErrIn(singleDemensionVec,hypo,dem+1,bestThresNegtive);

		if(minErrInPositive<minErrInNegtive){
			hypo[dem].coef = 1;	
			curMinErrIn = minErrInPositive; 
			hypo[dem].threshold = bestThresPositive;
		}else{
			hypo[dem].coef = -1;	
			curMinErrIn = minErrInNegtive;	
			hypo[dem].threshold = bestThresNegtive;
		}

		if(minErrIn>curMinErrIn){
			minErrIn = curMinErrIn;
			minErrInDem = dem+1;
		}
	}

	cout<<"The demension with min error is : "<<minErrInDem<<endl;
	cout<<"min E-in = "<<minErrIn<<endl;
	vector<singleDemensionRecord> singleDemensionTestVec;
	getInputByDemension(testSet,singleDemensionTestVec,minErrInDem);
	cout<<"min E-out = "<<calErr(singleDemensionTestVec,hypo,minErrInDem)<<endl<<endl;
}


void main()
{
	srand((unsigned)time(NULL)); 

	vector<record> trainingSet;						//训练数据
	vector<record> testSet;							//测试数据
	vector<Hypothesis> hypoVec(DEMENSION);			//每个维度一个hypothesis

	ifstream dataFile(file);
	ifstream testDataFile(file_test);	

	if( dataFile.is_open() && testDataFile.is_open() ){
		getData(dataFile,trainingSet);	
		getData(testDataFile,testSet);	
	}else{
		cerr<<"ERROR ---> 文件打开失败"<<endl;
		exit(1);
	}

	decisionStump(trainingSet,testSet,hypoVec);
}