ID3-CSDN博客

本文链接：https://blog.csdn.net/DouMiaoO_Oo/article/details/49916363

写了个不怎么样的ID3，先放在这，有空回来修改（11.19）：

（这个版本的data都是string类型的，没有做str2int的操作，感觉不太专业..）

#include <iostream>
#include <cmath>
#include <vector>
#include <set>
#include <map>
#include <string>
#include <sstream>
#include <algorithm>
#include <queue>
using namespace std;

#pragma region Data
// Short aliases used throughout this version, where every table cell is kept as text.
typedef string String;
typedef String Label;        // name stored in a tree node / returned by majorityCnt
typedef vector<String> VS;   // one table row, or one column extracted by DataSet::getCol
#pragma endregion Data
// Row-major table of strings. `row` and `col` are public counters kept next to
// the storage; other code in this file reads and assigns them directly.
class DataSet {
	public:
	// Starts out with zero rows and zero columns.
	DataSet() : row(0), col(0) {}

	// Records the requested dimensions and appends `row` empty rows; the rows
	// themselves are not widened to `col` entries here.
	DataSet(int row, int col) : row(row), col(col) {
		for (int pending = row; pending > 0; --pending) {
			dataSet.push_back(VS());
		}
	}

	// Bumps the row counter and appends one row.
	void push_back(VS vs) {
		++row;
		dataSet.push_back(vs);
	}

	// Collects entry `id` of each of the first `row` rows, top to bottom.
	VS getCol(int id) {
		VS column;
		int r = 0;
		while (r < row) {
			column.push_back(dataSet[r][id]);
			++r;
		}
		return column;
	}

	int row;
	int col;
	vector<VS> dataSet;
};
class Node{
	public: 
	Node(const Label& label, String value){
		this->featureLabel = label;
		this->value = value;
	}
	void push_back(const Node& node) {
		child.push_back(node);
	}
	typename vector<Node> :: iterator begin(){
		return child.begin();
	}
	typename vector<Node> :: iterator end(){
		return child.end();
	}
	bool isLeafNode()const {
			return child.empty();
	}
	void print(){
		queue<Node> q;
		q.push(*this);
		while(!q.empty()){
			Node tmpNode = q.front();//
			q.pop();
			if(tmpNode.isLeafNode())
				continue;
			cout << tmpNode.value << ":\"" << tmpNode.featureLabel <<"\""; 
			for ( vector<Node> :: iterator it = tmpNode.begin(); it != tmpNode.end(); it++) {
				cout << " -> " << (*it).value << ":\""<< (*it).featureLabel << "\"";
				q.push(*it);
			} cout << endl;
		}
	}
	private:
		Label featureLabel;
		String value;
		vector<Node> child;
		//class path{	};
};
// Thin wrapper around std::set that exposes only construction from a vector,
// lookup, insertion, iteration and the element count.
template <class T>
class Set{
	public:
		// Empty set.
		Set(){}

		// Takes every element of `v`; duplicates collapse into one entry.
		Set(vector<T>& v) : s(v.begin(), v.end()) {}

		// Iterator to `item`, or end() when it is absent.
		typename set<T> :: iterator find(const T& item) {
			typename set<T> :: iterator pos = s.find(item);
			return pos;
		}

		void insert(const T& item) {
			s.insert(item);
		}

		// Iteration is in the ordering of the underlying std::set.
		typename set<T> :: iterator begin(){
			return s.begin();
		}
		typename set<T> :: iterator end(){
			return s.end();
		}

		// Number of distinct elements, narrowed to int.
		int size() const{
			return static_cast<int>(s.size());
		}

	private:
		set<T> s;
};
// Forward declarations; the definitions appear after main() further down.
// Every function takes its arguments by value, so each call works on copies.
template <typename T> 
String ConvertToString(T);
double log2(double);
void printDataSet (DataSet);
double calcShannonEnt(DataSet);
DataSet splitDataSet(DataSet dataSet, int axis, String value);
int chooseBestFeatureToSplit(DataSet d);
// Returns the most frequent label in `classList`. Counts are compared while
// scanning, so on a tie the label that reached the top count first is kept.
// An empty list yields an empty Label.
Label majorityCnt(VS classList) {
	map<String, int> tally;
	Label winner;
	int best = 0;
	for (VS::size_type i = 0; i < classList.size(); ++i) {
		const String& current = classList[i];
		int& seen = tally[current];   // created as 0 on first sight
		++seen;
		if (seen > best) {
			best = seen;
			winner = current;
		}
	}
	return winner;
}

// Recursively builds the decision tree for `dataSet`, whose last column holds
// the class to predict.
//   dataSet - rows still under consideration (features + class column).
//   labels  - feature names, index-aligned with the feature columns of dataSet.
//   cvalue  - text stored as the new node's value (the branch value the caller
//             followed to reach this subset).
// Returns a leaf naming the class when the subset is pure, when no feature
// columns remain, or when no feature can be chosen; otherwise an inner node
// naming the chosen feature with one child per distinct value of that feature.
Node createTree(DataSet dataSet, VS labels, String cvalue){
	VS classList = dataSet.getCol(dataSet.col-1);
	Set<String> uniqueVals(classList);
	if (uniqueVals.size() == 1) {   // every remaining row has the same class
		return Node(classList[0], cvalue);
	}
	if (dataSet.col == 1) {         // only the class column is left
		return Node(majorityCnt(classList), cvalue);
	}
	int bestFeat = chooseBestFeatureToSplit(dataSet);
	// chooseBestFeatureToSplit reports -1 when no split improves on the current
	// entropy (e.g. identical feature rows with different classes). Indexing
	// `labels` with that would be out of bounds, so settle for the majority class.
	if (bestFeat < 0 || bestFeat >= static_cast<int>(labels.size())) {
		return Node(majorityCnt(classList), cvalue);
	}
	Label bestFeatLabel = labels[bestFeat];
	Node myTree(bestFeatLabel, cvalue);  // the feature represented by this node is now fixed
	// The chosen column is removed from each subset below, so drop its name too
	// to keep names and columns aligned.
	labels.erase(labels.begin()+bestFeat);
	VS featValues = dataSet.getCol(bestFeat);
	uniqueVals = Set<String>(featValues);
	for( set<String> :: iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
		// one branch per distinct value of the chosen feature
		myTree.push_back( createTree( splitDataSet(dataSet, bestFeat, *value),
									  labels, *value) );
	}
	return myTree;
}
// NOTE(review): gRow and gCol are not referenced by any other code shown in this listing.
const static int gRow = 39644, gCol = 116;
VS labels;        // column names; filled by init()
DataSet dataSet;  // the training table; filled by init()
void init();      // defined after main()
// Writes every element of `v` to cout, each followed by a single space, and
// finishes the line with endl.
template <typename T>
void MyPrint(vector<T>& v) {
	typename vector<T> :: size_type pos = 0;
	while (pos != v.size()) {
		cout << v[pos] << ' ';
		++pos;
	}
	cout << endl;
}

int main(){
	init();
	cout << calcShannonEnt(dataSet) << endl; // test1
	printDataSet(splitDataSet(dataSet, 0, "1")); cout<< endl;	
	printDataSet(splitDataSet(dataSet, 0, "0"));//test2
	cout << chooseBestFeatureToSplit(dataSet) << endl;// test3

	cout << "\ntest4:\n";	
	createTree(dataSet, labels, "Root Label").print();  // test4
	return 0;
}
// Fills the global `dataSet` with the 5x3 sample table (two feature columns
// plus a yes/no class column) and the global `labels` with the column names.
void init() {
	const int kRows = 5;
	const int kCols = 3;
	// The previous version tried to build a String[5][3] from five String[3]
	// arrays, which does not compile (an array name cannot initialise a
	// String). A literal table expresses the same data directly.
	const char* const samples[kRows][kCols] = {
		{"1", "1", "yes"},
		{"1", "1", "yes"},
		{"1", "0", "no"},
		{"0", "1", "no"},
		{"0", "1", "no"}
	};

	dataSet.row = kRows;
	dataSet.col = kCols;
	for (int i = 0; i < kRows; i++) {
		// Rows go straight into the storage because `row` was set above;
		// DataSet::push_back would count them a second time.
		VS vs(samples[i], samples[i] + kCols);
		dataSet.dataSet.push_back(vs);
	}

	labels.push_back("No surfacing");
	labels.push_back("Flippers");
	labels.push_back("Fish");
}
// Prints the first `row` rows of the table to cout; each of the first `col`
// cells is followed by a space, and every row ends with endl.
void printDataSet (DataSet dataSet){
	int r = 0;
	while (r < dataSet.row) {
		int c = 0;
		while (c < dataSet.col) {
			cout << dataSet.dataSet[r][c] << ' ';
			++c;
		}
		cout << endl;
		++r;
	}
}
// Base-2 logarithm, computed as a ratio of natural logarithms.
double log2(double x){
	const double lnOfX = log(x);
	const double lnOfTwo = log(2);
	return lnOfX / lnOfTwo;
}
// Formats `value` with its stream insertion operator and returns the text.
template <typename T>
String ConvertToString(T value) {
  stringstream buffer;
  buffer << value;
  const String text = buffer.str();
  return text;
}
// Shannon entropy, in bits, of the class column (the last column) over the
// first `row` rows of `dataSet`. A table with no rows yields 0.0.
double calcShannonEnt(DataSet dataSet){
	const int total = dataSet.row;
	const int classCol = dataSet.col-1;

	// How often each class value occurs.
	map<String, int> frequency;
	for ( int r = 0; r < total; ++r) {
		++frequency[dataSet.dataSet[r][classCol]];
	}

	double entropy = 0.0;
	map<String, int> :: iterator entry = frequency.begin();
	while (entry != frequency.end()) {
		const double p = entry->second / (double)total;
		entropy -= p * log2(p);
		++entry;
	}
	return entropy;
}
// Returns the rows of `dataSet` whose column `axis` equals `value`, with that
// column removed from each returned row.
//   axis  - index of the column to test and drop.
//   value - cell content a row must have in that column to be kept.
// The result's `row` counts the kept rows; its `col` is dataSet.col - 1 when at
// least one row was kept and stays 0 otherwise.
DataSet splitDataSet(DataSet dataSet, int axis, String value){
	// A plain local replaces the former `new DataSet`, which was copied out on
	// return and never deleted, leaking one table per call.
	DataSet retDataSet;
	const int retCol = dataSet.col-1;
	for (int i = 0; i < dataSet.row; i++){
		const VS& source = dataSet.dataSet[i];
		if (source[axis] != value) {
			continue;
		}
		// Reduced row: everything before `axis`, then everything after it up
		// to the table's column count.
		VS reduced(source.begin(), source.begin() + axis);
		reduced.insert(reduced.end(), source.begin() + axis + 1, source.begin() + dataSet.col);
		retDataSet.push_back(reduced);   // also increments retDataSet.row
	}
	if (retDataSet.row > 0){
		retDataSet.col = retCol;
	}
	return retDataSet;
}
// Picks the feature column whose split yields the largest information gain.
// Every column except the last (the class column) is a candidate. Returns the
// winning column index, or -1 when no split has a gain strictly above zero.
int chooseBestFeatureToSplit(DataSet dataSet){
	int numFeatures = dataSet.col-1; // the last col is the feature we need to predict
	double baseEntropy = calcShannonEnt(dataSet);
	double bestInfoGain = 0.0;
	int bestFeature = -1;
	for (int i = 0; i < numFeatures; i++) {
		VS featList = dataSet.getCol(i);
		Set<String> uniqueVals(featList);
		double newEntropy = 0.0;   // entropy remaining after splitting on column i
		for(set<string> :: iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
			DataSet subDataSet = splitDataSet(dataSet, i, *value);
			// Weight each subset by its share of the ROWS. The previous code
			// divided column counts, which gave every subset the same weight
			// regardless of its size.
			double prob = subDataSet.row / static_cast<double>(dataSet.row);
			newEntropy += prob*calcShannonEnt(subDataSet);
		}
		double infoGain = baseEntropy - newEntropy;
		if (infoGain > bestInfoGain) {
			bestInfoGain = infoGain;
			bestFeature = i;
		}
	}
	return bestFeature;
}

作死地换了下数据结构……（11.20）

另一个测试样例：

ID	AUs (split by space)	Emotion
train1	1 2 5 25 27	7
train2	1 2 25 27	7
train3	1 4 15 17	6
train4	1 2 5 12 25 27	7
train5	1 4 17 39	6
train6	1 15 17	6
train7	1 2 5 15 25 27	7
train8	1 2 5 25 26	7
train9	1 2 5 16 25 27	7
train10	1 2 4 15 17	6
train11	1 4 7 20 25	4
train12	4 7 9 25	3

代码，做了str2int的操作：

#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;

#pragma region TypeName
typedef string String;
//typedef String Label;
typedef vector<String> VS;
typedef vector<int> VI;   // one integer-encoded table row, or one extracted column
#pragma endregion
#pragma region Global Variables
// gRow: number of lines init() takes from the data file (the first one is skipped).
// gCol: width of every row built by init(); the last column holds the class.
const static int gRow = 13, gCol = 40; // 39 + 1
VS labels;                   // column names
VI labelsId;                 // column indices, handed to createTree by main()
map<int, string> id2labels;  // column index -> name; used by Node::print for non-leaf nodes
map<int, string> id2class;   // class id -> text; used by Node::print for leaf nodes
#pragma endregion


#pragma region ClassType
// Row-major table of ints. `row` and `col` are public counters kept next to the
// storage; other code in this file reads and assigns them directly.
class DataSet {
public:
	// Starts out with zero rows and zero columns.
	DataSet() : row(0), col(0) {}

	// Records the requested dimensions and appends `row` empty rows; the rows
	// themselves are not widened to `col` entries here.
	DataSet(int row, int col) : row(row), col(col) {
		for (int pending = row; pending > 0; --pending) {
			dataSet.push_back(VI());
		}
	}

	// Bumps the row counter and appends one row.
	void push_back(VI v) {
		++row;
		dataSet.push_back(v);
	}

	// Collects entry `id` of each of the first `row` rows, top to bottom.
	VI getCol(int id) {
		VI column;
		int r = 0;
		while (r < row) {
			column.push_back(dataSet[r][id]);
			++r;
		}
		return column;
	}

	int row;
	int col;
	vector<VI> dataSet;
} dataSet;   // this also defines the single global table object
// One node of the decision tree.
//   featureLabel - integer id stored at the node; print() looks it up in
//                  id2class for leaves and in id2labels for every other node.
//   featureValue - the value of the parent's feature that selects this node.
class Node{
public:
	Node(int label, int value) : featureLabel(label), featureValue(value) {}

	// Appends a copy of `node` as the last child.
	void insert(const Node& node) { child.push_back(node); }
	vector<Node> ::iterator begin(){ return child.begin(); }
	vector<Node> ::iterator end(){ return child.end(); }
	// A node without children is a leaf.
	bool isLeafNode()const { return child.empty(); }

	// Returns the first child whose featureValue equals `value`.
	// Throws std::out_of_range when there is none; the previous version fell
	// off the end of the function in that case, which is undefined behaviour
	// for a function returning a reference.
	Node& getChild(int value){
		for (vector<Node> ::iterator it = child.begin(); it != child.end(); ++it) {
			if (it->featureValue == value)
				return *it;
		}
		throw out_of_range("Node::getChild: no child for the requested feature value");
	}
	//bool operator < (const Node& oNode) const{ return this->featureValue < oNode.featureValue; }

	// Breadth-first dump to cout: one line for this node and for every non-leaf
	// descendant, each listing the node followed by " -> " and its children.
	void print() const {
		// The queue holds pointers into this tree instead of Node objects:
		// copying a Node copies its entire subtree.
		queue<const Node*> pending;
		pending.push(this);
		while (!pending.empty()){
			const Node* current = pending.front();
			pending.pop();
			cout << current->featureValue << ":\"" << id2labels[current->featureLabel] << "\"";
			for (vector<Node> ::const_iterator it = current->child.begin();
			     it != current->child.end(); ++it) {
				if (it->isLeafNode()) {
					cout << " -> " << it->featureValue << ":\"" << id2class[it->featureLabel] << "\"";
				} else {
					cout << " -> " << it->featureValue << ":\"" << id2labels[it->featureLabel] << "\"";
					pending.push(&*it);   // only non-leaf children get a line of their own
				}
			}
			cout << endl;
		}
	}
private:
	int featureLabel; // index of the feature label (class id for a leaf)
	int featureValue; // value of the parent's feature that leads to this node
	vector<Node> child;
};
// Thin wrapper around std::set that exposes only construction from a vector,
// lookup, insertion, iteration and the element count.
template <class T>
class Set{
public:
	// Empty set.
	Set(){};

	// Takes every element of `v`; duplicates collapse into one entry.
	Set(vector<T>& v) : s(v.begin(), v.end()) {}

	// Iterator to `item`, or end() when it is absent.
	typename set<T> ::iterator find(const T& item) {
		typename set<T> ::iterator pos = s.find(item);
		return pos;
	}

	void insert(const T& item) {
		s.insert(item);
	}

	// Iteration is in the ordering of the underlying std::set.
	typename set<T> ::iterator begin(){
		return s.begin();
	}
	typename set<T> ::iterator end(){
		return s.end();
	}

	// Number of distinct elements, narrowed to int.
	int size() const{
		return static_cast<int>(s.size());
	}
private:
	set<T> s;
};
#pragma endregion


#pragma region FunctionPrototype
// Forward declarations; the definitions appear after main() further down.
// Every function takes its arguments by value, so each call works on copies.
template <typename T>
String ConvertToString(T);
double log2(double);
void printDataSet(DataSet);
double calcShannonEnt(DataSet);
DataSet splitDataSet(DataSet dataSet, int axis, int value);
int chooseBestFeatureToSplit(DataSet d);
int majorityCnt(VI classList);
Node createTree(DataSet dataSet, VI labels, int cvalue);
#pragma endregion

void init(){  // 我想考虑到通用情况，就是每个特征有很多取值[0~n] 
	dataSet.col = gCol;
	ifstream file("Dataset_txt format.txt");
	String str[gRow];
	for (int row = 0; !file.eof(); row++) {
		getline(file, str[row], '\n'); // 读入一行 
	}
	//for (int i = 0; i < gRow; i++) {	cout << str[i] << endl;	}

	for (int i = 1; i < gRow; i++) {
		stringstream ss;
		int id;
		String strId;
		VI v(gCol);
		ss << str[i];
		//cout << str[i] << endl;
		ss >> strId; // 去掉第一列，即为列号
	
		while (!ss.eof()) { // !!
			ss >> id;
			if(!ss.eof()) {  // 不是最后一列的数据 
				v[id-1] = 1;    // 这组数据集的特征除了0就是1
				//cout << "id:" << id << endl; 
			} else {         // 是最后一列的数据，类标签 
				v[gCol-1] = id;
				break;
			}
		}
		dataSet.push_back(v);
	}
	for (int i = 0; i < gCol; i++) {
		String label = "AU"+ConvertToString(i+1);
		//cout << "label: " << label << endl;
		labels.push_back(label);
		id2labels[i] = label;
		labelsId.push_back(i);
	}
	for ( int i = 0; i < 10; i++) {
		id2class[i] = ConvertToString(i);
	}
	file.close();
	cout << "dataSet:\n";
	printDataSet(dataSet);
	cout << "row: " << dataSet.row << "\ncol: " << dataSet.col << endl;
}
void init1() { // 自定义测试集 
	//	String str[5][3] = {{"1", "1", "yes"},
	//						{"1", "1", "yes"},
	//						{"1", "0", "no"},
	//						{"0", "1", "no"},
	//						{"0", "1", "no"}};

	dataSet.row = 5;
	dataSet.col = 3;
	int data[5][3] = { { 1, 1, 1 },
	{ 1, 1, 1 },
	 { 1, 0, 0 },
	 { 0, 1, 0 },
	 { 0, 1, 0 }};
	
	for (int i = 0; i < 5; i++) {
		size_t count=sizeof(data[i])/sizeof(int);
		VI v(data[i], data[i]+count);
		dataSet.dataSet.push_back(v);
	} printDataSet(dataSet); cout << endl;
	labels.push_back("No surfacing");
	labelsId.push_back(0);
	id2labels[0] = "No surfacing";
	
	labels.push_back("Flippers");
	labelsId.push_back(1);
	id2labels[1]="Flippers";
	
	labels.push_back("Fish");
	labelsId.push_back(2);
	id2labels[2] = "Fish";
	
	id2class[1] = "yes";
	id2class[0] = "no";
}
int main(){	
	
	init();
	//cout << calcShannonEnt(dataSet) << endl; // test1
	//printDataSet(splitDataSet(dataSet, 0, 1)); cout<< endl;	
	//printDataSet(splitDataSet(dataSet, 0, 0));//test2
	//cout << chooseBestFeatureToSplit(dataSet) << endl;// test3	
	createTree(dataSet, labelsId, -1).print();  // test4 -1是根节点 
	return 0;
}

// Prints the first `row` rows of the table to cout; each of the first `col`
// cells is followed by a space, and every row ends with endl.
void printDataSet(DataSet dataSet){
	int r = 0;
	while (r < dataSet.row) {
		int c = 0;
		while (c < dataSet.col) {
			cout << dataSet.dataSet[r][c] << ' ';
			++c;
		}
		cout << endl;
		++r;
	}
}
// Base-2 logarithm, computed as a ratio of natural logarithms.
double log2(double x){
	const double lnOfX = log(x);
	const double lnOfTwo = log(2);
	return lnOfX / lnOfTwo;
}
// Formats `value` with its stream insertion operator and returns the text.
template <typename T>
String ConvertToString(T value) {
	stringstream buffer;
	buffer << value;
	const String text = buffer.str();
	return text;
}
// Shannon entropy, in bits, of the class column (the last column) over the
// first `row` rows of `dataSet`. A table with no rows yields 0.0.
double calcShannonEnt(DataSet dataSet){
	const int total = dataSet.row;
	const int classCol = dataSet.col - 1;

	// How often each class value occurs.
	map<int, int> frequency;
	for (int r = 0; r < total; ++r) {
		++frequency[dataSet.dataSet[r][classCol]];
	}

	double entropy = 0.0;
	map<int, int> ::iterator entry = frequency.begin();
	while (entry != frequency.end()) {
		const double p = entry->second / (double)total;
		entropy -= p * log2(p);
		++entry;
	}
	return entropy;
}
// Returns the rows of `dataSet` whose column `axis` equals `value`, with that
// column removed from each returned row. The result's `row` counts the kept
// rows; its `col` is dataSet.col - 1 when at least one row was kept and stays
// 0 otherwise.
DataSet splitDataSet(DataSet dataSet, int axis, int value){
	DataSet subset;
	const int keptCols = dataSet.col - 1;
	for (int r = 0; r < dataSet.row; ++r) {
		if (dataSet.dataSet[r][axis] != value) {
			continue;
		}
		// Reduced row: the cells before `axis`, then the cells after it up to
		// the table's column count.
		VI::iterator first = dataSet.dataSet[r].begin();
		VI reduced(first, first + axis);
		reduced.insert(reduced.end(), first + axis + 1, first + dataSet.col);
		subset.push_back(reduced);   // also increments subset.row
	}
	if (subset.row > 0) {
		subset.col = keptCols;
	}
	return subset;
}
// Picks the feature column whose split yields the largest information gain.
// Every column except the last (the class column) is a candidate. Returns the
// winning column index, or -1 when no split has a gain strictly above zero.
int chooseBestFeatureToSplit(DataSet dataSet){
	int numFeatures = dataSet.col - 1; // the last col is the feature we need to predict
	double baseEntropy = calcShannonEnt(dataSet);
	double bestInfoGain = 0.0;
	int bestFeature = -1;
	for (int i = 0; i < numFeatures; i++) {
		VI featList = dataSet.getCol(i);
		Set<int> uniqueVals(featList);
		double newEntropy = 0.0;   // entropy remaining after splitting on column i
		for (set<int> ::iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
			DataSet subDataSet = splitDataSet(dataSet, i, *value);
			// Weight each subset by its share of the ROWS. The previous code
			// divided column counts, which gave every subset the same weight
			// regardless of its size.
			double prob = subDataSet.row / static_cast<double>(dataSet.row);
			newEntropy += prob*calcShannonEnt(subDataSet);
		}
		double infoGain = baseEntropy - newEntropy;
		if (infoGain > bestInfoGain) {
			bestInfoGain = infoGain;
			bestFeature = i;
		}
	}
	return bestFeature;
}
// Returns the most frequent value in `classList`. Counts are compared while
// scanning, so on a tie the value that reached the top count first is kept.
// Returns -1 for an empty list (the result used to be left uninitialised).
int majorityCnt(VI classList) {
	map<int, int> classCount;
	int majorityLabel = -1;
	int maxClassCount = 0;
	for (VI::iterator it = classList.begin(); it != classList.end(); ++it) {
		int& seen = classCount[*it];   // created as 0 on first sight
		++seen;
		if (maxClassCount < seen) {
			maxClassCount = seen;
			majorityLabel = *it;
		}
	}
	return majorityLabel;
}
// Recursively builds the decision tree for `dataSet`, whose last column holds
// the class to predict.
//   dataSet - rows still under consideration (features + class column).
//   labels  - ids of the feature labels, index-aligned with the feature
//             columns of dataSet.
//   cvalue  - stored as the new node's featureValue: the value of the parent's
//             feature that leads to this subset.
// Returns a leaf carrying the class when the subset is pure, when no feature
// columns remain, or when no feature can be chosen; otherwise an inner node
// carrying the chosen feature's id with one child per distinct value of it.
Node createTree(DataSet dataSet, VI labels, int cvalue){
	VI classList = dataSet.getCol(dataSet.col - 1);
	Set<int> uniqueVals(classList);
	if (uniqueVals.size() == 1) {   // every remaining row has the same class
		return Node(classList[0], cvalue);
	}
	if (dataSet.col == 1) {         // no features left that we can choose, choose major class
		return Node(majorityCnt(classList), cvalue);
	}
	int bestFeat = chooseBestFeatureToSplit(dataSet);
	// chooseBestFeatureToSplit reports -1 when no split improves on the current
	// entropy (e.g. identical feature rows with different classes). Indexing
	// `labels` with that would be out of bounds, so settle for the majority class.
	if (bestFeat < 0 || bestFeat >= static_cast<int>(labels.size())) {
		return Node(majorityCnt(classList), cvalue);
	}
	int bestFeatLabel = labels[bestFeat];
	Node myTree(bestFeatLabel, cvalue);  // the feature represented by this node is now fixed
	// The chosen column is removed from each subset below, so drop its id too
	// to keep ids and columns aligned.
	labels.erase(labels.begin() + bestFeat);
	VI featValues = dataSet.getCol(bestFeat);
	uniqueVals = Set<int>(featValues);
	for (set<int> ::iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
		// one branch per distinct value of the chosen feature
		myTree.insert(createTree(splitDataSet(dataSet, bestFeat, *value),
			labels, *value));
	}
	return myTree;
}