/*
C4.5

最新推荐文章于 2021-05-23 22:27:23 发布
DouMiaoO_Oo
最新推荐文章于 2021-05-23 22:27:23 发布
阅读量495
点赞数
分类专栏： machine learning
本文链接：https://blog.csdn.net/doumiaoo_oo/article/details/50964862
版权
machine learning 专栏收录该内容
1 篇文章 0 订阅
订阅专栏
*/
#include <algorithm>
#include <cmath>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
/* 数据集有两种类型
数据集类型1：
类别名称1 类别名称2 ... 类别名称n 类标签
...       ...           ...       类标签值1
                                  类标签值2
                                  ...
                                  类标签值m
数据集类型2：
测试集Id 测试集内容     测试集属性值
1        data1 data2 .. value1  
2
...
m
例子： 微博Id 微博内容       微博转发次数
        1     今天 天气 晴    0
        2     中大 新校长     10
        ...
*/
#pragma region TypeName
// Short aliases used throughout this file.
typedef string String;
//typedef String Label;
// A list of strings (used for the column names).
typedef vector<String> VS;
// A list of ints (one matrix row, one matrix column, or a list of ids).
typedef vector<int> VI;
#pragma endregion
#pragma region Global Variables
// Shape of the training file as init_exam() reads it: gRow counts the header
// row plus the data rows, gCol counts the feature columns plus the class column.
const static int gRow = 8, gCol = 5; // 7 + 1, 4 + 1
VS labels;  // column names
VI labelsId; // ids that correspond to the column names
map<int, string> id2labels; // maps a column id to its column name
map<int, string> id2class;  // maps an id to the distinct class-label values of the last column
// Author's note: this is not a good design; every column should get its own
// map<int, string> for mapping ids to attribute values.
#pragma endregion


#pragma region ClassType
/**
 * Integer matrix stored as a vector of rows, plus one id -> name table per
 * column (i2s).
 *
 * `row` and `col` are public and are NOT kept in sync automatically with the
 * contents of `dataSet` by every operation: resize() sets both, push_back()
 * only increments `row`, and callers are free to assign them directly.
 */
class DataSet{
public:
	/** Creates an empty data set with zero rows and zero columns. */
	DataSet() : row(0), col(0) {}

	/** Creates a row x col matrix whose cells are all 0. */
	DataSet(int row, int col){
		this->resize(row, col);
	}

	/**
	 * Discards every stored row and rebuilds the matrix as row x col zeros.
	 * i2s is resized to `col` entries; tables that already exist for the
	 * first `col` columns are kept as they are.
	 */
	void resize(int row, int col){
		this->col = col;
		this->row = row;
		dataSet.clear();
		for (int i = 0; i < row; i++) {
			dataSet.push_back(VI(col));
		}
		i2s.resize(col);
	}

	/** Row access, so that callers can write dataSet[i][j]. No bounds check. */
	vector<int>& operator[](int row){
		return dataSet[row];
	}

	/** Read-only row access for const data sets. No bounds check. */
	const vector<int>& operator[](int row) const{
		return dataSet[row];
	}

	/**
	 * Appends one row and increments `row`.
	 * Neither `col` nor `i2s` is updated by this method.
	 */
	void push_back(const VI& v) {
		++row;
		dataSet.push_back(v);
	}

	/**
	 * Returns a copy of column `id`, taken from the first `row` stored rows.
	 * No bounds check is performed on `id`.
	 */
	VI getCol(int id) const {
		VI column;
		if (row > 0) {
			column.reserve(row);
		}
		for (int i = 0; i < row; i++) {
			column.push_back(dataSet[i][id]);
		}
		return column;
	}

	int row;  // number of rows the data set claims to hold
	int col;  // number of columns the data set claims to hold
	vector<VI> dataSet;  // the rows themselves
	vector< map<int, string> > i2s; // per column: id -> attribute name
} dataSet;   // the single global instance used by the rest of the file
/**
 * One node of the decision tree.
 *
 * featureLabelId is the id of the feature this node splits on; for a leaf
 * (a node without children) it is the id of the predicted class instead.
 * featureValue is the value of the PARENT's feature that leads to this node.
 */
class Node{
public:
	Node(int label, int value) : featureLabelId(label), featureValue(value) {}

	/** Appends a copy of `node` as the last child. */
	void addChild(const Node& node) { child.push_back(node); }

	/** A node is a leaf exactly when it has no children. */
	bool isLeafNode() const { return child.empty(); }

	/**
	 * Returns the child reached through feature value `value`.
	 * @throws std::out_of_range when no child carries that value. (The function
	 *         used to run off its end in that case, which is undefined behaviour.)
	 */
	Node& getChild(int value){
		for (vector<Node>::iterator it = child.begin(); it != child.end(); ++it) {
			if (it->featureValue == value)
				return *it;
		}
		throw std::out_of_range("Node::getChild: no branch for the requested feature value");
	}

	/** Read-only counterpart of getChild(); same lookup, same exception. */
	const Node& getChild(int value) const{
		for (vector<Node>::const_iterator it = child.begin(); it != child.end(); ++it) {
			if (it->featureValue == value)
				return *it;
		}
		throw std::out_of_range("Node::getChild: no branch for the requested feature value");
	}

	/** Class label of a leaf, looked up in id2class. Precondition: this is a leaf. */
	string getClassLabel() const{
		return id2class[this->featureLabelId];
	}

	int getFeatureLabelId() const{
		return this->featureLabelId;
	}

	/**
	 * Prints the tree breadth-first, one line per visited node:
	 *   value:"name" -> childValue:"childName" -> ...
	 * Leaf names come from id2class, inner-node names from id2labels.
	 * Only inner children are queued, so leaves appear solely on their
	 * parent's line (or on the first line when the root itself is a leaf).
	 */
	void print() const{
		queue<const Node*> pending;  // pointers: no subtree is copied while printing
		pending.push(this);
		while (!pending.empty()){
			const Node* current = pending.front();
			pending.pop();
			// Only the root can be a leaf here; its id is a class id, not a feature id.
			const string& ownName = current->isLeafNode()
				? id2class[current->featureLabelId]
				: id2labels[current->featureLabelId];
			cout << current->featureValue << ":\"" << ownName << "\"";
			for (vector<Node>::const_iterator it = current->child.begin();
					it != current->child.end(); ++it) {
				if (it->isLeafNode()) {
					cout << " -> " << it->featureValue << ":\"" << id2class[it->featureLabelId] << "\"";
				} else {
					cout << " -> " << it->featureValue << ":\"" << id2labels[it->featureLabelId] << "\"";
					pending.push(&*it);
				}
			}
			cout << endl;
		}
	}
private:
	int featureLabelId; // feature id; for a leaf, the class id (acknowledged as poor design)
	int featureValue; // value of the parent's feature that selects this node
	vector<Node> child;
};
/**
 * Thin wrapper around std::set<T> that can be built directly from a vector.
 * Elements are unique and iterate in ascending order, as in std::set.
 */
template <class T>
class Set{
public:
	/** Creates an empty set. */
	Set() {}

	/**
	 * Collects the distinct elements of `v`.
	 * Takes the vector by const reference so that const vectors and
	 * temporaries are accepted as well.
	 */
	Set(const vector<T>& v) : s(v.begin(), v.end()) {}

	typename set<T>::iterator find(const T& item) { return s.find(item); }
	typename set<T>::const_iterator find(const T& item) const { return s.find(item); }

	void insert(const T& item) { s.insert(item); }

	typename set<T>::iterator begin() { return s.begin(); }
	typename set<T>::iterator end() { return s.end(); }
	typename set<T>::const_iterator begin() const { return s.begin(); }
	typename set<T>::const_iterator end() const { return s.end(); }

	/** Number of distinct elements (kept as int for existing callers). */
	int size() const { return static_cast<int>(s.size()); }
private:
	set<T> s;
};
#pragma endregion


#pragma region FunctionPrototype
// Formats any streamable value as a String.
template <typename T>
String ConvertToString(T);
// Base-2 logarithm, computed as log(x) / log(2).
double log2(double);
// Prints the elements of v on one line, each followed by a space.
template <typename T>
void printVector(vector<T> v);
// Prints the matrix to stdout, one row per line.
void printDataSet(DataSet);
// Shannon entropy of the class labels stored in the last column.
double calcShannonEnt(DataSet);
// Rows whose column `axis` equals `value`, with that column removed.
DataSet splitDataSet(DataSet dataSet, int axis, int value);
// Index of the feature column with the highest information gain ratio, or -1.
int chooseBestFeatureToSplit(DataSet d);
// The value that occurs most often in classList.
int majorityCnt(VI classList);
// Builds the decision tree recursively; cvalue is the parent's feature value
// that leads to the node being built (main passes -1 for the root).
Node createTree(DataSet dataSet, VI labels, int cvalue);
#pragma endregion

/**
 * Fills the globals with a small built-in sample instead of reading a file:
 * five rows of two binary features plus a binary class column
 * (1 = "yes", 0 = "no"), under the column names
 * "No surfacing", "Flippers" and "Fish".
 *
 * The rows are appended to dataSet.dataSet, the matrix is echoed to stdout
 * followed by a blank line, and labels / labelsId / id2labels / id2class
 * are extended accordingly.
 */
void init_default() {
	static const int kRows = 5;
	static const int kCols = 3;
	static const int samples[kRows][kCols] = {
		{ 1, 1, 1 },
		{ 1, 1, 1 },
		{ 1, 0, 0 },
		{ 0, 1, 0 },
		{ 0, 1, 0 }
	};
	static const char* const names[kCols] = { "No surfacing", "Flippers", "Fish" };

	dataSet.row = kRows;
	dataSet.col = kCols;
	for (int r = 0; r < kRows; ++r) {
		dataSet.dataSet.push_back(VI(samples[r], samples[r] + kCols));
	}
	printDataSet(dataSet);
	cout << endl;

	// Column c is registered under id c.
	for (int c = 0; c < kCols; ++c) {
		labels.push_back(names[c]);
		labelsId.push_back(c);
		id2labels[c] = names[c];
	}

	id2class[0] = "no";
	id2class[1] = "yes";
}
/**
 * Loads the training set from the file "Dataset_train" into the globals.
 *
 * File layout: the first line holds gCol whitespace-separated column names
 * (the last one names the class column); every following non-blank line is
 * one sample with up to gCol whitespace-separated values. Missing trailing
 * values are treated as empty strings.
 *
 * Each column is encoded independently: distinct strings receive the ids
 * 0, 1, 2, ... in order of first appearance. The id -> string table of
 * column j is stored in dataSet.i2s[j]; the table of the last column is
 * mirrored into id2class. labels, labelsId and id2labels receive the column
 * names.
 *
 * The number of rows is taken from the file itself. Earlier the function
 * looped on `!file.eof()` into a fixed array of gRow-1 strings, which wrote
 * past the array for a file ending in a newline (or having more lines) and
 * never terminated when the file could not be opened.
 *
 * @throws std::runtime_error when the file cannot be opened.
 */
void init_exam(){
	ifstream file("Dataset_train");
	if (!file) {
		throw std::runtime_error("init_exam: cannot open \"Dataset_train\"");
	}

	{   // First line: feature names followed by the class-column name.
		string header;
		getline(file, header);
		stringstream ss(header);
		string label;
		for (int j = 0; j < gCol; j++) {
			ss >> label;
			labels.push_back(label);
			id2labels[j] = label;
			labelsId.push_back(j);
		}
	}

	// Remaining lines: one sample each, still as strings.
	vector<VS> rows;
	if (gRow > 1) {
		rows.reserve(gRow - 1);  // expected size only; the file decides
	}
	string line;
	while (getline(file, line)) {
		stringstream ss(line);
		VS fields(gCol);
		int parsed = 0;
		while (parsed < gCol && (ss >> fields[parsed])) {
			++parsed;
		}
		if (parsed == 0) {
			continue;  // blank line (e.g. the one after a trailing newline)
		}
		rows.push_back(fields);
	}

	const int numRows = static_cast<int>(rows.size());
	dataSet.resize(numRows, gCol);

	// Encode column by column so that every column gets its own id space.
	for (int j = 0; j < gCol; j++) {
		map<string, int> s2i;  // value -> id, for this column only
		int nextId = 0;
		for (int i = 0; i < numRows; i++) {
			const string& value = rows[i][j];
			map<string, int>::iterator known = s2i.find(value);
			if (known != s2i.end()) {
				dataSet[i][j] = known->second;
				continue;
			}
			s2i[value] = nextId;
			dataSet.i2s[j][nextId] = value;
			if (j == gCol - 1) {  // the last column holds the class labels
				id2class[nextId] = value;
			}
			dataSet[i][j] = nextId++;
		}
	}
	// `file` is closed by its destructor.
}

// The test set is hard-coded (author's note: this design is not general).
// test_set holds test_row samples of test_col already-encoded feature values.
int test_row = 7;
int test_col = 4;
int test_set[7][4];

/**
 * Copies the built-in samples into test_set.
 * Only the first test_row rows and the first test_col columns are written.
 */
void loadTestSet(){
	// Reading the samples from a file ("Dataset_test") was planned but is not implemented.
	static const int samples[7][4] = {
		{0, 1, 0, 0},
		{0, 2, 1, 0},
		{2, 1, 1, 0},
		{0, 1, 1, 1},
		{1, 1, 0, 1},
		{1, 0, 1, 0},
		{2, 1, 0, 1}
	};
	for (int r = 0; r < test_row; ++r) {
		const int* from = samples[r];
		int* to = test_set[r];
		for (int c = 0; c < test_col; ++c) {
			to[c] = from[c];
		}
	}
}
void classify(const Node& root){
	for(int i = 0; i < test_row; i++){
		Node node = root;
		while(!node.isLeafNode()){
			// featureLabelId is col id
			node = node.getChild(test_set[i][node.getFeatureLabelId()]);
		} cout << node.getClassLabel() << endl;
	}
}
/**
 * Trains a tree on the training file, then classifies the built-in test set.
 * Returns 0 on success and 1 when any step reports an error by exception.
 */
int main(){
	try {
		init_exam();
		// Manual checks used during development:
		//	cout << calcShannonEnt(dataSet) << endl; // test1
		//	printDataSet(splitDataSet(dataSet, 0, 1)); cout<< endl;
		//	printDataSet(splitDataSet(dataSet, 0, 0)); // test2
		//	cout << chooseBestFeatureToSplit(dataSet) << endl; // test3
		//	createTree(dataSet, labelsId, -1).print();  // test4, -1 marks the root

		// start process
		Node root = createTree(dataSet, labelsId, -1);
		loadTestSet();
		classify(root);
		// end process
	} catch (const std::exception& e) {
		// Report the failure instead of terminating without a message.
		cerr << "error: " << e.what() << endl;
		return 1;
	}
	return 0;
}

/**
 * Writes the matrix to stdout: dataSet.row lines, each holding dataSet.col
 * values, every value followed by a single space.
 */
void printDataSet(DataSet dataSet){
	const int rows = dataSet.row;
	const int cols = dataSet.col;
	for (int r = 0; r < rows; ++r) {
		for (int c = 0; c < cols; ++c) {
			cout << dataSet[r][c] << ' ';
		}
		cout << endl;
	}
}
/**
 * Base-2 logarithm via the change-of-base rule: ln(x) / ln(2).
 * NOTE(review): <cmath> may already declare a global log2 on some toolchains;
 * confirm that this definition does not clash with it on the target compiler.
 */
double log2(double x){
	const double lnTwo = std::log(2.0);
	return std::log(x) / lnTwo;
}
/**
 * Prints the elements of `v` to stdout on one line, each followed by a
 * single space, then ends the line.
 */
template <typename T>
void printVector(vector<T> v){
	for (typename vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
		cout << *it << ' ';
	}
	cout << endl;
}
/**
 * Returns the text that streaming `value` with operator<< produces.
 */
template <typename T>
String ConvertToString(T value) {
	ostringstream out;
	out << value;
	return out.str();
}
/**
 * Shannon entropy (in bits) of the class labels, i.e. of the values in the
 * last column of `dataSet`: the sum over all labels of -p * log2(p), where p
 * is the label's share of the rows. Returns 0 for a data set without rows.
 */
double calcShannonEnt(DataSet dataSet){
	const int classCol = dataSet.col - 1;

	// How many rows carry each class label.
	map<int, int> tally;
	for (int r = 0; r < dataSet.row; ++r) {
		++tally[dataSet.dataSet[r][classCol]];
	}

	const double total = dataSet.row;
	double entropy = 0.0;
	for (map<int, int>::const_iterator it = tally.begin(); it != tally.end(); ++it) {
		const double p = it->second / total;
		entropy -= p * log2(p);
	}
	return entropy;
}
/**
 * Selects the rows whose column `axis` equals `value` and returns them with
 * that column removed.
 *
 * The result's `row` is the number of selected rows. Its `col` is
 * dataSet.col - 1 when at least one row was selected and stays 0 otherwise.
 * The i2s tables are not carried over.
 */
DataSet splitDataSet(DataSet dataSet, int axis, int value){
	DataSet subset;  // starts with row == 0 and col == 0
	const int keptCols = dataSet.col - 1;
	for (int r = 0; r < dataSet.row; ++r) {
		const VI& source = dataSet.dataSet[r];
		if (source[axis] != value) {
			continue;
		}
		// Everything before `axis`, then everything after it.
		VI trimmed;
		trimmed.reserve(keptCols);
		trimmed.insert(trimmed.end(), source.begin(), source.begin() + axis);
		trimmed.insert(trimmed.end(), source.begin() + axis + 1, source.begin() + dataSet.col);
		subset.push_back(trimmed);
	}
	if (subset.row > 0) {
		subset.col = keptCols;
	}
	return subset;
}
/**
 * Picks the feature column with the highest information gain ratio (C4.5).
 *
 * For every column except the last (the class column) the data set is split
 * by each distinct value of the column and
 *   gain      = entropy(dataSet) - sum(p_v * entropy(subset_v))
 *   splitInfo = - sum(p_v * log2(p_v))
 *   ratio     = gain / splitInfo
 * are computed, with p_v the share of rows in subset v.
 *
 * A column with a single distinct value has splitInfo == 0; its ratio is
 * defined as 0 here instead of evaluating 0/0.
 *
 * The per-feature values are written to stdout as diagnostics.
 *
 * @return the index of the best column, or -1 when no column has a ratio
 *         greater than 0 (this includes data sets without feature columns).
 */
int chooseBestFeatureToSplit(DataSet dataSet){
	const int numFeatures = dataSet.col - 1; // the last column is what we predict
	const double baseEntropy = calcShannonEnt(dataSet);
	double bestInfoGainRatio = 0.0;
	int bestFeature = -1;
	for (int i = 0; i < numFeatures; i++) {
		VI featList = dataSet.getCol(i);
		Set<int> uniqueVals = Set<int>(featList);
		double newEntropy = 0.0;
		double SplitInfo = 0.0; // the C4.5 normaliser
		for (set<int>::iterator value = uniqueVals.begin(); value != uniqueVals.end(); ++value) {
			DataSet subDataSet = splitDataSet(dataSet, i, *value);
			const double prob = subDataSet.row / (double)(dataSet.row);
			newEntropy += prob * calcShannonEnt(subDataSet);
			SplitInfo -= prob * log2(prob);
		}
		const double infoGain = baseEntropy - newEntropy;
		cout << "SplitInfo: " << SplitInfo << endl;
		// A split into a single subset carries no information: avoid 0/0.
		const double infoGainRatio = (SplitInfo > 0.0) ? infoGain / SplitInfo : 0.0;
		// ---- diagnostic output ----
		cout << "infoGainRatio " << infoGainRatio << endl;
		// ---- diagnostic output ----
		if (infoGainRatio > bestInfoGainRatio) {
			bestInfoGainRatio = infoGainRatio;
			bestFeature = i;
		}
	}
	return bestFeature;
}
/**
 * Returns the value that occurs most often in `classList` (the class-label
 * column). On a tie, the value that reaches the highest count first while
 * scanning from the front wins.
 *
 * @return the majority value, or -1 for an empty list (previously an
 *         uninitialised int was returned in that case).
 */
int majorityCnt(VI classList) {
	map<int, int> classCount;
	int majorityLabel = -1;
	int maxClassCount = 0;
	for (VI::const_iterator it = classList.begin(); it != classList.end(); ++it) {
		const int seen = ++classCount[*it];  // one lookup per element
		if (seen > maxClassCount) {
			maxClassCount = seen;
			majorityLabel = *it;
		}
	}
	return majorityLabel;
}
/**
 * Recursively builds the decision tree for `dataSet`.
 *
 * @param dataSet  samples; the last column holds the class labels.
 * @param labelsId ids of the columns of `dataSet`, in column order (this
 *                 parameter shadows the global of the same name).
 * @param cvalue   value of the parent's feature that leads to the node being
 *                 built; callers pass -1 for the root.
 * @return the root of the (sub)tree.
 *
 * A leaf is produced when
 *  - the data set has no rows (class id -1, i.e. no class known),
 *  - all rows share one class (that class),
 *  - only the class column is left (majority class), or
 *  - no feature has a positive gain ratio (majority class). Previously this
 *    last case used the -1 returned by chooseBestFeatureToSplit as an index
 *    into labelsId.
 * Otherwise the node splits on the best feature and gets one child per
 * distinct value of that feature.
 */
Node createTree(DataSet dataSet, VI labelsId, int cvalue){
	VI classList = dataSet.getCol(dataSet.col - 1);  // the class-label column
	if (classList.empty()) {
		return Node(-1, cvalue);  // nothing to learn from
	}
	Set<int> uniqueVals(classList);
	if (uniqueVals.size() == 1) {   // every row has the same class
		return Node(classList[0], cvalue);
	}
	if (dataSet.col == 1) { // no features left to split on
		return Node(majorityCnt(classList), cvalue);
	}
	const int bestFeat = chooseBestFeatureToSplit(dataSet);  // a column index
	if (bestFeat < 0 || bestFeat >= static_cast<int>(labelsId.size())) {
		// No feature separates the classes any further.
		return Node(majorityCnt(classList), cvalue);
	}
	Node myTree(labelsId[bestFeat], cvalue);
	// The chosen column is removed by splitDataSet, so drop its id as well.
	labelsId.erase(labelsId.begin() + bestFeat);
	VI featValues = dataSet.getCol(bestFeat);
	uniqueVals = Set<int>(featValues);
	for (set<int>::iterator value = uniqueVals.begin(); value != uniqueVals.end(); ++value) {
		// one branch per distinct value of the chosen feature
		myTree.addChild(createTree(splitDataSet(dataSet, bestFeat, *value),
			labelsId, *value));
	}
	return myTree;
}