C++实现决策树

实现的理论依据是信息增益,注释都写在代码里,代码比较好懂,文章最后附有两组测试数据。

这是运行结果

( ) 内是特征,[ ] 内是该特征的具体取值,比如 (身高) 下面有 [高]、[矮]、[中等]。

代码在这里~

#include <iostream>
#include <cctype>
#include <cstring>
#include <string>
#include <cmath>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <queue>
#include <cstdio>
#include <cstdlib>
#define inf 0x3f3f3f3f
#define LL long long 
#define maxn 10005
using namespace std;

// Not referenced by any code shown in this file.
int hu = 0;

int NUM;                // number of data rows (samples); filled in by read()
int ALL;                // number of attributes (features) per row; filled in by read()
// Class label of each sample, indexed 1..NUM: 1 when the label token is "yes", 0 otherwise.
int state[maxn];
// Maps every attribute name and attribute value string to an integer id (ids start at 1).
map<string, int> big_map;
// Reverse lookup of big_map: id -> original string (used when printing the tree).
map<int, string> sma_map;
// info[i][j] is the value of attribute j (0-based) for sample i (1-based).
vector<string> info[maxn];
// Attribute names, in the order they appear in the input header.
vector<string> s;
// Not referenced by any code shown in this file.
int ttt = 0;


// One node of the decision tree. Internal nodes split on `feature`; leaves carry a class in `state`.
struct Tree_node{
	// Id (as stored in big_map / sma_map) of the attribute this node splits on.
	int feature;
	int edge[50];                      
	// edge[i] is the id of the attribute value (branch of this node) that leads to child[i].
	Tree_node* child[50];
	// Number of entries of edge[] / child[] that are in use.
	int size; 
	// Number of positive-labelled samples that reached this node.
	int positive;
	// Number of negative-labelled samples that reached this node.
	int negative;
	int state;                                
	// state = -1 means the node has children; state = 1 or state = 0 marks a leaf with that class.
};

// Partition of sample indices produced by a split: get_information_gain() appends to a[e]
// the index of every sample whose value for the chosen attribute is the e-th distinct value seen.
struct TwoDimension{
	vector<int> a[maxn];
};

// Debug helper: dumps the parsed data set to stdout.
// Output is the attribute-name header, then one line per sample with its attribute
// values, then all class labels on a single unterminated line.
void print_data()
{
	// Header row with the attribute names.
	int col = 0;
	while(col < ALL)
	{
		cout << s[col] << "  ";
		++col;
	}
	printf("\n");

	// One line per sample (samples are stored at indices 1..NUM).
	int row = 1;
	while(row <= NUM)
	{
		col = 0;
		while(col < ALL)
		{
			cout << info[row][col] << "  ";
			++col;
		}
		printf("\n");
		++row;
	}

	// Finally the 0/1 class label of every sample.
	row = 1;
	while(row <= NUM)
	{
		printf("%d  ", state[row]);
		++row;
	}
}

void read()
{
	string temp;
	int y = 0;
	scanf("%d%d", &NUM, &ALL);
	for(int i = 1; i <= ALL; i++)
	{
		cin >> temp;
		s.push_back(temp);
		if(!big_map.count(temp))
		{
			big_map[temp] = ++y;
			sma_map[y] = temp;
		}
	}
	for(int i = 1; i <= NUM; i++)
	{
		int h;
		scanf("%d", &h);
		for(int j = 1; j <= ALL; j++)
		{
			cin >> temp;
			info[i].push_back(temp);
			if(!big_map.count(temp))
			{
				big_map[temp] = ++y;
				sma_map[y] = temp;
			}
		}
		cin >> temp;
		if(temp == "yes")
			state[i] = 1;
		else
			state[i] = 0;
	}
}

// Recursively prints the subtree rooted at `root` to stdout.
// A missing subtree prints an empty line, a leaf prints its class and sample counts,
// and an internal node prints "(attribute)" followed by one "[value]" line per branch,
// indented by 8 spaces per nesting level. `level` is the depth of `root` (0 for the tree root).
void print_decision_tree(Tree_node* root, int level)
{
	if(root == NULL)
	{
		printf("\n");
		return;
	}

	if(root->state != -1)
	{
		// Leaf: class plus the number of positive / negative samples behind it.
		printf("(State:%d, Yes:%d, No:%d)\n", root->state, root->positive, root->negative);
		return ;
	}

	cout << "(" << sma_map[root->feature] << ")" << endl;

	const int indent = (level + 1) * 8;
	for(int branch = 0; branch < root->size; ++branch)
	{
		for(int pad = 0; pad < indent; ++pad)
			printf(" ");
		cout << "[" << sma_map[root->edge[branch]] << "]";
		print_decision_tree(root->child[branch], level + 1);
	}
}

// Binary Shannon entropy, in bits, of a set with `yes` positive and `no` negative samples.
//
// yes, no : non-negative sample counts of the two classes.
// sum     : total number of samples (normally yes + no).
// Returns -(p_yes*log2(p_yes) + p_no*log2(p_no)), where an empty class contributes 0.
// An empty set (sum <= 0) has entropy 0 instead of the NaN that 0/0 would produce.
double entropy(double yes, double no, double sum)
{
	if(sum <= 0)
		return 0.0;

	const double ln2 = std::log(2.0);
	double weighted_logs = 0.0;
	// The limit of p*log(p) for p -> 0 is 0, so a class with no samples is skipped.
	if(yes > 0)
		weighted_logs += yes/sum*std::log(yes/sum)/ln2;
	if(no > 0)
		weighted_logs += no/sum*std::log(no/sum)/ln2;
	return -weighted_logs;
}


int get_information_gain(int& positive, int& negative, vector<string>& str, vector<int>& use_these_data, TwoDimension& next_use_data , int* lock_feature)
{
	int t = 0;
	int rest_feature = 0;
	int rest_id;
	int num[maxn]; 
	int yes[maxn];
	int no[maxn];
	double Gain[maxn];
	
	map<string, int> mp;
	
	memset(Gain, -1, sizeof(Gain));
	
	//验证是否所有数据都正或都负 
	int a = 0;
	int b = 0;
	for(int i = 0; i < use_these_data.size(); i++)
	{
		int j = use_these_data[i];
		if(state[j])	a++;
		else	b++;
	}
	if(a == use_these_data.size())
	{
		positive = use_these_data.size();
		negative = 0;
		return -2;
	}
	if(b == use_these_data.size())
	{
		positive = 0;
		negative = use_these_data.size();
		return -2;
	}
	
	for(int i = 0; i < ALL; i++)
	{
		//数据初始化 
		memset(num, 0, sizeof(num));
		memset(yes, 0, sizeof(yes));
		memset(no, 0, sizeof(no)); 
		mp.clear();
		t = 0;
		//特征锁为0,此特征可用 
		if(!lock_feature[i])
		{
			rest_feature ++; 
			rest_id = i;
			Gain[i] = 0;
			double sum = use_these_data.size();
			for(int k = 0; k < use_these_data.size(); k++)
			{
				int j = use_these_data[k];
				//在数据集中找到此特征值 
				if(!mp.count(info[j][i]))
					mp[info[j][i]] = ++t;
				int e = mp[info[j][i]];
				//num[e]代表一个特征下各特征值的数量                      
				num[e]++;
				//注意,e最小是1,所以用yes[e]和no[e]表示具体特征值的正负例个数,用yes[0]和no[0]表示此特征的正负例个数 
				if(state[j])	
				{
					yes[e]++;
					yes[0]++;
				}
				else
				{	
					no[e]++;
					no[0]++;
				}		
			}
			//计算此特征的information gain 
			for(int j = 0; j <= t; j++)
			{
				if(!j)
					Gain[i] += entropy(yes[0], no[0], yes[0]+no[0]);
				else
					Gain[i] -= num[j]/sum*entropy(yes[j], no[j], yes[j]+no[j]);
			}
		}
	}
	
	//剩余特征只有一个,不会再往下分了
	if(rest_feature == 1)
	{
		for(int i = 0; i < use_these_data.size(); i++)
		{
			int j = use_these_data[i];
			if(state[j])	positive++;
			else			negative++;
		}
		str.push_back(s[rest_id]);
		return -3;
	}
			
	double target = -1;
	int id = -1;
	for(int i = 0; i < ALL; i++)
	{
		if(Gain[i] > target && !lock_feature[i])
		{
			target = Gain[i];
			id = i;
		}
	}
	if(id == -1)
		return -1;
	else
	{
		t = 0;
		positive = negative = 0;
		mp.clear();
		for(int i = 0; i < use_these_data.size(); i++)
		{
			int j = use_these_data[i];
			if(!mp.count(info[j][id]))
			{
				str.push_back(info[j][id]);
				mp[info[j][id]] = t++;
			}
			int e = mp[info[j][id]];
			next_use_data.a[e].push_back(j);
			if(state[j])	positive ++;
			else	negative ++;	
		}
	}
	return id;
}
	
// Allocates a Tree_node with every field initialised: no children, zero counts,
// and `node_state` as its state. Returns NULL when the allocation fails.
static Tree_node* new_tree_node(int node_state)
{
	Tree_node* node = (Tree_node*) malloc (sizeof(Tree_node));
	if(node == NULL)
		return NULL;
	node->feature = 0;
	node->size = 0;
	node->positive = 0;
	node->negative = 0;
	node->state = node_state;
	const size_t capacity = sizeof(node->child) / sizeof(node->child[0]);
	for(size_t i = 0; i < capacity; i++)
	{
		node->edge[i] = 0;
		node->child[i] = NULL;
	}
	return node;
}

// Creates a leaf whose class is the majority label (ties count as class 0).
static Tree_node* new_leaf_node(int positive, int negative)
{
	Tree_node* leaf = new_tree_node(positive > negative ? 1 : 0);
	if(leaf != NULL)
	{
		leaf->positive = positive;
		leaf->negative = negative;
	}
	return leaf;
}

// Recursively builds the decision (sub)tree for the samples in `use_these_data`.
//
// use_these_data : indices of the samples that reach this node.
// lock_feature   : lock_feature[i] != 0 marks attribute i as already used on the path
//                  from the root. The array is not modified; each level passes its
//                  own copy down to its children.
//
// Returns the root of the subtree, or NULL when no attribute is left to split on or
// an allocation fails. Nodes are allocated with malloc and are not freed by this file.
Tree_node* build_decision_tree(vector<int> use_these_data, int* lock_feature)   // the attribute locks are not global
{
	int positive = 0;
	int negative = 0;
	vector<string> str;
	// TwoDimension holds maxn vectors, so keep it off the stack of this recursive function.
	vector<TwoDimension> partition_storage(1);
	TwoDimension& next_use_data = partition_storage[0];

	const int target = get_information_gain(positive, negative , str, use_these_data, next_use_data, lock_feature);

	// Special return codes.
	if(target == -1)
		return NULL;
	if(target == -2)
		// Every sample has the same label.
		return new_leaf_node(positive, negative);

	Tree_node* neww = new_tree_node(-1);
	if(neww == NULL)
		return NULL;
	neww->positive = positive;
	neww->negative = negative;

	if(target == -3)
	{
		// Only one attribute is left (its name is in str[0]); it gets a single
		// majority-class child. That child belongs to no attribute value, so its
		// edge id stays 0, which is not a valid id in sma_map.
		neww->feature = big_map[str[0]];
		neww->child[neww->size++] = new_leaf_node(positive, negative);
		return neww;
	}

	// Regular split. get_information_gain() returns -2 for single-class data,
	// so both classes are present here.
	neww->feature = big_map[s[target]];

	// The children may use every attribute still unlocked here except the one just chosen.
	vector<int> child_lock(lock_feature, lock_feature + ALL);
	child_lock[target] = 1;

	const size_t capacity = sizeof(neww->child) / sizeof(neww->child[0]);
	for(size_t i = 0; i < str.size(); i++)
	{
		if(i >= capacity)
		{
			// edge[] / child[] are fixed-size; refuse to write past them.
			fprintf(stderr, "build_decision_tree: attribute '%s' has %lu values, only the first %lu are kept\n",
				s[target].c_str(), (unsigned long) str.size(), (unsigned long) capacity);
			break;
		}
		Tree_node* subtree = build_decision_tree(next_use_data.a[i], &child_lock[0]);
		neww->edge[neww->size] = big_map[str[i]];
		neww->child[neww->size] = subtree;
		neww->size++;
	}
	return neww;
}

				
// Entry point: reads the data set from "ztest.txt", builds the decision tree over
// all samples with every attribute available, and prints the tree to stdout.
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
	// Redirect stdin to the data file; without it there is nothing to read.
	if(freopen("ztest.txt","r",stdin) == NULL)
	{
		fprintf(stderr, "cannot open ztest.txt\n");
		return 1;
	}

	// Load the data set.
	read();
	// print_data();   // uncomment to dump the parsed data

	// Every sample (indices 1..NUM) takes part in the root split.
	vector<int> use_these_data;
	for(int i = 1; i <= NUM; i++)
		use_these_data.push_back(i);

	// No attribute is locked at the root.
	int lock_feature[maxn];
	memset(lock_feature, 0, sizeof(lock_feature));

	// Build and print the tree.
	Tree_node* decision_tree = build_decision_tree(use_these_data, lock_feature);
	printf("\n\n");
	print_decision_tree(decision_tree, 0);
	return 0;
}
	

测试数据1

15  4
weather	temperature wind	forage
sunny	hot	high	false	no
sunny	hot	high	true	no
overcast	hot	high	false	yes
rainy	warm	high	false	yes
rainy	cool	norm	false	yes
rainy	cool	norm	true	no
overcast	cool	norm	true	yes
sunny	warm	high	false	no
sunny      warm       high         false        no
sunny	cool	norm	false	yes
rainy	cool	norm	false	yes
sunny	warm	norm	true	yes
overcast	warm	high	true	yes
overcast	hot	norm	false	yes
rainy	warm	high	true	no

测试数据2

240 4
size  feature  price  height  
small  strong  expensive  high  no  
small  strong  expensive  a_bit_high  no  
small  strong  expensive  common  yes  
small  strong  expensive  low  no  
small  strong  a_litle_expensive  high  yes  
small  strong  a_litle_expensive  a_bit_high  no  
small  strong  a_litle_expensive  common  yes  
small  strong  a_litle_expensive  low  no  
small  strong  affordable  high  yes  
small  strong  affordable  a_bit_high  yes  
small  strong  affordable  common  yes  
small  strong  affordable  low  no  
small  strong  substantial  high  yes  
small  strong  substantial  a_bit_high  no  
small  strong  substantial  common  no  
small  strong  substantial  low  no  
small  strong  cheap  high  no  
small  strong  cheap  a_bit_high  no  
small  strong  cheap  common  no  
small  strong  cheap  low  no  
small  elegant  expensive  high  no  
small  elegant  expensive  a_bit_high  no  
small  elegant  expensive  common  yes  
small  elegant  expensive  low  no  
small  elegant  a_litle_expensive  high  no  
small  elegant  a_litle_expensive  a_bit_high  yes  
small  elegant  a_litle_expensive  common  no  
small  elegant  a_litle_expensive  low  no  
small  elegant  affordable  high  yes  
small  elegant  affordable  a_bit_high  yes  
small  elegant  affordable  common  no  
small  elegant  affordable  low  no  
small  elegant  substantial  high  yes  
small  elegant  substantial  a_bit_high  yes  
small  elegant  substantial  common  no  
small  elegant  substantial  low  no  
small  elegant  cheap  high  yes  
small  elegant  cheap  a_bit_high  yes  
small  elegant  cheap  common  no  
small  elegant  cheap  low  no  
small  slight  expensive  high  no  
small  slight  expensive  a_bit_high  yes  
small  slight  expensive  common  no  
small  slight  expensive  low  no  
small  slight  a_litle_expensive  high  yes  
small  slight  a_litle_expensive  a_bit_high  no  
small  slight  a_litle_expensive  common  yes  
small  slight  a_litle_expensive  low  no  
small  slight  affordable  high  no  
small  slight  affordable  a_bit_high  no  
small  slight  affordable  common  no  
small  slight  affordable  low  no  
small  slight  substantial  high  yes  
small  slight  substantial  a_bit_high  no  
small  slight  substantial  common  no  
small  slight  substantial  low  no  
small  slight  cheap  high  yes  
small  slight  cheap  a_bit_high  no  
small  slight  cheap  common  no  
small  slight  cheap  low  no  
smart  strong  expensive  high  yes  
smart  strong  expensive  a_bit_high  no  
smart  strong  expensive  common  no  
smart  strong  expensive  low  no  
smart  strong  a_litle_expensive  high  no  
smart  strong  a_litle_expensive  a_bit_high  yes  
smart  strong  a_litle_expensive  common  no  
smart  strong  a_litle_expensive  low  no  
smart  strong  affordable  high  yes  
smart  strong  affordable  a_bit_high  yes  
smart  strong  affordable  common  no  
smart  strong  affordable  low  no  
smart  strong  substantial  high  no  
smart  strong  substantial  a_bit_high  no  
smart  strong  substantial  common  no  
smart  strong  substantial  low  no  
smart  strong  cheap  high  no  
smart  strong  cheap  a_bit_high  no  
smart  strong  cheap  common  yes  
smart  strong  cheap  low  no  
smart  elegant  expensive  high  yes  
smart  elegant  expensive  a_bit_high  no  
smart  elegant  expensive  common  yes  
smart  elegant  expensive  low  no  
smart  elegant  a_litle_expensive  high  yes  
smart  elegant  a_litle_expensive  a_bit_high  yes  
smart  elegant  a_litle_expensive  common  yes  
smart  elegant  a_litle_expensive  low  no  
smart  elegant  affordable  high  yes  
smart  elegant  affordable  a_bit_high  yes  
smart  elegant  affordable  common  yes  
smart  elegant  affordable  low  no  
smart  elegant  substantial  high  yes  
smart  elegant  substantial  a_bit_high  yes  
smart  elegant  substantial  common  no  
smart  elegant  substantial  low  no  
smart  elegant  cheap  high  yes  
smart  elegant  cheap  a_bit_high  no  
smart  elegant  cheap  common  yes  
smart  elegant  cheap  low  no  
smart  slight  expensive  high  yes  
smart  slight  expensive  a_bit_high  yes  
smart  slight  expensive  common  no  
smart  slight  expensive  low  no  
smart  slight  a_litle_expensive  high  no  
smart  slight  a_litle_expensive  a_bit_high  yes  
smart  slight  a_litle_expensive  common  no  
smart  slight  a_litle_expensive  low  no  
smart  slight  affordable  high  no  
smart  slight  affordable  a_bit_high  yes  
smart  slight  affordable  common  yes  
smart  slight  affordable  low  no  
smart  slight  substantial  high  yes  
smart  slight  substantial  a_bit_high  yes  
smart  slight  substantial  common  yes  
smart  slight  substantial  low  no  
smart  slight  cheap  high  yes  
smart  slight  cheap  a_bit_high  no  
smart  slight  cheap  common  yes  
smart  slight  cheap  low  no  
middle  strong  expensive  high  yes  
middle  strong  expensive  a_bit_high  no  
middle  strong  expensive  common  yes  
middle  strong  expensive  low  no  
middle  strong  a_litle_expensive  high  no  
middle  strong  a_litle_expensive  a_bit_high  no  
middle  strong  a_litle_expensive  common  yes  
middle  strong  a_litle_expensive  low  no  
middle  strong  affordable  high  yes  
middle  strong  affordable  a_bit_high  yes  
middle  strong  affordable  common  no  
middle  strong  affordable  low  no  
middle  strong  substantial  high  no  
middle  strong  substantial  a_bit_high  no  
middle  strong  substantial  common  no  
middle  strong  substantial  low  no  
middle  strong  cheap  high  no  
middle  strong  cheap  a_bit_high  yes  
middle  strong  cheap  common  yes  
middle  strong  cheap  low  no  
middle  elegant  expensive  high  yes  
middle  elegant  expensive  a_bit_high  no  
middle  elegant  expensive  common  yes  
middle  elegant  expensive  low  no  
middle  elegant  a_litle_expensive  high  no  
middle  elegant  a_litle_expensive  a_bit_high  yes  
middle  elegant  a_litle_expensive  common  no  
middle  elegant  a_litle_expensive  low  no  
middle  elegant  affordable  high  no  
middle  elegant  affordable  a_bit_high  yes  
middle  elegant  affordable  common  yes  
middle  elegant  affordable  low  no  
middle  elegant  substantial  high  yes  
middle  elegant  substantial  a_bit_high  no  
middle  elegant  substantial  common  no  
middle  elegant  substantial  low  no  
middle  elegant  cheap  high  no  
middle  elegant  cheap  a_bit_high  no  
middle  elegant  cheap  common  yes  
middle  elegant  cheap  low  no  
middle  slight  expensive  high  yes  
middle  slight  expensive  a_bit_high  yes  
middle  slight  expensive  common  no  
middle  slight  expensive  low  no  
middle  slight  a_litle_expensive  high  yes  
middle  slight  a_litle_expensive  a_bit_high  no  
middle  slight  a_litle_expensive  common  no  
middle  slight  a_litle_expensive  low  no  
middle  slight  affordable  high  no  
middle  slight  affordable  a_bit_high  yes  
middle  slight  affordable  common  no  
middle  slight  affordable  low  no  
middle  slight  substantial  high  yes  
middle  slight  substantial  a_bit_high  yes  
middle  slight  substantial  common  yes  
middle  slight  substantial  low  no  
middle  slight  cheap  high  no  
middle  slight  cheap  a_bit_high  yes  
middle  slight  cheap  common  yes  
middle  slight  cheap  low  no  
large  strong  expensive  high  yes  
large  strong  expensive  a_bit_high  no  
large  strong  expensive  common  no  
large  strong  expensive  low  no  
large  strong  a_litle_expensive  high  no  
large  strong  a_litle_expensive  a_bit_high  no  
large  strong  a_litle_expensive  common  no  
large  strong  a_litle_expensive  low  no  
large  strong  affordable  high  no  
large  strong  affordable  a_bit_high  no  
large  strong  affordable  common  no  
large  strong  affordable  low  no  
large  strong  substantial  high  no  
large  strong  substantial  a_bit_high  no  
large  strong  substantial  common  no  
large  strong  substantial  low  no  
large  strong  cheap  high  yes  
large  strong  cheap  a_bit_high  no  
large  strong  cheap  common  yes  
large  strong  cheap  low  no  
large  elegant  expensive  high  yes  
large  elegant  expensive  a_bit_high  yes  
large  elegant  expensive  common  yes  
large  elegant  expensive  low  no  
large  elegant  a_litle_expensive  high  yes  
large  elegant  a_litle_expensive  a_bit_high  no  
large  elegant  a_litle_expensive  common  yes  
large  elegant  a_litle_expensive  low  no  
large  elegant  affordable  high  yes  
large  elegant  affordable  a_bit_high  no  
large  elegant  affordable  common  yes  
large  elegant  affordable  low  no  
large  elegant  substantial  high  no  
large  elegant  substantial  a_bit_high  yes  
large  elegant  substantial  common  no  
large  elegant  substantial  low  no  
large  elegant  cheap  high  yes  
large  elegant  cheap  a_bit_high  no  
large  elegant  cheap  common  yes  
large  elegant  cheap  low  no  
large  slight  expensive  high  no  
large  slight  expensive  a_bit_high  no  
large  slight  expensive  common  no  
large  slight  expensive  low  no  
large  slight  a_litle_expensive  high  yes  
large  slight  a_litle_expensive  a_bit_high  yes  
large  slight  a_litle_expensive  common  no  
large  slight  a_litle_expensive  low  no  
large  slight  affordable  high  yes  
large  slight  affordable  a_bit_high  yes  
large  slight  affordable  common  yes  
large  slight  affordable  low  no  
large  slight  substantial  high  yes  
large  slight  substantial  a_bit_high  no  
large  slight  substantial  common  yes  
large  slight  substantial  low  no  
large  slight  cheap  high  no  
large  slight  cheap  a_bit_high  yes  
large  slight  cheap  common  yes  
large  slight  cheap  low  no  

 

 

  • 2
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值