依托的理论依据是信息增益,注释在代码里,代码很好懂的,文章最后有两个样例。
这是运行结果
( ) 内是特征,[ ] 内是该特征的具体取值,比如 (身高) 下面有 [高]、[矮]、[中等]。
代码在这里~
#include <iostream>
#include <cctype>
#include <cstring>
#include <string>
#include <cmath>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <queue>
#include <cstdio>
#include <cstdlib>
// inf and LL are not referenced anywhere in the visible code.
#define inf 0x3f3f3f3f
#define LL long long
// Capacity of every fixed-size table in this file (rows, features, per-node partitions).
#define maxn 10005
using namespace std;
// Not referenced anywhere in the visible code.
int hu = 0;
int NUM; // number of data rows
int ALL; // number of features (attributes)
// state[i] is the label of row i (rows are 1-based): 1 for "yes", 0 otherwise.
int state[maxn];
// Token (feature name or feature value) -> integer id, assigned by read() starting at 1.
map<string, int> big_map;
// Reverse of big_map: integer id -> token.
map<int, string> sma_map;
// info[i][j] is the value of feature j in row i (rows 1-based, features 0-based).
vector<string> info[maxn];
// Feature names, in the order they appear in the input header.
vector<string> s;
// Not referenced anywhere in the visible code.
int ttt = 0;
// One node of the decision tree. Nodes are allocated with malloc by
// build_decision_tree, so the fields carry no default values.
struct Tree_node{
// Id (key of sma_map) of the feature tested at this node; set on internal nodes.
int feature;
int edge[50];
// edge[i] is the id (key of sma_map) of the feature value that selects child[i],
// i.e. it tells which branch of this node the child at the same index belongs to.
Tree_node* child[50];
// Number of entries of edge[] / child[] that are in use.
int size;
// Number of "yes" rows that reached this node.
int positive;
// Number of "no" rows that reached this node.
int negative;
int state;
// state == -1 means the node has children; state == 1 or state == 0 marks a
// leaf and is the predicted label.
};
// Fixed table of row-index lists. get_information_gain fills a[e] with the
// indices of the rows that share the e-th distinct value of the chosen feature;
// build_decision_tree then passes each a[e] to the matching child.
struct TwoDimension{
vector<int> a[maxn];
};
// Debug helper: dumps the loaded data set to standard output.
// Line 1 holds the feature names, then one line per row with its feature
// values, and finally all labels (0/1) on a single unterminated line.
void print_data()
{
    // Header: every feature name followed by one space.
    int column = 0;
    while (column < ALL)
    {
        cout << s[column] << " ";
        ++column;
    }
    printf("\n");

    // Body: rows are stored 1-based in info[].
    int row = 1;
    while (row <= NUM)
    {
        for (column = 0; column < ALL; ++column)
        {
            cout << info[row][column] << " ";
        }
        printf("\n");
        ++row;
    }

    // Labels of all rows, in row order.
    for (row = 1; row <= NUM; ++row)
    {
        printf("%d ", state[row]);
    }
}
void read()
{
string temp;
int y = 0;
scanf("%d%d", &NUM, &ALL);
for(int i = 1; i <= ALL; i++)
{
cin >> temp;
s.push_back(temp);
if(!big_map.count(temp))
{
big_map[temp] = ++y;
sma_map[y] = temp;
}
}
for(int i = 1; i <= NUM; i++)
{
int h;
scanf("%d", &h);
for(int j = 1; j <= ALL; j++)
{
cin >> temp;
info[i].push_back(temp);
if(!big_map.count(temp))
{
big_map[temp] = ++y;
sma_map[y] = temp;
}
}
cin >> temp;
if(temp == "yes")
state[i] = 1;
else
state[i] = 0;
}
}
// Prints the subtree rooted at `root` to standard output.
//   - a NULL subtree is printed as an empty line;
//   - a leaf (state != -1) is printed as "(State:s, Yes:p, No:n)";
//   - an internal node prints "(feature)" and then, for every branch, an
//     indented "[value]" immediately followed by the child's own output.
// `level` is the depth of `root`; branches are indented by (level + 1) * 8 spaces.
void print_decision_tree(Tree_node* root, int level)
{
    if (root == NULL)
    {
        printf("\n");
        return;
    }

    if (root->state != -1)
    {
        printf("(State:%d, Yes:%d, No:%d)\n", root->state, root->positive, root->negative);
        return;
    }

    cout << "(" << sma_map[root->feature] << ")" << endl;

    const int indent = (level + 1) * 8;
    for (int branch = 0; branch < root->size; ++branch)
    {
        for (int pad = indent; pad > 0; --pad)
        {
            printf(" ");
        }
        cout << "[" << sma_map[root->edge[branch]] << "]";
        print_decision_tree(root->child[branch], level + 1);
    }
}
// Binary Shannon entropy, in bits, of a group containing `yes` positive and
// `no` negative examples out of `sum` in total.
//
// A class with zero members contributes nothing (0 * log 0 is taken as 0).
// An empty group (sum <= 0) has entropy 0; without this guard the divisions
// below would produce NaN.
double entropy(double yes, double no, double sum)
{
    if (!(sum > 0))
        return 0;

    const double p_yes = yes / sum;
    const double p_no = no / sum;

    // log(1) == 0 stands in for the undefined log(0) of an absent class.
    const double log_yes = std::log(yes != 0 ? p_yes : 1.0);
    const double log_no = std::log(no != 0 ? p_no : 1.0);

    // Same operation order as before so results for non-empty groups are unchanged.
    return -(p_yes * log_yes / std::log(2) + p_no * log_no / std::log(2));
}
int get_information_gain(int& positive, int& negative, vector<string>& str, vector<int>& use_these_data, TwoDimension& next_use_data , int* lock_feature)
{
int t = 0;
int rest_feature = 0;
int rest_id;
int num[maxn];
int yes[maxn];
int no[maxn];
double Gain[maxn];
map<string, int> mp;
memset(Gain, -1, sizeof(Gain));
//验证是否所有数据都正或都负
int a = 0;
int b = 0;
for(int i = 0; i < use_these_data.size(); i++)
{
int j = use_these_data[i];
if(state[j]) a++;
else b++;
}
if(a == use_these_data.size())
{
positive = use_these_data.size();
negative = 0;
return -2;
}
if(b == use_these_data.size())
{
positive = 0;
negative = use_these_data.size();
return -2;
}
for(int i = 0; i < ALL; i++)
{
//数据初始化
memset(num, 0, sizeof(num));
memset(yes, 0, sizeof(yes));
memset(no, 0, sizeof(no));
mp.clear();
t = 0;
//特征锁为0,此特征可用
if(!lock_feature[i])
{
rest_feature ++;
rest_id = i;
Gain[i] = 0;
double sum = use_these_data.size();
for(int k = 0; k < use_these_data.size(); k++)
{
int j = use_these_data[k];
//在数据集中找到此特征值
if(!mp.count(info[j][i]))
mp[info[j][i]] = ++t;
int e = mp[info[j][i]];
//num[e]代表一个特征下各特征值的数量
num[e]++;
//注意,e最小是1,所以用yes[e]和no[e]表示具体特征值的正负例个数,用yes[0]和no[0]表示此特征的正负例个数
if(state[j])
{
yes[e]++;
yes[0]++;
}
else
{
no[e]++;
no[0]++;
}
}
//计算此特征的information gain
for(int j = 0; j <= t; j++)
{
if(!j)
Gain[i] += entropy(yes[0], no[0], yes[0]+no[0]);
else
Gain[i] -= num[j]/sum*entropy(yes[j], no[j], yes[j]+no[j]);
}
}
}
//剩余特征只有一个,不会再往下分了
if(rest_feature == 1)
{
for(int i = 0; i < use_these_data.size(); i++)
{
int j = use_these_data[i];
if(state[j]) positive++;
else negative++;
}
str.push_back(s[rest_id]);
return -3;
}
double target = -1;
int id = -1;
for(int i = 0; i < ALL; i++)
{
if(Gain[i] > target && !lock_feature[i])
{
target = Gain[i];
id = i;
}
}
if(id == -1)
return -1;
else
{
t = 0;
positive = negative = 0;
mp.clear();
for(int i = 0; i < use_these_data.size(); i++)
{
int j = use_these_data[i];
if(!mp.count(info[j][id]))
{
str.push_back(info[j][id]);
mp[info[j][id]] = t++;
}
int e = mp[info[j][id]];
next_use_data.a[e].push_back(j);
if(state[j]) positive ++;
else negative ++;
}
}
return id;
}
// Allocates a zero-filled tree node; returns NULL when memory is exhausted.
static Tree_node* new_tree_node()
{
    return static_cast<Tree_node*>(calloc(1, sizeof(Tree_node)));
}

// Recursively builds the decision tree for the rows in use_these_data.
//
// lock_feature[i] != 0 marks feature i as already used on the path from the
// root; the array is not modified (each level hands a private copy to its
// children).
//
// Returns a node allocated with the C allocator (release with free), or NULL
// when no feature is available or memory is exhausted. print_decision_tree
// prints a NULL child as an empty line.
//
// Node shapes, by the result of get_information_gain:
//   -2 (pure or empty subset)  leaf labelled with the majority class.
//   -3 (one feature left)      internal node naming that feature with a single
//                              majority-class leaf; there is no real branch
//                              value, so its edge id is 0, which print shows
//                              as "[]".
//   >= 0                       internal node with one child per feature value.
//
// The partition table is kept on the heap: TwoDimension holds maxn vectors and
// one copy per recursion level on the stack can exhaust a small stack.
Tree_node* build_decision_tree(vector<int> use_these_data, int* lock_feature) // the feature lock is not a global
{
    Tree_node* neww = new_tree_node();
    if (neww == NULL)
        return NULL;
    neww->state = -1;
    neww->size = 0;

    int positive = 0;
    int negative = 0;
    vector<string> str;
    vector<TwoDimension> partition_storage(1);
    TwoDimension& next_use_data = partition_storage[0];

    const int target = get_information_gain(positive, negative, str, use_these_data, next_use_data, lock_feature);
    const int majority = (positive > negative) ? 1 : 0;

    if (target == -1)
    {
        free(neww);
        return NULL;
    }

    neww->positive = positive;
    neww->negative = negative;

    if (target == -2)
    {
        neww->state = majority;
        return neww;
    }

    if (target == -3)
    {
        neww->feature = big_map[str[0]];
        Tree_node* newc = new_tree_node();
        if (newc != NULL)
        {
            newc->state = majority;
            newc->positive = positive;
            newc->negative = negative;
        }
        neww->edge[neww->size] = 0;
        neww->child[neww->size++] = newc;
        return neww;
    }

    // A pure subset is always reported as -2 above, so from here on both
    // classes are present and the node really splits.

    // A node stores a fixed number of branches; a feature with more distinct
    // values cannot be represented, so fall back to a majority leaf.
    const size_t max_children = sizeof(neww->child) / sizeof(neww->child[0]);
    if (str.size() > max_children)
    {
        fprintf(stderr, "build_decision_tree: feature with %lu values exceeds the %lu branches a node can hold; using a majority leaf\n",
                static_cast<unsigned long>(str.size()), static_cast<unsigned long>(max_children));
        neww->state = majority;
        return neww;
    }

    neww->feature = big_map[s[target]];

    // Every child sees the same lock set: the parent's plus the chosen feature.
    vector<int> child_lock(lock_feature, lock_feature + ALL);
    child_lock[target] = 1;

    for (size_t i = 0; i < str.size(); i++)
    {
        neww->edge[neww->size] = big_map[str[i]];
        neww->child[neww->size++] = build_decision_tree(next_use_data.a[i], &child_lock[0]);
    }
    return neww;
}
int main()
{
freopen("ztest.txt","r",stdin);
read();
//读入数据
//print_data();
//展示数据
Tree_node* decision_tree;
decision_tree = (Tree_node*) malloc (sizeof(Tree_node));
//清空子树
vector<int> use_these_data;
for(int i = 1; i <= NUM; i++)
use_these_data.push_back(i);
//所有数据都可以用
int lock_feature[maxn];
memset(lock_feature, 0, sizeof(lock_feature));
//所有特性都可选择
decision_tree = build_decision_tree(use_these_data, lock_feature);
//建树
printf("\n\n");
print_decision_tree(decision_tree, 0);
}
测试数据1
15 4
weather temperature wind forage
sunny hot high false no
sunny hot high true no
overcast hot high false yes
rainy warm high false yes
rainy cool norm false yes
rainy cool norm true no
overcast cool norm true yes
sunny warm high false no
sunny warm high false no
sunny cool norm false yes
rainy cool norm false yes
sunny warm norm true yes
overcast warm high true yes
overcast hot norm false yes
rainy warm high true no
测试数据2
240 4
size feature price height
small strong expensive high no
small strong expensive a_bit_high no
small strong expensive common yes
small strong expensive low no
small strong a_litle_expensive high yes
small strong a_litle_expensive a_bit_high no
small strong a_litle_expensive common yes
small strong a_litle_expensive low no
small strong affordable high yes
small strong affordable a_bit_high yes
small strong affordable common yes
small strong affordable low no
small strong substantial high yes
small strong substantial a_bit_high no
small strong substantial common no
small strong substantial low no
small strong cheap high no
small strong cheap a_bit_high no
small strong cheap common no
small strong cheap low no
small elegant expensive high no
small elegant expensive a_bit_high no
small elegant expensive common yes
small elegant expensive low no
small elegant a_litle_expensive high no
small elegant a_litle_expensive a_bit_high yes
small elegant a_litle_expensive common no
small elegant a_litle_expensive low no
small elegant affordable high yes
small elegant affordable a_bit_high yes
small elegant affordable common no
small elegant affordable low no
small elegant substantial high yes
small elegant substantial a_bit_high yes
small elegant substantial common no
small elegant substantial low no
small elegant cheap high yes
small elegant cheap a_bit_high yes
small elegant cheap common no
small elegant cheap low no
small slight expensive high no
small slight expensive a_bit_high yes
small slight expensive common no
small slight expensive low no
small slight a_litle_expensive high yes
small slight a_litle_expensive a_bit_high no
small slight a_litle_expensive common yes
small slight a_litle_expensive low no
small slight affordable high no
small slight affordable a_bit_high no
small slight affordable common no
small slight affordable low no
small slight substantial high yes
small slight substantial a_bit_high no
small slight substantial common no
small slight substantial low no
small slight cheap high yes
small slight cheap a_bit_high no
small slight cheap common no
small slight cheap low no
smart strong expensive high yes
smart strong expensive a_bit_high no
smart strong expensive common no
smart strong expensive low no
smart strong a_litle_expensive high no
smart strong a_litle_expensive a_bit_high yes
smart strong a_litle_expensive common no
smart strong a_litle_expensive low no
smart strong affordable high yes
smart strong affordable a_bit_high yes
smart strong affordable common no
smart strong affordable low no
smart strong substantial high no
smart strong substantial a_bit_high no
smart strong substantial common no
smart strong substantial low no
smart strong cheap high no
smart strong cheap a_bit_high no
smart strong cheap common yes
smart strong cheap low no
smart elegant expensive high yes
smart elegant expensive a_bit_high no
smart elegant expensive common yes
smart elegant expensive low no
smart elegant a_litle_expensive high yes
smart elegant a_litle_expensive a_bit_high yes
smart elegant a_litle_expensive common yes
smart elegant a_litle_expensive low no
smart elegant affordable high yes
smart elegant affordable a_bit_high yes
smart elegant affordable common yes
smart elegant affordable low no
smart elegant substantial high yes
smart elegant substantial a_bit_high yes
smart elegant substantial common no
smart elegant substantial low no
smart elegant cheap high yes
smart elegant cheap a_bit_high no
smart elegant cheap common yes
smart elegant cheap low no
smart slight expensive high yes
smart slight expensive a_bit_high yes
smart slight expensive common no
smart slight expensive low no
smart slight a_litle_expensive high no
smart slight a_litle_expensive a_bit_high yes
smart slight a_litle_expensive common no
smart slight a_litle_expensive low no
smart slight affordable high no
smart slight affordable a_bit_high yes
smart slight affordable common yes
smart slight affordable low no
smart slight substantial high yes
smart slight substantial a_bit_high yes
smart slight substantial common yes
smart slight substantial low no
smart slight cheap high yes
smart slight cheap a_bit_high no
smart slight cheap common yes
smart slight cheap low no
middle strong expensive high yes
middle strong expensive a_bit_high no
middle strong expensive common yes
middle strong expensive low no
middle strong a_litle_expensive high no
middle strong a_litle_expensive a_bit_high no
middle strong a_litle_expensive common yes
middle strong a_litle_expensive low no
middle strong affordable high yes
middle strong affordable a_bit_high yes
middle strong affordable common no
middle strong affordable low no
middle strong substantial high no
middle strong substantial a_bit_high no
middle strong substantial common no
middle strong substantial low no
middle strong cheap high no
middle strong cheap a_bit_high yes
middle strong cheap common yes
middle strong cheap low no
middle elegant expensive high yes
middle elegant expensive a_bit_high no
middle elegant expensive common yes
middle elegant expensive low no
middle elegant a_litle_expensive high no
middle elegant a_litle_expensive a_bit_high yes
middle elegant a_litle_expensive common no
middle elegant a_litle_expensive low no
middle elegant affordable high no
middle elegant affordable a_bit_high yes
middle elegant affordable common yes
middle elegant affordable low no
middle elegant substantial high yes
middle elegant substantial a_bit_high no
middle elegant substantial common no
middle elegant substantial low no
middle elegant cheap high no
middle elegant cheap a_bit_high no
middle elegant cheap common yes
middle elegant cheap low no
middle slight expensive high yes
middle slight expensive a_bit_high yes
middle slight expensive common no
middle slight expensive low no
middle slight a_litle_expensive high yes
middle slight a_litle_expensive a_bit_high no
middle slight a_litle_expensive common no
middle slight a_litle_expensive low no
middle slight affordable high no
middle slight affordable a_bit_high yes
middle slight affordable common no
middle slight affordable low no
middle slight substantial high yes
middle slight substantial a_bit_high yes
middle slight substantial common yes
middle slight substantial low no
middle slight cheap high no
middle slight cheap a_bit_high yes
middle slight cheap common yes
middle slight cheap low no
large strong expensive high yes
large strong expensive a_bit_high no
large strong expensive common no
large strong expensive low no
large strong a_litle_expensive high no
large strong a_litle_expensive a_bit_high no
large strong a_litle_expensive common no
large strong a_litle_expensive low no
large strong affordable high no
large strong affordable a_bit_high no
large strong affordable common no
large strong affordable low no
large strong substantial high no
large strong substantial a_bit_high no
large strong substantial common no
large strong substantial low no
large strong cheap high yes
large strong cheap a_bit_high no
large strong cheap common yes
large strong cheap low no
large elegant expensive high yes
large elegant expensive a_bit_high yes
large elegant expensive common yes
large elegant expensive low no
large elegant a_litle_expensive high yes
large elegant a_litle_expensive a_bit_high no
large elegant a_litle_expensive common yes
large elegant a_litle_expensive low no
large elegant affordable high yes
large elegant affordable a_bit_high no
large elegant affordable common yes
large elegant affordable low no
large elegant substantial high no
large elegant substantial a_bit_high yes
large elegant substantial common no
large elegant substantial low no
large elegant cheap high yes
large elegant cheap a_bit_high no
large elegant cheap common yes
large elegant cheap low no
large slight expensive high no
large slight expensive a_bit_high no
large slight expensive common no
large slight expensive low no
large slight a_litle_expensive high yes
large slight a_litle_expensive a_bit_high yes
large slight a_litle_expensive common no
large slight a_litle_expensive low no
large slight affordable high yes
large slight affordable a_bit_high yes
large slight affordable common yes
large slight affordable low no
large slight substantial high yes
large slight substantial a_bit_high no
large slight substantial common yes
large slight substantial low no
large slight cheap high no
large slight cheap a_bit_high yes
large slight cheap common yes
large slight cheap low no