# CART (Classification and Regression Trees): Theory and Implementation


### Theory

CART assumes the decision tree is binary: each internal node tests a feature and the outcome is either "yes" or "no". Consequently, if an attribute (feature) takes more than two values, at least one branch must group several of those values together. Unlike ID3, an attribute used at an upper level may be used again further down the tree, just with a smaller set of remaining values. When splitting a node, CART therefore has to consider every candidate attribute together with every candidate split value; ID3 only needs to choose the attribute, because it creates one branch per attribute value and has no split point to pick.

1. For each feature $A$ and each of its possible values $a$, partition the samples into two subsets $D_1$ and $D_2$ according to whether the test $A \le a$ comes out "yes" or "no", and compute the Gini index of the split $A \le a$ (defined below).
2. Among all candidate features and all candidate split points, pick the feature and split point with the smallest Gini index, and use them to split the node into the two subsets $D_1$ and $D_2$.
3. Apply steps 1 and 2 recursively until a stopping condition is met.
4. Assign each leaf node the majority class of the samples it contains.
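For a node with sample set $D$ containing $K$ classes, let $p_k$ be the fraction of samples in class $k$. The Gini index of $D$ and the weighted Gini index of the binary split $A \le a$ (the quantities computed by `gini` and `choose_feat_val` in the implementation below) are the standard definitions:

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} p_k^2, \qquad \mathrm{Gini}(D, A \le a) = \frac{|D_1|}{|D|}\,\mathrm{Gini}(D_1) + \frac{|D_2|}{|D|}\,\mathrm{Gini}(D_2).$$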

### Implementation

```python
import numpy as np

class CartTree(object):
    def __init__(self, min_num_leafnode=1, min_gini=0.05):
        self.min_num_leafnode = min_num_leafnode
        self.min_gini = min_gini

    def train(self, xtrain, ytrain):
        self.tree = self.maketree(xtrain, ytrain)

    def gini(self, ytrain):
        # Gini(D) = 1 - sum_k p_k^2
        gn = 1.0
        for i in np.unique(ytrain):
            p_i = ytrain[ytrain == i].shape[0] * 1.0 / ytrain.shape[0]
            gn -= p_i * p_i
        return gn

    def node_class(self, ytrain):
        # Majority class among the samples at this node
        most = -np.inf
        most_label = -np.inf
        for l in np.unique(ytrain):
            num_l = ytrain[ytrain == l].shape[0]
            if num_l > most:
                most = num_l
                most_label = l
        return most_label

    def split_sample(self, feat, val, xtrain, ytrain):
        # Binary split on the test "feature <= value"
        mask = xtrain[:, feat] <= val
        return xtrain[mask], ytrain[mask], xtrain[~mask], ytrain[~mask]

    def choose_feat_val(self, xtrain, ytrain):
        # Search every (feature, value) pair for the split with the
        # smallest weighted Gini index
        m, n = xtrain.shape
        bestFeat, bestVal, best_gini = None, None, np.inf
        for featIndex in range(n):
            for sptval in np.unique(xtrain[:, featIndex]):
                left_x, left_y, right_x, right_y = \
                    self.split_sample(featIndex, sptval, xtrain, ytrain)
                sum_gini = left_y.shape[0] * 1.0 / ytrain.shape[0] * self.gini(left_y) + \
                           right_y.shape[0] * 1.0 / ytrain.shape[0] * self.gini(right_y)
                if sum_gini < best_gini:
                    best_gini, bestFeat, bestVal = sum_gini, featIndex, sptval
        return bestFeat, bestVal

    def maketree(self, xtrain, ytrain):
        tree = {}
        tree['gini'] = self.gini(ytrain)
        tree['class'] = self.node_class(ytrain)
        # Stop splitting when the node's Gini index or sample count falls
        # below its threshold; a leaf has feat, val, left and right all None
        if tree['gini'] < self.min_gini or xtrain.shape[0] < self.min_num_leafnode:
            tree['feat_split'] = None
            tree['feat_val'] = None
            tree['left'] = None
            tree['right'] = None
            return tree
        feat, val = self.choose_feat_val(xtrain, ytrain)
        left_x, left_y, right_x, right_y = self.split_sample(feat, val, xtrain, ytrain)
        if left_y.shape[0] == 0 or right_y.shape[0] == 0:
            # Degenerate split (e.g. every feature is constant): also make a
            # leaf, otherwise the recursion would never terminate
            tree['feat_split'] = None
            tree['feat_val'] = None
            tree['left'] = None
            tree['right'] = None
            return tree
        tree['feat_split'] = feat
        tree['feat_val'] = val
        tree['left'] = self.maketree(left_x, left_y)
        tree['right'] = self.maketree(right_x, right_y)
        return tree

    def pred(self, xtest):
        return self.predict(xtest, self.tree)

    def predict(self, xtest, tree):
        # Walk down the tree until a leaf is reached
        if tree['left'] is None and tree['right'] is None:
            return tree['class']
        if xtest[tree['feat_split']] <= tree['feat_val']:
            return self.predict(xtest, tree['left'])
        return self.predict(xtest, tree['right'])
```
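As a quick sanity check of the `gini` method: a balanced two-class sample should score 0.5 and a pure sample 0.0. A minimal check using the class above:

```python
import numpy as np

check = CartTree()
print(check.gini(np.array([0, 0, 1, 1])))  # 1 - 0.5**2 - 0.5**2 = 0.5
print(check.gini(np.array([1, 1, 1])))     # pure node -> 0.0
```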

```python
# Toy data set: 15 samples, 5 features (the last feature is continuous)
x = np.array([
    [0, 0, 0, 0, 8],
    [0, 0, 0, 1, 3.5],
    [0, 1, 0, 1, 3.5],
    [0, 1, 1, 0, 3.5],
    [0, 0, 0, 0, 3.5],
    [1, 0, 0, 0, 3.5],
    [1, 0, 0, 1, 3.5],
    [1, 1, 1, 1, 2],
    [1, 0, 1, 2, 3.5],
    [1, 0, 1, 2, 3.5],
    [2, 0, 1, 2, 3.5],
    [2, 0, 1, 1, 3.5],
    [2, 1, 0, 1, 3.5],
    [2, 1, 0, 2, 3.5],
    [2, 0, 0, 0, 10],
])

# Labels as a column vector, matching the indexing used in CartTree
y = np.array([[1], [0], [1], [1], [0],
              [0], [0], [1], [1], [1],
              [1], [1], [1], [1], [1]])

clf = CartTree(1, 0.1)
clf.train(x, y)
clf.tree
```

```
{'class': 1,
 'feat_split': 2,
 'feat_val': 0.0,
 'gini': 0.3911111111111112,
 'left': {'class': 1,
          'feat_split': 0,
          'feat_val': 1.0,
          'gini': 0.49382716049382713,
          'left': {'class': 0,
                   'feat_split': 1,
                   'feat_val': 0.0,
                   'gini': 0.4444444444444445,
                   'left': {'class': 0,
                            'feat_split': 4,
                            'feat_val': 3.5,
                            'gini': 0.31999999999999984,
                            'left': {'class': 0, 'feat_split': None, 'feat_val': None,
                                     'gini': 0.0, 'left': None, 'right': None},
                            'right': {'class': 1, 'feat_split': None, 'feat_val': None,
                                      'gini': 0.0, 'left': None, 'right': None}},
                   'right': {'class': 1, 'feat_split': None, 'feat_val': None,
                             'gini': 0.0, 'left': None, 'right': None}},
          'right': {'class': 1, 'feat_split': None, 'feat_val': None,
                    'gini': 0.0, 'left': None, 'right': None}},
 'right': {'class': 1, 'feat_split': None, 'feat_val': None,
           'gini': 0.0, 'left': None, 'right': None}}
```
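Reading the nested dict: the root splits on feature 2 at value 0.0, each `left`/`right` entry is a subtree built the same way, and a node with `feat_split = None` is a leaf whose prediction is its `class` entry.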

```python
for i, j in zip(x, y):
    print(i, j, clf.pred(i))
```
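Since every leaf in the tree above has a Gini index of 0.0, the tree is fully grown and this loop should reproduce all 15 training labels exactly.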

scikit-learn's `DecisionTreeClassifier` expects purely numeric input, so a categorical feature such as color must be encoded first, either as integers or as one-hot vectors:

| color | integer encoding |
| ----- | ---------------- |
| red   | 1 |
| green | 2 |
| blue  | 3 |

| color | one-hot encoding |
| ----- | ---------------- |
| red   | (1, 0, 0) |
| green | (0, 1, 0) |
| blue  | (0, 0, 1) |
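A minimal NumPy sketch of both encodings (the `categories` ordering and the sample `colors` list are hypothetical, for illustration only):

```python
import numpy as np

categories = ['red', 'green', 'blue']       # hypothetical category order
colors = ['red', 'green', 'blue', 'green']  # hypothetical sample data

# Integer encoding: red -> 1, green -> 2, blue -> 3
integer = np.array([categories.index(c) + 1 for c in colors])
print(integer)   # [1 2 3 2]

# One-hot encoding: one indicator column per category
one_hot = np.eye(len(categories), dtype=int)[[categories.index(c) for c in colors]]
print(one_hot)   # rows: (1,0,0), (0,1,0), (0,0,1), (0,1,0)
```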
For comparison, the same toy data can be fit with scikit-learn and the learned tree exported to a PDF:

```python
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus

skl_clf = DecisionTreeClassifier()
skl_clf = skl_clf.fit(x, y.ravel())  # ravel: sklearn expects a 1-D label array

# Render the fitted tree via graphviz
dot_data = export_graphviz(skl_clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("my.pdf")

for i, j in zip(x, y):
    print(i, j, skl_clf.predict(i.reshape(1, -1)))  # predict expects a 2-D array
```
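`DecisionTreeClassifier` uses the Gini criterion by default (`criterion='gini'`), so its splits are directly comparable to those of the hand-rolled `CartTree` above.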
