CART (Classification and Regression Trees): Theory and Implementation
Theory
CART assumes the decision tree is binary: each internal node tests a feature and branches into "yes" and "no". So if an attribute (feature) takes more than two values, at least one child node must still cover more than one of those values. Unlike ID3, an attribute used at an upper level can still be used at lower levels; only its set of remaining values shrinks. When splitting, every possible choice of attribute and every possible value must be considered as a candidate split point. For example, a feature with values {0, 1, 2} may first split on "value $\le 0$?", and the "no" branch may later split again on "value $\le 1$?". ID3, by contrast, only needs to choose the attribute: it creates one branch per attribute value, so there is no split point to pick.
Generating the classification tree:

Input: training set; conditions for stopping.
Output: a CART decision tree.
1. For each feature $A$ and each possible value $a$ it takes, split the samples into two subsets $D_1$ and $D_2$ according to whether the test $A \le a$ comes out "yes" or "no", and compute the Gini index of the split $A \le a$.
2. Among all possible features and all possible split points, find the feature and split point with the smallest Gini index, and use it to split the node into the two subsets $D_1$ and $D_2$.
3. Apply steps 1 and 2 recursively until the stopping conditions are met.
4. Assign each leaf node its class (the majority class of the samples it contains).

Commonly used stopping conditions are that the number of samples at a node falls below a preset threshold, or that the node's Gini index falls below a preset threshold (meaning the node is already quite pure).
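For reference, the Gini index that drives steps 1 and 2: for a set $D$ with $K$ classes, where $p_k$ is the proportion of samples of class $k$,

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} p_k^2,$$

and the Gini index of the split $A \le a$ is the size-weighted average over the two subsets:

$$\mathrm{Gini}(D, A \le a) = \frac{|D_1|}{|D|}\,\mathrm{Gini}(D_1) + \frac{|D_2|}{|D|}\,\mathrm{Gini}(D_2).$$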
Implementation
```python
import numpy as np

class CartTree(object):
    def __init__(self, min_num_leafnode=1, min_gini=0.05):
        # Stop splitting once a node has fewer samples than
        # min_num_leafnode or a Gini index below min_gini.
        self.min_num_leafnode = min_num_leafnode
        self.min_gini = min_gini

    def train(self, xtrain, ytrain):
        self.tree = self.maketree(xtrain, ytrain)

    def gini(self, ytrain):
        # Gini(D) = 1 - sum_k p_k^2, with p_k the proportion of class k.
        gn = 1.0
        for i in np.unique(ytrain):
            p_i = ytrain[ytrain == i].shape[0] * 1.0 / ytrain.shape[0]
            gn -= p_i * p_i
        return gn

    def node_class(self, ytrain):
        # The class of a node is the majority label among its samples.
        most = -np.inf
        most_label = -np.inf
        for l in np.unique(ytrain):
            num_l = ytrain[ytrain == l].shape[0]
            if num_l > most:
                most = num_l
                most_label = l
        return most_label

    def split_sample(self, feat, val, xtrain, ytrain):
        # Binary split: samples with feature feat <= val go left, the rest right.
        mask = xtrain[:, feat] <= val
        return xtrain[mask], ytrain[mask], xtrain[~mask], ytrain[~mask]

    def choose_feat_val(self, xtrain, ytrain):
        # Scan every feature and every unique value as a candidate split
        # point, keeping the one with the smallest weighted Gini index.
        m, n = xtrain.shape
        bestFeat, bestVal, best_gini = None, None, np.inf
        for featIndex in range(n):
            for sptval in np.unique(xtrain[:, featIndex]):
                left_x, left_y, right_x, right_y = \
                    self.split_sample(featIndex, sptval, xtrain, ytrain)
                if left_y.shape[0] == 0 or right_y.shape[0] == 0:
                    continue  # degenerate split (e.g. at the maximum value)
                sum_gini = left_y.shape[0] * 1.0 / ytrain.shape[0] * self.gini(left_y) + \
                           right_y.shape[0] * 1.0 / ytrain.shape[0] * self.gini(right_y)
                if sum_gini < best_gini:
                    best_gini, bestFeat, bestVal = sum_gini, featIndex, sptval
        return bestFeat, bestVal

    def maketree(self, xtrain, ytrain):
        tree = {}
        tree['gini'] = self.gini(ytrain)
        tree['class'] = self.node_class(ytrain)
        # Stop splitting when the node is small enough or pure enough.
        if tree['gini'] < self.min_gini or xtrain.shape[0] < self.min_num_leafnode:
            feat, val = None, None
        else:
            # feat may also come back None if no valid split exists,
            # which guards against infinite recursion on duplicate rows.
            feat, val = self.choose_feat_val(xtrain, ytrain)
        if feat is None:
            # A leaf node: feat_split, feat_val, left and right are all None.
            tree['feat_split'] = None
            tree['feat_val'] = None
            tree['left'] = None
            tree['right'] = None
        else:
            tree['feat_split'] = feat
            tree['feat_val'] = val
            left_x, left_y, right_x, right_y = self.split_sample(feat, val, xtrain, ytrain)
            tree['left'] = self.maketree(left_x, left_y)
            tree['right'] = self.maketree(right_x, right_y)
        return tree

    def pred(self, xtest):
        return self.predict(xtest, self.tree)

    def predict(self, xtest, tree):
        # Walk down the tree until a leaf is reached.
        if tree['left'] is None and tree['right'] is None:
            return tree['class']
        if xtest[tree['feat_split']] <= tree['feat_val']:
            return self.predict(xtest, tree['left'])
        return self.predict(xtest, tree['right'])
```
```python
x = np.array([
    [0, 0, 0, 0, 8],
    [0, 0, 0, 1, 3.5],
    [0, 1, 0, 1, 3.5],
    [0, 1, 1, 0, 3.5],
    [0, 0, 0, 0, 3.5],
    [1, 0, 0, 0, 3.5],
    [1, 0, 0, 1, 3.5],
    [1, 1, 1, 1, 2],
    [1, 0, 1, 2, 3.5],
    [1, 0, 1, 2, 3.5],
    [2, 0, 1, 2, 3.5],
    [2, 0, 1, 1, 3.5],
    [2, 1, 0, 1, 3.5],
    [2, 1, 0, 2, 3.5],
    [2, 0, 0, 0, 10],
])
y = np.array([
    [1],
    [0],
    [1],
    [1],
    [0],
    [0],
    [0],
    [1],
    [1],
    [1],
    [1],
    [1],
    [1],
    [1],
    [1],
])
clf = CartTree(1, 0.1)
clf.train(x, y)
clf.tree
```
```
>>>{'class': 1,
'feat_split': 2,
'feat_val': 0.0,
'gini': 0.3911111111111112,
'left': {'class': 1,
'feat_split': 0,
'feat_val': 1.0,
'gini': 0.49382716049382713,
'left': {'class': 0,
'feat_split': 1,
'feat_val': 0.0,
'gini': 0.4444444444444445,
'left': {'class': 0,
'feat_split': 4,
'feat_val': 3.5,
'gini': 0.31999999999999984,
'left': {'class': 0,
'feat_split': None,
'feat_val': None,
'gini': 0.0,
'left': None,
'right': None},
'right': {'class': 1,
'feat_split': None,
'feat_val': None,
'gini': 0.0,
'left': None,
'right': None}},
'right': {'class': 1,
'feat_split': None,
'feat_val': None,
'gini': 0.0,
'left': None,
'right': None}},
'right': {'class': 1,
'feat_split': None,
'feat_val': None,
'gini': 0.0,
'left': None,
'right': None}},
'right': {'class': 1,
'feat_split': None,
'feat_val': None,
'gini': 0.0,
'left': None,
'right': None}}
```

```python
for i, j in zip(x, y):
    print(i, j, clf.pred(i))
```
The dataset above is taken from this blog post: http://blog.csdn.net/dark_scope/article/details/13168827.
About the tree data structure: every node, internal nodes included, stores its Gini index (gini) and its class (class).
For leaf nodes, feat_split, feat_val, left, and right are all None.
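As a quick way to read this structure, here is a small sketch (the helper `print_rules` is our own illustration, not part of the class) that walks the dict recursively and prints the decision rules:

```python
def print_rules(tree, depth=0):
    # Leaf nodes carry no split; print their class and Gini index.
    indent = '  ' * depth
    if tree['left'] is None and tree['right'] is None:
        print('%sleaf: class=%s, gini=%.3f' % (indent, tree['class'], tree['gini']))
    else:
        print('%sfeature %d <= %s ?' % (indent, tree['feat_split'], tree['feat_val']))
        print_rules(tree['left'], depth + 1)
        print_rules(tree['right'], depth + 1)

print_rules(clf.tree)
```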
For the split test, this implementation uses $\le$ on the raw feature values; sklearn instead places the threshold at the midpoint of two adjacent values (see the sketch after the tables below). Following sklearn.tree, discrete features are handled here as if they were continuous, but a discrete feature should first be encoded in vector form. For example, a color feature with
color | representation |
---|---|
red | 1 |
green | 2 |
blue | 3 |
should be converted into the following form:
color | representation |
---|---|
red | (1,0,0) |
green | (0,1,0) |
blue | (0,0,1) |
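A minimal numpy sketch of this conversion (the variable names here are ours, for illustration only):

```python
import numpy as np

colors = np.array([1, 3, 2, 2, 1])  # red=1, green=2, blue=3
one_hot = np.eye(3)[colors - 1]     # row i is the indicator vector for colors[i]
print(one_hot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [0. 1. 0.]
#  [1. 0. 0.]]
```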
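And as for the midpoint rule mentioned above, a sketch of how such candidate thresholds can be generated from one feature column (`col` is a made-up example):

```python
import numpy as np

col = np.array([3.5, 2.0, 8.0, 3.5, 10.0])  # one continuous feature column
vals = np.unique(col)                       # sorted unique values
midpoints = (vals[:-1] + vals[1:]) / 2.0    # thresholds halfway between neighbors
print(midpoints)                            # [2.75 5.75 9.  ]
```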
```python
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image
import pydotplus

skl_clf = DecisionTreeClassifier()
skl_clf = skl_clf.fit(x, y.ravel())  # ravel y into the 1-D shape sklearn expects
dot_data = export_graphviz(skl_clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("my.pdf")

for i, j in zip(x, y):
    print(i, j, skl_clf.predict(i.reshape(1, -1)))  # predict expects a 2-D array
```
The above calls sklearn.tree directly; the resulting tree is written to my.pdf, and its predictions match those of our implementation above.