CCP后剪枝的主要思想是:
1. 找到树T0的所有节点位置,并计算g(t)值:
r(t)是该节点的错分样本率,p(t)是该节点样本占全部样本的比例,R(Tt)表示该节点的所有叶子节点的R(t)值之和,|T|表示该节点包含的叶子节点数量。假设该树除了叶子节点外有K个节点,我们可以得到K个g(t)值,取其中最小的值为alpha,最小值对应的子树被剪掉(alpha越小表示该节点对分类结果的作用越小),类别根据大数原理确定,产生被选树集中的第二棵树T1,原有树集中有整树T0
2. 将T1重复以上步骤,最终可以得到{T0,T1,.....Tm}的树集合,Tm表示只有根节点的树
3. 对以上树集合进行交叉验证,选取效果最好的树作为剪枝后的树;如果效果相同,则选在树集中排位靠后的树,因为这表示剪掉的枝比较多,树结构更简单
因为是从下开始剪枝,所以是一种自下而上的剪枝操作;在这个过程中需要先获得一棵整树,所以是后剪枝操作。g(t)的计算同时考虑复杂度和损失函数,所以该方法叫做CCP(cost-complexity pruning)
根据前一篇文章可以得到没有剪枝的整树 《(一)CART决策树(无剪枝和缺失值处理)》, 缺失值处理见(三),关注后可见
- 根据以上原理,首先应该先找到T0的所有节点位置以及每个节点所对应的类别
def getSplitTree(self,tree):
    '''
    Collect the immediate child subtrees of ``tree``.

    :param tree: a tree node (dict with 'left'/'right' branches)
    :return: list with a shallow copy of each child that is itself a
             subtree; leaf children are skipped
    '''
    children = []
    for branch in ('left', 'right'):
        child = tree[branch]
        if not isinstance(child, dict):
            continue  # leaf: nothing to extract
        children.append({field: child[field]
                         for field in ('cut_f', 'cut_val', 'left_values',
                                       'right_values', 'left', 'right')})
    return children
def getBoxTree(self,tree):
    '''
    Breadth-first collection of every internal-node subtree of ``tree``.

    :param tree: the full tree (root node)
    :return: list of all internal subtrees, root first, level by level
    '''
    boxTree = [tree]
    frontier = self.getSplitTree(tree)  # children of the root
    boxTree += frontier
    while frontier:
        next_level = []
        for subtree in frontier:
            # gather the children of every node on the current level
            next_level += self.getSplitTree(subtree)
        boxTree += next_level
        frontier = next_level
    return boxTree
def getNodes(self,Box):
    '''
    For every subtree in ``Box``, count its leaf nodes and gather the class
    Counters stored at those leaves.

    NOTE(review): communicates with ``findnodes`` through the module-level
    globals ``node`` and ``ini_node_value`` -- fragile and not thread-safe;
    passing accumulators explicitly would be safer.

    :param Box: list of subtree dicts, as produced by ``getBoxTree``
    :return: (boxNodes, nodeValue) -- per-subtree leaf counts and, per
             subtree, the list of leaf Counter objects
    '''
    global node
    global ini_node_value
    boxNodes = []   # leaf count of each subtree
    nodeValue = []  # leaf-value Counters of each subtree
    ini_node_value = []
    node = 0
    for k in range(len(Box)):
        # findnodes accumulates into the globals and returns them
        node,ini_node_value = self.findnodes(Box[k])
        boxNodes.append(node)
        node = 0              # reset accumulators for the next subtree
        nodeValue.append(ini_node_value)
        ini_node_value =[]
    return boxNodes,nodeValue
def findnodes(self,data):
    '''
    Recursively count the leaf nodes of ``data`` and collect their class
    Counters, accumulating into the globals ``node`` / ``ini_node_value``.

    NOTE(review): 'left_values'/'right_values' are Counter objects, which
    are dict subclasses, so the isinstance check also recurses into them;
    that recursion is harmless (their keys are class labels, never
    'left'/'right') but wasteful.

    :param data: subtree dict
    :return: (node, ini_node_value) -- the current global accumulators
    '''
    global node
    global ini_node_value
    for key, val in data.items():
        if isinstance(data[key],dict):
            self.findnodes(data[key])
        else:
            # a non-dict branch value means this side is a leaf
            if key == 'left':
                node += 1
                ini_node_value.append(data['left_values'])
            elif key == 'right':
                node += 1
                ini_node_value.append(data['right_values'])
    return node,ini_node_value
def costFunction(self,Tt,boxNodes,nodeValue):
    '''
    Compute the CCP criterion g(t) = (R(t) - R(Tt)) / (|Tt| - 1) for the
    subtree rooted at ``Tt``.

    NOTE(review): depends on the module-level global ``X_train`` (defined in
    the __main__ section) for the total sample count -- this breaks if the
    class is imported elsewhere; the training size should be stored on self.

    :param Tt: subtree whose root node t is evaluated
    :param boxNodes: number of leaf nodes of ``Tt``
    :param nodeValue: list of leaf Counter objects of ``Tt``
    :return: one-element list [g(t)]
    '''
    gt = []
    value = Tt['left_values']+ Tt['right_values']  # class counts at node t
    Value = []
    for key, val in value.items():
        Value.append(val)
    Value = np.array(Value)
    # r(t): misclassification rate at node t -- majority vote, so the
    # minority count is the error
    error = Value.min()/ Value.sum()
    # p(t): fraction of all training samples reaching this node
    p_node = Value.sum() / len(X_train)
    Rt = error * p_node
    RTt = 0
    # R(Tt): summed misclassification rate over the leaves of the subtree
    for i in range(len(nodeValue)):
        num = []
        if len(nodeValue[i])==1: ## pure leaf: contributes no error
            continue
        else:
            for key, val in nodeValue[i].items():
                num.append(val)
            num = np.array(num)
            RTt += (num.sum()-num.max())/ len(X_train)
    gt_add = (Rt - RTt)/ (boxNodes-1)
    # clamp tiny float noise to exactly 0.0
    gt_add = 0.0 if (np.abs(gt_add - 0)<1e-13) else gt_add
    gt.append(gt_add)
    return gt
然后剪枝操作应该先找到要剪枝的部分的路径,并用类别值代替该位置的子树(其实就是代替该key下的字典)
def find_path(self,iterable, mode, target, current_path='root'):
    '''
    Recursively search a nested dict for ``target`` and return every
    dot-separated path that reaches it.

    referenced by
    ----------
    https://blog.csdn.net/qq_47110957/article/details/106982333

    Parameters
    ----------
    iterable : dict or list or tuple
        The object to search (only dicts are actually descended into).
    mode : int
        0 looks up key names, 1 looks up values.
    target : arbitrary type
        The key name (mode 0) or value (mode 1) to find.
    current_path : string, optional
        Path accumulator used by the recursion; the default is 'root'
        (``pruning`` passes '' so it can prepend its own root marker).

    Returns
    -------
    path : list
        All dot-separated paths that reach ``target``.
    '''
    path = []
    if type(iterable) == type({}):  # only dicts are traversed
        if mode == 0:  # search by key name
            for key in iterable.keys():
                c_p = current_path + " .'%s'" % key
                if key == target:
                    path.append(c_p)
                if type(iterable[key]) in [type({}),type(()),type([])]:  # descend into nested containers
                    path += self.find_path(iterable[key], mode, target, c_p)
        elif mode == 1:  # search by value
            for key in iterable.keys():
                c_p = current_path + '.'+'%s'%key
                if iterable[key] == target:
                    path.append(c_p)
                if type(iterable[key]) in [type({}),type(()),type([])]:  # descend into nested containers
                    path += self.find_path(iterable[key], mode, target, c_p)
    return path
def replace(self,obj, replaceContent):
    '''
    Return copies of ``obj`` in which the subtree addressed by the global
    ``keyPath`` is replaced by each element of ``replaceContent`` (the
    pruned-leaf class label).

    NOTE(review): reads the module-level global ``keyPath`` (set by
    ``pruning``) instead of taking the path as a parameter, and performs the
    assignment with ``exec`` on a string built from that path -- never feed
    it untrusted input; an explicit loop over the path keys would be safer.

    :param obj: tree dict to prune (deep-copied; the original is untouched)
    :param replaceContent: values to splice in at the path (class labels)
    :return: list of pruned deep copies, one per (path, content) pair
    '''
    replacePath = []
    pathStr = ""
    # translate "root.key1.key2" into 'copyObj["key1"]["key2"]'
    for s in keyPath.split("."):
        if s == "root":
            pathStr += "copyObj"
        elif s.isdigit():
            pathStr += f"[{s}]"
        else:
            pathStr += f'["{s}"]'
    # only the fully-built path is used; appending prefixes would produce
    # no-op exec assignments ("copyObj = content" cannot rebind the local)
    replacePath.append(pathStr)
    replaceObj = []
    for path in replacePath:
        for content in replaceContent:
            copyObj = copy.deepcopy(obj)
            exec(f"{path} = content")  # assigns into copyObj via the path
            replaceObj.append(copyObj)
    return replaceObj
整体的代码如下所示,包括了第一部分产生树的过程
## NOTE: DataFrame .iloc indexing is slow; prefer .apply or convert to ndarray.
class DecisionTreeClassifier():
    def __init__(self,max_depth: int = None,min_samples_split:int = 5,
                 min_samples_leaf: int = 5,min_impurity_decrease: float =0.0,ispruning: bool = False, nsplits: int = 3):
        '''
        CART decision-tree classifier with optional CCP post-pruning.

        :param max_depth: maximum tree depth (None = unlimited)
        :param min_samples_split: minimum samples an internal node needs to be split
        :param min_samples_leaf: minimum samples a leaf must keep; splits producing
                                 smaller leaves are discarded
        :param min_impurity_decrease: minimum Gini gain required to accept a split
        :param ispruning: whether to run CCP post-pruning after fitting (default False)
        :param nsplits: number of cross-validation folds used by post-pruning (default 3)
        '''
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.nodes = 0              # running node counter used by CART()
        self.tree = None            # fitted tree (nested dict) after fit()
        self.type_feature = None    # per-feature flag: 0 discrete / 1 continuous
        self.ispruning = ispruning
        self.nsplits = nsplits
def Gini(self,X,y):
    '''
    Gini impurity of the labels ``y``: 1 - sum_k p_k^2.

    :param X: feature rows of this node (used only for mask counting)
    :param y: label vector of this node
    :return: Gini impurity in [0, 1)
    '''
    total = len(X)
    impurity = 1.0
    for label in np.unique(y):
        frac = len(X[y == label]) / total
        impurity -= frac ** 2
    return impurity
def typeFeature(self,X):
# 表示特征是否为连续还是离散
n_sample,n_feature = X.shape
self.type_feature = []
#### 特征属性小于10个,认为是离散型数据用0表示,连续性数据用1 表示
for f_idx in range(n_feature):
if len(np.unique(X[:, f_idx]))< 10:
self.type_feature.append(0)
else:
self.type_feature.append(1)
return self.type_feature
def binSplitData(self,X,y,f_idx,f_val):
'''
二分类划分数据集
:param X 划分数据
:param f_idx: 数据X的第f_idx个特征 X.iloc[:,f_idx] 太慢 用 X.columns[f_idx]
np.unique(X.at[:,f_idx])可以得到该特征的属性,如 array(['Overcast', 'Rain', 'Sunny'], dtype=object)
:param f_val: 数据f_idx个特征中的属性值,即上面中的一种 'Overcast'/ 'Rain'/ 'Sunny
:param type_feature: 离散特征 0 连续特征 1
:return: 二分后的左右数据子集
'''
### att 数有数据在第f_idx的特征的所有属性,将不等于 f_val 分为一类,其余分为另一类
#################### 0: 离散类型特征二分方法 1:连续数据 ############################
att=X[:, f_idx]
if self.type_feature[f_idx]== 0:
X_left = X[att == f_val]
X_right = X[att != f_val]
y_left = y[att == f_val]
y_right = y[att != f_val]
else:
X_left = X[att <= f_val]
X_right = X[att >f_val]
y_left = y[att <= f_val]
y_right = y[att > f_val]
## 切分点和样本点的索引
return X_left, X_right, y_left, y_right
def bestSplit(self,X,y):
'''
找到最佳分割特征与特征值
:param X
:return: best_f_idx 最佳分割特征 , best_f_val 特征值
'''
best_gain = 0
n_sample,n_feature = X.shape
best_f_idx = None
## 根据大数原则,找到X特征中出现最多的属性作为特征值
best_f_val = stats.mode(y)[0][0]
## 第一个终止条件: 当叶子节点中的样本数小于最小分割值,或者所有样本属于同一类别时,不再分割
if n_sample < self.min_samples_split or len(np.unique(y)) == 1:
return best_f_idx,best_f_val
Gini_before= self.Gini(X,y)
##-------------------------通过不断二分的过程 寻找对于某个特征,的最佳分割点---------------------------
for f_idx in range(n_feature):
##-------------------------如果该特征中的属性个数小于10,则认为是离散数据 type_feature = 0,否则else---------------------------
if self.type_feature[f_idx] == 0:
for f_val in np.unique(X[:, f_idx]):
## 当某个特征只有两个类别时,仅仅做一次左右子树的划分,不用重复操作
if len(np.unique(X[:, f_idx]))== 2 and f_val == np.unique(X[:, f_idx])[0]:
continue
else:
X_left, X_right, y_left, y_right = self.__binSplitData(X,y,f_idx,f_val)
## 第二个终止条件: 分割后样本数据小于节点的最低样本数,则放弃分割
if len(X_left)<self.min_samples_leaf or len(X_right)<self.min_samples_leaf:
continue
## 如果不满足上述停止条件,则计算分割后的加权Gini
Gini_after = len(X_left)/len(X) * self.__Gini(X_left,y_left) + len(X_right)/len(X) * self.__Gini(X_right,y_right)
gain = Gini_before - Gini_after
## 第三个终止条件,当分裂后的增益小于阈值后者大于目前最大增益
if gain < self.min_impurity_decrease or gain < best_gain:
continue
else:
## 更新最大增益和最佳分裂位置
best_gain = gain
best_f_idx,best_f_val = f_idx,f_val
##------------------------- 连续特征属性的二分 case = 1 ---------------------------
else:
# for f_val in np.unique(X[:, f_idx]):
## 划分num个区间来进行二分,避免过拟合问题
for f_val in np.linspace(X[:, f_idx].min()+1,X[:, f_idx].max()-1,num=50):
X_left, X_right, y_left, y_right = self.binSplitData(X,y,f_idx,f_val)
## 第二个终止条件: 分割后样本数据小于节点的最低样本数,则放弃分割
if len(X_left)<self.min_samples_leaf or len(X_right)<self.min_samples_leaf:
continue
## 如果不满足上述停止条件,则计算分割后的加权Gini
Gini_after = len(X_left)/len(X) * self.Gini(X_left,y_left) + len(X_right)/len(X) * self.Gini(X_right,y_right)
#print('n_feature,f_val,Gini_after,',n_feature,f_val,Gini_after)
gain = Gini_before - Gini_after
print('len(X_left)/sumNan--------------------------------------------',len(X_left)/len(X))
print('len(X_right)/sumNan--------------------------------------------',len(X_right)/len(X))
## 第三个终止条件,当分裂后的增益小于阈值后者大于目前最大增益
if gain < self.min_impurity_decrease or gain < best_gain:
continue
else:
## 更新最大增益和最佳分裂位置
best_gain = gain
best_f_idx,best_f_val = f_idx,f_val
return best_f_idx,best_f_val
def CART(self,X,y):
    '''
    Recursively grow the CART tree as a nested dict.

    Node layout: 'cut_f' (feature idx), 'cut_val' (split value),
    'left_values'/'right_values' (Counter of labels per side),
    'left'/'right' (subtree dict, or a class label for a leaf).

    NOTE(review): the size guard compares the *total* node count
    ``self.nodes`` against 2**max_depth, which only approximates a depth
    limit. ``stats.mode(y)[0][0]`` relies on the pre-1.11 scipy API
    (keepdims behaviour) -- confirm the installed scipy version.

    :param X: feature rows of this node
    :param y: labels of this node
    :return: nested dict (internal node) or a class label (leaf)
    '''
    best_f_idx, best_f_val = self.bestSplit(X,y)
    self.nodes += 1
    # best_f_idx is None -> no admissible split: leaf with majority label
    if best_f_idx is None:
        return best_f_val
    # node budget exhausted -> leaf with this node's majority label
    if self.max_depth:
        if self.nodes >= 2**self.max_depth:
            return stats.mode(y)[0][0]
    tree = dict()
    tree['cut_f'] = best_f_idx
    tree['cut_val'] = best_f_val
    X_left, X_right, y_left, y_right = self.binSplitData(X,y,best_f_idx,best_f_val)
    tree['left_values'] = Counter(y_left)
    tree['right_values'] = Counter(y_right)
    tree['left'] = self.CART(X_left,y_left)
    tree['right'] = self.CART(X_right,y_right)
    return tree
def fit(self,X,y,sample_weight = None):
    '''
    Fit the tree on (X, y); X and y should be ndarrays (use df.values).

    When ``ispruning`` is True, builds the CCP sequence of pruned trees and
    keeps the one with the best stratified-CV accuracy (ties -> the later,
    more-pruned tree).

    NOTE(review): ``sample_weight`` is initialised but never used.
    NOTE(review): the CV loop re-predicts on folds of the *training* data
    with already-fitted trees (no refit per fold), so the score measures
    per-fold training accuracy rather than true generalisation.

    :param X: training features (ndarray)
    :param y: training labels (ndarray)
    :param sample_weight: currently unused; defaults to uniform weights
    :return: the fitted (possibly pruned) tree dict
    '''
    if sample_weight is None:
        # uniform weights: len(X) copies of 1/len(X)
        sample_weight = np.array([1/len(X)] * len(X))
    # flag each feature as discrete/continuous to pick the split rule
    self.type_feature = self.typeFeature(X)
    self.tree = self.CART(X,y)
    # if requested, run CCP post-pruning and model selection
    if self.ispruning == True:
        print('进入到后剪枝过程----------------------')
        testTree = self.pruningBox()
        from sklearn.model_selection import StratifiedKFold
        kfolder = StratifiedKFold(n_splits = self.nsplits)
        score = 0
        scoreBox = []
        for indexTree in range(len(testTree)):
            score = 0
            for Index_train, Index_test in kfolder.split(X,y):
                self.tree = testTree[indexTree]
                y_ = self.predict(X[Index_test])
                score += (y_ == y[Index_test]).mean()
            score = score / self.nsplits
            scoreBox.append(score)
        # last index attaining the max score -> prefer the more-pruned tree
        TreeIndex = np.argwhere(scoreBox == np.max(scoreBox))[-1][0]
        self.tree = testTree[TreeIndex]
    return self.tree
def predict(self,X_test):
'''
数据类别预测
:param X_test:预测数据
:return: y_: 类别预测结果
'''
return np.array([self.predict_one(x_test, self.tree) for x_test in X_test])
def predict_one(self,x_test,tree):
if isinstance(tree, dict): # 非叶节点才做左右判断
cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
if self.type_feature[cut_f_idx] == 0:
sub_tree = tree['left'] if x_test[cut_f_idx] == cut_val else tree['right']
else:
sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_val else tree['right']
return self.predict_one(x_test, sub_tree)
else:
return tree
##########################################剪枝过程需要用到的函数#######################################################
def getSplitTree(self,tree):
    '''
    Collect the immediate child subtrees of ``tree``.

    :param tree: a tree node (dict with 'left'/'right' branches)
    :return: list with a shallow copy of each child that is itself a
             subtree; leaf children are skipped
    '''
    children = []
    for branch in ('left', 'right'):
        child = tree[branch]
        if not isinstance(child, dict):
            continue  # leaf: nothing to extract
        children.append({field: child[field]
                         for field in ('cut_f', 'cut_val', 'left_values',
                                       'right_values', 'left', 'right')})
    return children
def getBoxTree(self,tree):
'''
获取所有节点的子树
:param: tree 整体子树
:param: boxTree = [tree] 能提取的子树集合
:return: boxTree 子树集合
'''
boxTree = [tree]
sub_Tree= self.getSplitTree(tree) ## 获取根节点的两个子树
boxTree = boxTree + sub_Tree
A= [1] ## 设置暂停获取条件,每个子树产生两个叉,所以每次循环最多得到4个分支,最少0个,当0个即已经到最大深度不再划分
while A != []:
# print('进入循环————————————————————————')
A = []
for i in range(len(sub_Tree)):
temp = self.getSplitTree(sub_Tree[i]) # 依次获取左右子树的“左右子树”
A += temp
if A == []:
break
else:
# print('获得子树————————————————————————',A)
sub_Tree = A
boxTree = boxTree + A
return boxTree
def getNodes(self,Box):
    '''
    For every subtree in ``Box``, count its leaf nodes and gather the class
    Counters stored at those leaves.

    NOTE(review): communicates with ``findnodes`` through the module-level
    globals ``node`` and ``ini_node_value`` -- fragile and not thread-safe;
    passing accumulators explicitly would be safer.

    :param Box: list of subtree dicts, as produced by ``getBoxTree``
    :return: (boxNodes, nodeValue) -- per-subtree leaf counts and, per
             subtree, the list of leaf Counter objects
    '''
    global node
    global ini_node_value
    boxNodes = []   # leaf count of each subtree
    nodeValue = []  # leaf-value Counters of each subtree
    ini_node_value = []
    node = 0
    for k in range(len(Box)):
        # findnodes accumulates into the globals and returns them
        node,ini_node_value = self.findnodes(Box[k])
        boxNodes.append(node)
        node = 0              # reset accumulators for the next subtree
        nodeValue.append(ini_node_value)
        ini_node_value =[]
    return boxNodes,nodeValue
def findnodes(self,data):
    '''
    Recursively count the leaf nodes of ``data`` and collect their class
    Counters, accumulating into the globals ``node`` / ``ini_node_value``.

    NOTE(review): 'left_values'/'right_values' are Counter objects, which
    are dict subclasses, so the isinstance check also recurses into them;
    that recursion is harmless (their keys are class labels, never
    'left'/'right') but wasteful.

    :param data: subtree dict
    :return: (node, ini_node_value) -- the current global accumulators
    '''
    global node
    global ini_node_value
    for key, val in data.items():
        if isinstance(data[key],dict):
            self.findnodes(data[key])
        else:
            # a non-dict branch value means this side is a leaf
            if key == 'left':
                node += 1
                ini_node_value.append(data['left_values'])
            elif key == 'right':
                node += 1
                ini_node_value.append(data['right_values'])
    return node,ini_node_value
def costFunction(self,Tt,boxNodes,nodeValue):
    '''
    Compute the CCP criterion g(t) = (R(t) - R(Tt)) / (|Tt| - 1) for the
    subtree rooted at ``Tt``.

    NOTE(review): depends on the module-level global ``X_train`` (defined in
    the __main__ section) for the total sample count -- this breaks if the
    class is imported elsewhere; the training size should be stored on self.

    :param Tt: subtree whose root node t is evaluated
    :param boxNodes: number of leaf nodes of ``Tt``
    :param nodeValue: list of leaf Counter objects of ``Tt``
    :return: one-element list [g(t)]
    '''
    gt = []
    value = Tt['left_values']+ Tt['right_values']  # class counts at node t
    Value = []
    for key, val in value.items():
        Value.append(val)
    Value = np.array(Value)
    # r(t): misclassification rate at node t -- majority vote, so the
    # minority count is the error
    error = Value.min()/ Value.sum()
    # p(t): fraction of all training samples reaching this node
    p_node = Value.sum() / len(X_train)
    Rt = error * p_node
    RTt = 0
    # R(Tt): summed misclassification rate over the leaves of the subtree
    for i in range(len(nodeValue)):
        num = []
        if len(nodeValue[i])==1: ## pure leaf: contributes no error
            continue
        else:
            for key, val in nodeValue[i].items():
                num.append(val)
            num = np.array(num)
            RTt += (num.sum()-num.max())/ len(X_train)
    gt_add = (Rt - RTt)/ (boxNodes-1)
    # clamp tiny float noise to exactly 0.0
    gt_add = 0.0 if (np.abs(gt_add - 0)<1e-13) else gt_add
    gt.append(gt_add)
    return gt
def find_path(self,iterable, mode, target, current_path='root'):
    '''
    Recursively search a nested dict for ``target`` and return every
    dot-separated path that reaches it.

    referenced by
    ----------
    https://blog.csdn.net/qq_47110957/article/details/106982333

    Parameters
    ----------
    iterable : dict or list or tuple
        The object to search (only dicts are actually descended into).
    mode : int
        0 looks up key names, 1 looks up values.
    target : arbitrary type
        The key name (mode 0) or value (mode 1) to find.
    current_path : string, optional
        Path accumulator used by the recursion; the default is 'root'
        (``pruning`` passes '' so it can prepend its own root marker).

    Returns
    -------
    path : list
        All dot-separated paths that reach ``target``.
    '''
    path = []
    if type(iterable) == type({}):  # only dicts are traversed
        if mode == 0:  # search by key name
            for key in iterable.keys():
                c_p = current_path + " .'%s'" % key
                if key == target:
                    path.append(c_p)
                if type(iterable[key]) in [type({}),type(()),type([])]:  # descend into nested containers
                    path += self.find_path(iterable[key], mode, target, c_p)
        elif mode == 1:  # search by value
            for key in iterable.keys():
                c_p = current_path + '.'+'%s'%key
                if iterable[key] == target:
                    path.append(c_p)
                if type(iterable[key]) in [type({}),type(()),type([])]:  # descend into nested containers
                    path += self.find_path(iterable[key], mode, target, c_p)
    return path
def replace(self,obj, replaceContent):
    '''
    Return copies of ``obj`` in which the subtree addressed by the global
    ``keyPath`` is replaced by each element of ``replaceContent`` (the
    pruned-leaf class label).

    NOTE(review): reads the module-level global ``keyPath`` (set by
    ``pruning``) instead of taking the path as a parameter, and performs the
    assignment with ``exec`` on a string built from that path -- never feed
    it untrusted input; an explicit loop over the path keys would be safer.

    :param obj: tree dict to prune (deep-copied; the original is untouched)
    :param replaceContent: values to splice in at the path (class labels)
    :return: list of pruned deep copies, one per (path, content) pair
    '''
    replacePath = []
    pathStr = ""
    # translate "root.key1.key2" into 'copyObj["key1"]["key2"]'
    for s in keyPath.split("."):
        if s == "root":
            pathStr += "copyObj"
        elif s.isdigit():
            pathStr += f"[{s}]"
        else:
            pathStr += f'["{s}"]'
    # only the fully-built path is used; appending prefixes would produce
    # no-op exec assignments ("copyObj = content" cannot rebind the local)
    replacePath.append(pathStr)
    replaceObj = []
    for path in replacePath:
        for content in replaceContent:
            copyObj = copy.deepcopy(obj)
            exec(f"{path} = content")  # assigns into copyObj via the path
            replaceObj.append(copyObj)
    return replaceObj
def pruningBox(self):
    '''
    Build the CCP tree sequence {T0, T1, ..., Tm}: repeatedly prune the
    weakest subtree of the latest tree until only the root split remains.

    :return: list of trees, starting with the unpruned self.tree
    '''
    testTree = [self.tree]
    # keep pruning while the latest tree still has dict-valued branches
    while self.isnotRoot(testTree[-1]):
        replaceObj= self.pruning(testTree[-1])
        testTree.append(replaceObj[0])
    return testTree
def pruning(self,tree):
    '''
    Perform one CCP pruning step on ``tree``.

    Finds the internal node with the smallest g(t), replaces it by its
    majority class label, and returns the resulting pruned tree(s).

    NOTE(review): passes the node path to ``replace`` through the
    module-level global ``keyPath``; ``costFunction`` additionally needs
    the global ``X_train``.

    :param tree: tree dict to prune
    :return: list of pruned deep copies produced by ``replace``
    '''
    boxTree = [tree]
    # enumerate every internal-node subtree: t0 (root), t1, t2, ...
    Box = self.getBoxTree(tree)
    # leaf counts and leaf-label Counters for each subtree
    boxNodes,nodeValue = self.getNodes(Box)
    # g(t) of every internal node
    gt = np.array([self.costFunction(Box[idx],boxNodes[idx],nodeValue[idx]) for idx in range(len(Box))])
    # weakest link: the LAST node attaining the minimal g(t)
    del_node = np.argwhere(gt == gt.min())[-1][0]
    box = Box[del_node]
    # locate that subtree inside the root tree (mode 1 = search by value)
    keyName = self.find_path(Box[0],1, box, current_path='')
    # majority label that will replace the pruned subtree
    label = (box['left_values']+ box['right_values']).most_common(1)[0][0]
    jsonObj = dict.copy(Box[0])
    newObj = dict()
    # hand the path to replace() through the global keyPath
    global keyPath
    keyPath="root"
    keyPath += keyName[0]
    return self.replace(obj=jsonObj, replaceContent=[label])
def isnotRoot(self,data):
    '''
    Return 0 when ``data`` is a root-only tree (both branches are leaves),
    1 otherwise.

    'left_values'/'right_values' are Counters (dict subclasses) and so do
    not count as non-dict entries; a root-only tree therefore has exactly 4
    non-dict values: cut_f, cut_val, left, right.
    '''
    non_dict_entries = sum(
        1 for value in data.values() if not isinstance(value, dict)
    )
    return 0 if non_dict_entries == 4 else 1
将ispruning设为True,就是表示进行剪枝操作,默认不进行
if __name__ == '__main__':
    # NOTE: these imports become module-level globals that the class relies
    # on at runtime (np, stats, Counter, copy), and X_train below is the
    # global read by costFunction during pruning.
    from collections import Counter
    from sklearn import datasets
    import pandas as pd
    import numpy as np
    from scipy import stats  # used for the mode (majority label)
    from sklearn.model_selection import train_test_split
    import copy
    ############ dataset 1: continuous-feature data ############
    print('数据特征为连续属性,测试结果为')
    data = datasets.load_breast_cancer()
    X, Y = data.data, data.target
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = DecisionTreeClassifier(ispruning=True,nsplits=8)
    tree = tree_clf.fit(X_train, Y_train)
    Y_pred = tree_clf.predict(X_test)
    print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))