可以参考(一)CART基本原理和(二)CART+CCP原理,了解前一部分内容。本文结果通过cancer数据验证。目前不能对带有缺失值的字符型特征进行处理,因为numpy中的一些操作(例如np.isnan)不支持字符型数组,而dataframe的iloc切片效率又太低,所以没有实现
本文主要解释CART的缺失值处理,以及含有缺失值的数据的分类
对于缺失值的处理,其实就是将每个数据附上权重,本身权重都是1,对于该特征缺失的数据,根据该属性的数据比率为空值数据赋值,过程可以分为以下步骤:
1. 对于特征K,首先只用该特征上没有缺失值的数据,计算划分前和划分后的Gini系数,得到增益;再将该增益乘上该特征无缺失值数据所占的比率,得到最终的增益
2. 此时可以得到左右子树,以及进入每个子树的数据,所以就可以得到左右子树的数据比重。对于该特征有缺失值的数据,就按照这个比重分配到两个子树中。假设8号数据在该特征上有缺失值,则8号数据会同时存在于左右子树中,只不过在每个子树中的权重不同
3.重复1.2操作可以完成有缺失值树的建立。注意的就是,之前建立树的时候思想是每个类别数据的个数,现在应该转变为每个节点中类别的概率
这部分主要是对(一)中的Gini系数计算函数以及binSplitData二分类划分数据集还有bestSplit找到最佳分割点的函数进行了改进
def Gini(self,y,sample_weight):
    '''
    Weighted Gini impurity of a set of labelled samples.

    :param y: class label of every sample
    :param sample_weight: weight of every sample; NaN entries mark samples
        that are left out of the computation
    :return: 1 - sum_k p_k**2, where p_k is the share of the total weight
        carried by class k; 0.0 when no weight is left (empty input, all
        weights NaN, or a total weight of zero)
    '''
    y = np.asarray(y)
    sample_weight = np.asarray(sample_weight, dtype=float)
    usable = ~np.isnan(sample_weight)
    weights = sample_weight[usable]
    total = weights.sum()
    # An empty or weightless node has nothing to be impure about; without
    # this guard the result would be 1.0 (empty) or NaN (zero total).
    if total <= 0:
        return 0.0
    _, codes = np.unique(y[usable], return_inverse=True)
    shares = np.bincount(codes.ravel(), weights=weights) / total
    return 1.0 - np.sum(shares ** 2)
def binSplitData(self,X,y,f_idx,f_val,sample_weight):
    '''
    Binary split of the samples whose value for one feature is not missing.

    Rows whose feature value is NaN are returned on neither side.

    :param X: feature matrix (2-D ndarray)
    :param y: labels, aligned with the rows of X
    :param f_idx: column index of the feature to split on
    :param f_val: split value; for a discrete feature
        (self.type_feature[f_idx] == 0) the left side is ``== f_val``,
        otherwise the left side is ``<= f_val``
    :param sample_weight: per-sample weights, aligned with the rows of X
    :return: X_left, X_right, y_left, y_right, weight_left, weight_right
    '''
    column = X[:, f_idx]
    present = ~np.isnan(column)
    if self.type_feature[f_idx] == 0:
        goes_left = column == f_val
    else:
        goes_left = column <= f_val
    left_mask = goes_left & present
    right_mask = ~goes_left & present
    return (X[left_mask], X[right_mask],
            y[left_mask], y[right_mask],
            sample_weight[left_mask], sample_weight[right_mask])
def bestSplit(self,X,y,sample_weight):
    '''
    Find the feature and split value with the largest weighted Gini gain.

    For every feature only the rows where that feature is known are used to
    measure impurity before and after the split; the resulting gain is then
    scaled by the share of the node weight carried by those rows.

    :param X: feature matrix (2-D ndarray, NaN marks a missing value)
    :param y: labels, aligned with the rows of X
    :param sample_weight: per-sample weights, aligned with the rows of X
    :return: (best_f_idx, best_f_val). best_f_idx is None when the node
        must become a leaf; best_f_val is then the weighted majority class
        (None for an empty node).
    '''
    y = np.asarray(y)
    sample_weight = np.asarray(sample_weight, dtype=float)
    n_sample, n_feature = X.shape
    best_gain = 0
    best_f_idx = None
    if n_sample == 0:
        return None, None

    # Default answer: the class carrying the most weight in this node.
    classes, codes = np.unique(y, return_inverse=True)
    best_f_val = classes[np.argmax(np.bincount(codes.ravel(), weights=sample_weight))]

    # Stop 1: too few samples to split, or the node is already pure.
    if n_sample < self.min_samples_split or len(classes) == 1:
        return best_f_idx, best_f_val
    total_weight = sample_weight.sum()
    if total_weight <= 0:
        return best_f_idx, best_f_val

    # A child must never be empty, whatever min_samples_leaf says.
    min_leaf = max(self.min_samples_leaf, 1)

    for f_idx in range(n_feature):
        column = X[:, f_idx]
        known = ~np.isnan(column)
        if not known.any():
            continue
        known_weight = sample_weight[known].sum()
        if known_weight <= 0:
            continue

        if self.type_feature[f_idx] == 0:
            # Discrete feature: one-vs-rest on every observed value. With
            # exactly two values both splits are mirror images, so one is
            # enough.
            candidates = np.unique(column[known])
            if len(candidates) == 2:
                candidates = candidates[1:]
        else:
            # Continuous feature: 50 evenly spaced thresholds strictly
            # inside the observed range, whatever the feature's scale.
            low, high = column[known].min(), column[known].max()
            if low == high:
                continue
            candidates = np.linspace(low, high, num=52)[1:-1]

        gini_before = self.Gini(y[known], sample_weight[known])
        for f_val in candidates:
            X_left, X_right, y_left, y_right, weight_left, weight_right = \
                self.binSplitData(X, y, f_idx, f_val, sample_weight)
            # Stop 2: a child would be smaller than the minimum leaf size.
            if len(X_left) < min_leaf or len(X_right) < min_leaf:
                continue
            gini_after = (weight_left.sum() * self.Gini(y_left, weight_left)
                          + weight_right.sum() * self.Gini(y_right, weight_right)) / known_weight
            gain = (gini_before - gini_after) * known_weight / total_weight
            # Stop 3: gain below the threshold or below the best so far
            # (ties go to the later candidate).
            if gain < self.min_impurity_decrease or gain < best_gain:
                continue
            best_gain = gain
            best_f_idx, best_f_val = f_idx, f_val
    return best_f_idx, best_f_val
def CART(self,X,y,sample_weight,depth=0):
    '''
    Recursively grow a CART tree.

    A leaf is a bare class label. An internal node is a dict with the keys
    'cut_f', 'cut_val', 'left_values', 'right_values', 'left' and 'right'.
    Rows whose split feature is missing are sent to both children, with
    their current weight divided in proportion to the weight of the rows
    that could be routed.

    :param X: feature matrix (2-D ndarray, NaN marks a missing value)
    :param y: labels, aligned with the rows of X
    :param sample_weight: per-sample weights, aligned with the rows of X
    :param depth: depth of the node being built (0 for the root); callers
        normally leave the default
    :return: the (sub)tree
    '''
    y = np.asarray(y)
    sample_weight = np.asarray(sample_weight, dtype=float)
    best_f_idx, best_f_val = self.bestSplit(X, y, sample_weight)
    self.nodes += 1
    # No admissible split: bestSplit already returned the leaf label.
    if best_f_idx is None:
        return best_f_val
    # Depth limit reached: leaf labelled with the weighted majority class.
    if self.max_depth and depth >= self.max_depth:
        classes, codes = np.unique(y, return_inverse=True)
        return classes[np.argmax(np.bincount(codes.ravel(), weights=sample_weight))]

    tree = dict()
    tree['cut_f'] = best_f_idx
    tree['cut_val'] = best_f_val
    X_left, X_right, y_left, y_right, weight_left, weight_right = \
        self.binSplitData(X, y, best_f_idx, best_f_val, sample_weight)

    # Share of the routable weight that went left; fall back to row counts
    # if that weight is zero.
    known_weight = weight_left.sum() + weight_right.sum()
    if known_weight > 0:
        left_share = weight_left.sum() / known_weight
    else:
        left_share = len(X_left) / (len(X_left) + len(X_right))

    # Rows with a missing split feature enter both children. Their weight
    # is scaled, not reset, so it keeps shrinking along the path.
    missing = np.isnan(X[:, best_f_idx])
    weight_left = np.concatenate((weight_left, sample_weight[missing] * left_share))
    weight_right = np.concatenate((weight_right, sample_weight[missing] * (1.0 - left_share)))
    X_left = np.concatenate((X_left, X[missing, :]))
    X_right = np.concatenate((X_right, X[missing, :]))
    y_left = np.concatenate((y_left, y[missing]))
    y_right = np.concatenate((y_right, y[missing]))

    tree['left_values'] = self.Counter(y_left, weight_left)
    tree['right_values'] = self.Counter(y_right, weight_right)
    tree['left'] = self.CART(X_left, y_left, weight_left, depth + 1)
    tree['right'] = self.CART(X_right, y_right, weight_right, depth + 1)
    return tree
建立好的树同样可以进行剪枝操作,与(二)中基本无异。另外就是对于有缺失值的测试数据如何分类的问题。这里采用的方式是:如果一个数据的特征在该节点正好是缺失的,就将该数据复制成两份,分别进入到左右子树中,不同的只是各自的权重不一样,权重按照建立该树时左右子树训练数据的比重分配。最后找到所有可能到达的叶子节点,将权重按类别相加,概率最高的类别即是该数据的类别。这里主要是针对预测函数的改进
def predict(self,X_test,sample_weight = None):
    '''
    Predict a class label for every row of X_test.

    :param X_test: feature matrix (2-D ndarray, NaN marks a missing value)
    :param sample_weight: optional starting weight per row (default 1)
    :return: y_pre: list with one predicted label per row
    '''
    # predict_one accumulates the votes of missing-value branches in the
    # module-level dict `dic`, so it is reset for every row.
    global dic
    X_test = np.asarray(X_test)
    if sample_weight is None:
        sample_weight = np.ones(len(X_test))
    y_pre = []
    for k in range(len(X_test)):
        dic = {key: 0 for key in self.K}
        label, weight = self.predict_one(X_test[k, :], self.tree, sample_weight[k])
        # Without missing values nothing was accumulated and the returned
        # leaf carries the full weight; with missing values the returned
        # weight is 0 and the votes are already in `dic`.
        dic[label] = dic.get(label, 0) + weight
        y_pre.append(max(dic, key=lambda x: dic[x]))
    return y_pre
def predict_one(self,x_test,tree,sample_weight):
    '''
    Route one sample down the tree; missing feature values are supported.

    When the feature tested at a node is NaN, the sample follows both
    branches with its weight divided by self.getWeightY(node), and the
    weight reaching each leaf is added to the module-level dict `dic`
    under that leaf's label.

    :param x_test: 1-D feature vector
    :param tree: (sub)tree to descend; anything that is not a dict is a leaf
    :param sample_weight: weight the sample carries on entering `tree`
    :return: (label, weight). On a path without missing values this is the
        leaf label and the unchanged weight. After a missing-value split the
        weight is 0 because all of it has already been credited to `dic`.
    '''
    global dic
    if not isinstance(tree, dict):
        return tree, sample_weight

    cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
    value = x_test[cut_f_idx]

    if np.isnan(value):
        # Same handling for discrete and continuous features.
        left_weight, right_weight = sample_weight * self.getWeightY(tree)
        label = None
        for sub_tree, share in ((tree['left'], left_weight), (tree['right'], right_weight)):
            label, weight = self.predict_one(x_test, sub_tree, share)
            dic[label] = dic.get(label, 0) + weight
        return label, 0

    if self.type_feature[cut_f_idx] == 0:
        goes_left = value == cut_val   # discrete: equality test
    else:
        goes_left = value <= cut_val   # continuous: threshold test
    sub_tree = tree['left'] if goes_left else tree['right']
    return self.predict_one(x_test, sub_tree, sample_weight)
def getWeightY(self,tree):
    '''
    Shares of the training weight that went to each child of a node.

    :param tree: internal node holding 'left_values' and 'right_values'
        (class -> weight dicts)
    :return: ndarray [left_share, right_share]
    '''
    left_total = sum(tree['left_values'].values())
    right_total = sum(tree['right_values'].values())
    both = left_total + right_total
    return np.array([left_total / both, right_total / both])
以下是完整的代码 加入CCP和缺失值处理
## 注意dataframe 类型数据iloc很慢,可以用apply函数尝试,或者替换成ndarray格式
import copy

import numpy as np


class DecisionTreeClassifier():
    '''
    CART classification tree with missing-value (NaN) support and optional
    cost-complexity post-pruning.

    Tree representation: a leaf is a bare class label; an internal node is
    a dict with the keys 'cut_f' (feature index), 'cut_val' (split value),
    'left_values' / 'right_values' (class -> training weight sent to that
    child) and 'left' / 'right' (the child subtrees).

    Only numeric features are supported, because missing values are
    detected with np.isnan.
    '''

    def __init__(self, max_depth: int = None, min_samples_split: int = 5,
                 min_samples_leaf: int = 5, min_impurity_decrease: float = 0.0,
                 ispruning: bool = False, nsplits: int = 3, isNan: bool = False):
        '''
        :param max_depth: maximum depth of the tree (None or 0: unlimited)
        :param min_samples_split: minimum number of rows a node needs to be split
        :param min_samples_leaf: minimum number of routable rows in each child
        :param min_impurity_decrease: minimum gain a split must reach
        :param ispruning: run post-pruning at the end of fit (default False)
        :param nsplits: number of folds used to score the pruned candidates
        :param isNan: stored on the instance; not read by any method here
        '''
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.nodes = 0             # number of nodes visited while growing
        self.tree = None
        self.type_feature = None   # per feature: 0 = discrete, 1 = continuous
        self.ispruning = ispruning
        self.nsplits = nsplits
        self.isNan = isNan
        self.K = None              # sorted class labels seen by fit
        self._total_weight = None  # total training weight, used when pruning
        self._key_path = 'root'    # path of the node chosen by the last pruning()

    # ------------------------------------------------------------------
    # growing the tree
    # ------------------------------------------------------------------
    def _majority(self, y, sample_weight):
        '''Class carrying the most weight (smallest label wins a tie).'''
        classes, codes = np.unique(y, return_inverse=True)
        return classes[np.argmax(np.bincount(codes.ravel(), weights=sample_weight))]

    def Gini(self, y, sample_weight):
        '''
        Weighted Gini impurity of a set of labelled samples.

        :param y: class label of every sample
        :param sample_weight: weight of every sample; NaN entries mark
            samples that are left out of the computation
        :return: 1 - sum_k p_k**2 with p_k the weight share of class k;
            0.0 when no weight is left
        '''
        y = np.asarray(y)
        sample_weight = np.asarray(sample_weight, dtype=float)
        usable = ~np.isnan(sample_weight)
        weights = sample_weight[usable]
        total = weights.sum()
        if total <= 0:
            return 0.0
        _, codes = np.unique(y[usable], return_inverse=True)
        shares = np.bincount(codes.ravel(), weights=weights) / total
        return 1.0 - np.sum(shares ** 2)

    def typeFeature(self, X):
        '''
        Mark every feature as discrete (0) or continuous (1).

        A feature with fewer than 10 distinct non-missing values is treated
        as discrete. The result is also stored in self.type_feature.
        '''
        self.type_feature = []
        for f_idx in range(X.shape[1]):
            column = X[:, f_idx]
            n_values = len(np.unique(column[~np.isnan(column)]))
            self.type_feature.append(0 if n_values < 10 else 1)
        return self.type_feature

    def binSplitData(self, X, y, f_idx, f_val, sample_weight):
        '''
        Binary split of the rows whose value for feature f_idx is known.

        Discrete feature: left is ``== f_val``; continuous feature: left is
        ``<= f_val``. Rows with a missing value are returned on neither side.

        :return: X_left, X_right, y_left, y_right, weight_left, weight_right
        '''
        column = X[:, f_idx]
        present = ~np.isnan(column)
        if self.type_feature[f_idx] == 0:
            goes_left = column == f_val
        else:
            goes_left = column <= f_val
        left_mask = goes_left & present
        right_mask = ~goes_left & present
        return (X[left_mask], X[right_mask],
                y[left_mask], y[right_mask],
                sample_weight[left_mask], sample_weight[right_mask])

    def bestSplit(self, X, y, sample_weight):
        '''
        Find the feature and split value with the largest weighted Gini gain.

        For every feature only the rows where it is known are used to measure
        impurity before and after the split; the gain is then scaled by the
        share of the node weight carried by those rows.

        :return: (best_f_idx, best_f_val). best_f_idx is None when the node
            must become a leaf; best_f_val is then the weighted majority
            class (None for an empty node).
        '''
        y = np.asarray(y)
        sample_weight = np.asarray(sample_weight, dtype=float)
        n_sample, n_feature = X.shape
        best_gain = 0
        best_f_idx = None
        if n_sample == 0:
            return None, None
        best_f_val = self._majority(y, sample_weight)

        # Stop 1: too few samples to split, or the node is already pure.
        if n_sample < self.min_samples_split or len(np.unique(y)) == 1:
            return best_f_idx, best_f_val
        total_weight = sample_weight.sum()
        if total_weight <= 0:
            return best_f_idx, best_f_val

        # A child must never be empty, whatever min_samples_leaf says.
        min_leaf = max(self.min_samples_leaf, 1)

        for f_idx in range(n_feature):
            column = X[:, f_idx]
            known = ~np.isnan(column)
            if not known.any():
                continue
            known_weight = sample_weight[known].sum()
            if known_weight <= 0:
                continue

            if self.type_feature[f_idx] == 0:
                # One-vs-rest on every observed value; with exactly two
                # values both splits are mirror images, so one is enough.
                candidates = np.unique(column[known])
                if len(candidates) == 2:
                    candidates = candidates[1:]
            else:
                # 50 evenly spaced thresholds strictly inside the observed
                # range, whatever the feature's scale.
                low, high = column[known].min(), column[known].max()
                if low == high:
                    continue
                candidates = np.linspace(low, high, num=52)[1:-1]

            gini_before = self.Gini(y[known], sample_weight[known])
            for f_val in candidates:
                X_left, X_right, y_left, y_right, weight_left, weight_right = \
                    self.binSplitData(X, y, f_idx, f_val, sample_weight)
                # Stop 2: a child would be smaller than the minimum leaf size.
                if len(X_left) < min_leaf or len(X_right) < min_leaf:
                    continue
                gini_after = (weight_left.sum() * self.Gini(y_left, weight_left)
                              + weight_right.sum() * self.Gini(y_right, weight_right)) / known_weight
                gain = (gini_before - gini_after) * known_weight / total_weight
                # Stop 3: gain below the threshold or below the best so far
                # (ties go to the later candidate).
                if gain < self.min_impurity_decrease or gain < best_gain:
                    continue
                best_gain = gain
                best_f_idx, best_f_val = f_idx, f_val
        return best_f_idx, best_f_val

    def CART(self, X, y, sample_weight, depth=0):
        '''
        Recursively grow the tree.

        Rows whose split feature is missing are sent to both children, with
        their current weight divided in proportion to the weight of the rows
        that could be routed.

        :param depth: depth of the node being built (0 for the root)
        :return: the (sub)tree
        '''
        y = np.asarray(y)
        sample_weight = np.asarray(sample_weight, dtype=float)
        best_f_idx, best_f_val = self.bestSplit(X, y, sample_weight)
        self.nodes += 1
        # No admissible split: bestSplit already returned the leaf label.
        if best_f_idx is None:
            return best_f_val
        # Depth limit reached: leaf labelled with the weighted majority class.
        if self.max_depth and depth >= self.max_depth:
            return self._majority(y, sample_weight)

        tree = dict()
        tree['cut_f'] = best_f_idx
        tree['cut_val'] = best_f_val
        X_left, X_right, y_left, y_right, weight_left, weight_right = \
            self.binSplitData(X, y, best_f_idx, best_f_val, sample_weight)

        known_weight = weight_left.sum() + weight_right.sum()
        if known_weight > 0:
            left_share = weight_left.sum() / known_weight
        else:
            left_share = len(X_left) / (len(X_left) + len(X_right))

        # Rows with a missing split feature enter both children; their
        # weight is scaled, not reset, so it keeps shrinking along the path.
        missing = np.isnan(X[:, best_f_idx])
        weight_left = np.concatenate((weight_left, sample_weight[missing] * left_share))
        weight_right = np.concatenate((weight_right, sample_weight[missing] * (1.0 - left_share)))
        X_left = np.concatenate((X_left, X[missing, :]))
        X_right = np.concatenate((X_right, X[missing, :]))
        y_left = np.concatenate((y_left, y[missing]))
        y_right = np.concatenate((y_right, y[missing]))

        tree['left_values'] = self.Counter(y_left, weight_left)
        tree['right_values'] = self.Counter(y_right, weight_right)
        tree['left'] = self.CART(X_left, y_left, weight_left, depth + 1)
        tree['right'] = self.CART(X_right, y_right, weight_right, depth + 1)
        return tree

    def Counter(self, y, sample_weight):
        '''
        Total weight per class; used for 'left_values' / 'right_values'.

        :return: dict class label -> summed weight
        '''
        return {key: sample_weight[y == key].sum() for key in np.unique(y)}

    def fit(self, X, y, sample_weight=None):
        '''
        Grow the tree and, if self.ispruning is set, post-prune it.

        :param X: numeric feature matrix (NaN marks a missing value); a
            DataFrame should be passed as df.values
        :param y: labels
        :param sample_weight: optional per-sample weights (default: all 1)
        :return: the fitted tree (also stored in self.tree)
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.K = np.unique(y)
        if sample_weight is None:
            sample_weight = np.ones(len(X))
        else:
            sample_weight = np.asarray(sample_weight, dtype=float)
        self.nodes = 0
        self._total_weight = sample_weight.sum()
        # Mark each feature as discrete or continuous to pick the split rule.
        self.type_feature = self.typeFeature(X)
        self.tree = self.CART(X, y, sample_weight)

        if self.ispruning:
            print('进入到后剪枝过程----------------------')
            testTree = self.pruningBox()
            from sklearn.model_selection import StratifiedKFold
            kfolder = StratifiedKFold(n_splits=self.nsplits)
            # NOTE: the candidates are not refitted per fold; each one is
            # scored on stratified folds of the data it was grown from.
            scoreBox = []
            for candidate in testTree:
                self.tree = candidate
                score = 0
                for _, Index_test in kfolder.split(X, y):
                    y_ = np.asarray(self.predict(X[Index_test]))
                    score += (y_ == y[Index_test]).mean()
                scoreBox.append(score / self.nsplits)
            # Among equally good candidates keep the most heavily pruned one.
            TreeIndex = np.argwhere(np.asarray(scoreBox) == np.max(scoreBox))[-1][0]
            self.tree = testTree[TreeIndex]
        return self.tree

    # ------------------------------------------------------------------
    # prediction
    # ------------------------------------------------------------------
    def predict(self, X_test, sample_weight=None):
        '''
        Predict a class label for every row of X_test.

        :param X_test: numeric feature matrix (NaN marks a missing value)
        :param sample_weight: optional starting weight per row (default 1)
        :return: list with one predicted label per row
        '''
        X_test = np.asarray(X_test, dtype=float)
        if sample_weight is None:
            sample_weight = np.ones(len(X_test))
        y_pre = []
        for k in range(len(X_test)):
            label, _ = self.predict_one(X_test[k, :], self.tree, sample_weight[k])
            y_pre.append(label)
        return y_pre

    def predict_one(self, x_test, tree, sample_weight):
        '''
        Classify one sample; missing feature values are supported.

        :param x_test: 1-D feature vector
        :param tree: (sub)tree to descend
        :param sample_weight: weight the sample carries on entering `tree`
        :return: (label, weight): the class that collected the most weight
            over all reachable leaves, and that weight. On a path without
            missing values this is the leaf label and the unchanged weight.
        '''
        votes = {} if self.K is None else {key: 0 for key in self.K}
        self._class_votes(x_test, tree, sample_weight, votes)
        label = max(votes, key=votes.get)
        return label, votes[label]

    def _class_votes(self, x_test, tree, weight, votes):
        '''Add the weight reaching every leaf of `tree` to votes[leaf label].'''
        if not isinstance(tree, dict):
            votes[tree] = votes.get(tree, 0) + weight
            return
        cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
        value = x_test[cut_f_idx]
        if np.isnan(value):
            # Missing value: follow both branches with a divided weight.
            left_share, right_share = self.getWeightY(tree)
            self._class_votes(x_test, tree['left'], weight * left_share, votes)
            self._class_votes(x_test, tree['right'], weight * right_share, votes)
            return
        if self.type_feature[cut_f_idx] == 0:
            goes_left = value == cut_val   # discrete: equality test
        else:
            goes_left = value <= cut_val   # continuous: threshold test
        self._class_votes(x_test, tree['left'] if goes_left else tree['right'], weight, votes)

    def getWeightY(self, tree):
        '''
        Shares of the training weight that went to each child of a node.

        :return: ndarray [left_share, right_share]; [0.5, 0.5] when the node
            recorded no weight at all
        '''
        left_total = sum(tree['left_values'].values())
        right_total = sum(tree['right_values'].values())
        both = left_total + right_total
        if both == 0:
            return np.array([0.5, 0.5])
        return np.array([left_total / both, right_total / both])

    # ------------------------------------------------------------------
    # helpers used by post-pruning
    # ------------------------------------------------------------------
    def getSplitTree(self, tree):
        '''Shallow copies of the children of `tree` that are internal nodes.'''
        keys = ('cut_f', 'cut_val', 'left_values', 'right_values', 'left', 'right')
        return [{k: tree[side][k] for k in keys}
                for side in ('left', 'right') if isinstance(tree[side], dict)]

    def getBoxTree(self, tree):
        '''
        Collect every internal node of `tree` as its own subtree.

        :param tree: the whole tree (an internal node)
        :return: list starting with `tree`, followed level by level (left
            before right) by the subtrees rooted at the other internal nodes
        '''
        boxTree = [tree]
        frontier = [tree]
        while frontier:
            next_level = []
            for node in frontier:
                next_level += self.getSplitTree(node)
            boxTree += next_level
            frontier = next_level
        return boxTree

    def _internal_nodes(self, tree):
        '''Level-order list of (path, node) for every internal node; path is
        a tuple of 'left'/'right' steps from the root.'''
        paths, nodes = [()], [tree]
        head = 0
        while head < len(nodes):
            node, path = nodes[head], paths[head]
            head += 1
            for side in ('left', 'right'):
                if isinstance(node[side], dict):
                    nodes.append(node[side])
                    paths.append(path + (side,))
        return paths, nodes

    def getNodes(self, Box):
        '''
        Leaf statistics of every subtree in Box.

        :return: (boxNodes, nodeValue): number of leaves per subtree, and per
            subtree the list of class -> weight dicts of its leaves
        '''
        boxNodes, nodeValue = [], []
        for sub_tree in Box:
            count, values = self.findnodes(sub_tree)
            boxNodes.append(count)
            nodeValue.append(values)
        return boxNodes, nodeValue

    def findnodes(self, data):
        '''
        Count the leaves below `data` and collect their class -> weight dicts
        (left to right).

        :return: (number of leaves, list of leaf value dicts)
        '''
        count, values = 0, []
        for side in ('left', 'right'):
            child = data[side]
            if isinstance(child, dict):
                sub_count, sub_values = self.findnodes(child)
                count += sub_count
                values += sub_values
            else:
                count += 1
                values.append(data[side + '_values'])
        return count, values

    def dicAdd(self, dic1, dic2):
        '''Key-wise sum of two class -> weight dicts (missing keys count as 0).'''
        merged = dict(dic1)
        for key, val in dic2.items():
            merged[key] = merged.get(key, 0) + val
        return merged

    def costFunction(self, Tt, boxNodes, nodeValue):
        '''
        Cost-complexity measure g(t) = (R(t) - R(T_t)) / (|leaves(T_t)| - 1).

        :param Tt: internal node t
        :param boxNodes: number of leaves below t
        :param nodeValue: class -> weight dicts of those leaves
        :return: one-element list [g(t)]
        :raises RuntimeError: when called before fit
        '''
        if self._total_weight is None:
            raise RuntimeError('fit must be called before pruning')
        total = self._total_weight
        counts = np.array(list(self.dicAdd(Tt['left_values'], Tt['right_values']).values()),
                          dtype=float)
        # R(t): weight misclassified if t were a leaf predicting its majority.
        Rt = (counts.sum() - counts.max()) / total
        # R(T_t): weight misclassified by the leaves of the subtree.
        RTt = 0.0
        for leaf in nodeValue:
            if len(leaf) <= 1:   # pure leaf, no error
                continue
            weights = np.array(list(leaf.values()), dtype=float)
            RTt += (weights.sum() - weights.max()) / total
        gt_add = (Rt - RTt) / max(boxNodes - 1, 1)
        if np.abs(gt_add) < 1e-13:
            gt_add = 0.0
        return [gt_add]

    def replace(self, obj, replaceContent, keyPath=None):
        '''
        Return copies of `obj` with the node at keyPath replaced.

        :param obj: tree to copy (left untouched)
        :param replaceContent: iterable of replacement values; one result is
            produced per value
        :param keyPath: dotted path such as 'root.left.right'; defaults to
            the path chosen by the last call to pruning()
        :return: list of new trees; for the path 'root' the replacement
            value itself is returned
        '''
        if keyPath is None:
            keyPath = self._key_path
        steps = [int(s) if s.isdigit() else s
                 for s in keyPath.split('.') if s and s != 'root']
        replaceObj = []
        for content in replaceContent:
            if not steps:
                replaceObj.append(content)
                continue
            copyObj = copy.deepcopy(obj)
            parent = copyObj
            for step in steps[:-1]:
                parent = parent[step]
            parent[steps[-1]] = content
            replaceObj.append(copyObj)
        return replaceObj

    def pruningBox(self):
        '''
        Build the sequence of candidate trees: the fitted tree followed by
        successively pruned versions, until only a stump or a leaf is left.
        '''
        testTree = [self.tree]
        while self.isnotRoot(testTree[-1]):
            testTree.append(self.pruning(testTree[-1])[0])
        return testTree

    def pruning(self, tree):
        '''
        Prune the internal node with the smallest g(t).

        The node is replaced by a leaf labelled with its majority class.
        Among equal g(t) the last node in level order is chosen.

        :return: one-element list holding the pruned copy of `tree`
        '''
        paths, nodes = self._internal_nodes(tree)
        boxNodes, nodeValue = self.getNodes(nodes)
        gt = np.array([self.costFunction(nodes[idx], boxNodes[idx], nodeValue[idx])[0]
                       for idx in range(len(nodes))])
        del_node = np.argwhere(gt == gt.min())[-1][0]
        box = nodes[del_node]
        merged = self.dicAdd(box['left_values'], box['right_values'])
        label = max(merged, key=merged.get)
        self._key_path = '.'.join(('root',) + paths[del_node])
        return self.replace(obj=tree, replaceContent=[label], keyPath=self._key_path)

    def find_path(self, iterable, mode, target, current_path='root'):
        '''
        Find every path inside a nested dict that leads to `target`.

        Adapted from https://blog.csdn.net/qq_47110957/article/details/106982333

        :param iterable: object to search; only dicts are searched
        :param mode: 0 to match key names, 1 to match values
        :param target: key name or value to look for
        :param current_path: path prefix of `iterable`; used by the recursion
        :return: list of path strings for every match
        '''
        path = []
        if not isinstance(iterable, dict):
            return path
        for key in iterable.keys():
            if mode == 0:
                c_p = current_path + " .'%s'" % key
                matched = key == target
            elif mode == 1:
                c_p = current_path + '.' + '%s' % key
                matched = iterable[key] == target
            else:
                break
            if matched:
                path.append(c_p)
            # Keep descending into containers.
            if isinstance(iterable[key], (dict, tuple, list)):
                path += self.find_path(iterable[key], mode, target, c_p)
        return path

    def isnotRoot(self, data):
        '''
        Tell whether `data` can still be pruned.

        :return: 0 when `data` is a leaf label or a node whose two children
            are both leaves (the candidate sequence is complete), else 1
        '''
        if not isinstance(data, dict):
            return 0
        non_dict = sum(1 for val in data.values() if not isinstance(val, dict))
        # cut_f, cut_val, left and right are all non-dicts: a stump.
        return 0 if non_dict == 4 else 1
将cancer数据随机产生一些空值,对模型测试,测试结果0.956,另外可以将数据进行归一化进一步提高精度
# Demo: inject random missing values into the breast-cancer data set, fit a
# post-pruned tree on 80% of it and print the accuracy on the remaining 20%.
if __name__ == '__main__':
    from collections import Counter
    from sklearn import datasets
    import pandas as pd
    import numpy as np
    from scipy import stats # used to compute the mode
    from sklearn.model_selection import train_test_split
    import copy
    ############ data 1: continuous features #######
    print('数据特征为连续属性,测试结果为')
    data = datasets.load_breast_cancer()
    X, Y = data.data, data.target
    m,n = X.shape
    # Draw 100 random (row, column) positions and blank them out. The draws
    # are unseeded and positions may repeat, so the number of NaN cells is
    # at most 100 and the printed accuracy varies from run to run.
    nanindexX = np.random.randint(0,m,100)
    nanindexY = np.random.randint(0,n,100)
    X[nanindexX,nanindexY] = np.nan
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = DecisionTreeClassifier(ispruning = True)
    tree = tree_clf.fit(X_train,Y_train)
    Y_pred = tree_clf.predict(X_test)
    # predict returns a list; comparing it with the ndarray is element-wise.
    print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
数据特征为连续属性,测试结果为
进入到后剪枝过程----------------------
acc:0.956140350877193