CART is a type of decision tree whose split features are chosen by the Gini index. Every split is binary: for each feature, all candidate binary partitions are evaluated and the one with the largest Gini gain is selected. Because a single binary split cannot exhaust all values of a discrete feature at once, the same feature may appear at several nodes, each time with a different split value. The decision tree built into Python (sklearn) is a binary CART tree, but it cannot handle string-valued discrete data directly; such features must first be encoded as numbers. This article classifies each feature by its number of distinct values, treating it as discrete (fewer than 10 values) or continuous (otherwise), and handles both kinds in one pass. Only pre-pruning is applied here; post-pruning (Part 2) and missing-value handling (Part 3) are described in the next article.
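For reference, the two quantities computed throughout the code are the node Gini index and the split gain, restated here from what `__Gini` and `__bestSplit` below actually compute:

$$
\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} p_k^2, \qquad
\Delta = \mathrm{Gini}(D) - \frac{|D_L|}{|D|}\,\mathrm{Gini}(D_L) - \frac{|D_R|}{|D|}\,\mathrm{Gini}(D_R)
$$

where $p_k$ is the fraction of samples of class $k$ in the node data $D$, and $D_L$, $D_R$ are the two children of a binary split. A split is kept only if $\Delta$ is at least `min_impurity_decrease` and beats the best gain found so far.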
## Note: iloc indexing on DataFrame-typed data is slow; try the apply function, or convert the data to ndarray format
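A quick illustration of this note (a minimal sketch; `df` here is a hypothetical DataFrame, not from the article):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100_000, 4), columns=list('abcd'))

# Slow: positional DataFrame indexing repeated inside a Python loop
col_slow = [df.iloc[i, 0] for i in range(len(df))]

# Fast: convert once to an ndarray, then index the array directly
arr = df.values          # or df.to_numpy()
col_fast = arr[:, 0]
```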
import numpy as np

class DecisionTreeClassifier():
    def __init__(self, max_depth: int = None, min_samples_split: int = 5,
                 min_samples_leaf: int = 5, min_impurity_decrease: float = 0.0):
        '''
        min_samples_split: minimum number of samples required to split an internal node
        min_samples_leaf: minimum number of samples required at a leaf; a split that
                          would leave a child with fewer samples than this is rejected
        max_depth: maximum depth of the tree
        min_impurity_decrease: minimum Gini gain required for a split
        '''
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_impurity_decrease = min_impurity_decrease
self.nodes = 0
self.tree = None
self.type_feature = None
    def __Gini(self, X, y):
        '''
        :param X: feature data
        :param y: target data
        :return: Gini index of this node's data
        '''
        ## Per the first formula above
        K = np.unique(y)
        Gini = 1 - np.sum([(len(X[y == k]) / len(X)) ** 2 for k in K])
        return Gini
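    # Worked example (illustrative): y = [0, 0, 1, 1] gives
    # Gini = 1 - (0.5**2 + 0.5**2) = 0.5, the maximum impurity for two classes,
    # while a pure node y = [1, 1, 1, 1] gives Gini = 1 - 1**2 = 0.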
    def __typeFeature(self, X):
        # Mark each feature as discrete or continuous:
        # fewer than 10 distinct values -> discrete (0), otherwise continuous (1)
        n_sample, n_feature = X.shape
        self.type_feature = []
        for f_idx in range(n_feature):
            if len(np.unique(X[:, f_idx])) < 10:
                self.type_feature.append(0)
            else:
                self.type_feature.append(1)
        return self.type_feature
    def __binSplitData(self, X, y, f_idx, f_val):
        '''
        Binary split of the data set.
        :param X: data to split (an ndarray; X.iloc[:, f_idx] on a DataFrame is too slow)
        :param f_idx: index of the splitting feature; np.unique(X[:, f_idx]) gives its
                      distinct values, e.g. array(['Overcast', 'Rain', 'Sunny'], dtype=object)
        :param f_val: one value of feature f_idx, e.g. 'Overcast' / 'Rain' / 'Sunny'
        (self.type_feature[f_idx] is 0 for a discrete feature, 1 for a continuous one)
        :return: the left and right data subsets after the binary split
        '''
        ## att holds the values of feature f_idx for all samples
        #################### 0: discrete split rule  1: continuous split rule ####################
        att = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            ## Discrete: samples equal to f_val go left, the rest go right
            X_left = X[att == f_val]
            X_right = X[att != f_val]
            y_left = y[att == f_val]
            y_right = y[att != f_val]
        else:
            ## Continuous: samples <= f_val go left, the rest go right
            X_left = X[att <= f_val]
            X_right = X[att > f_val]
            y_left = y[att <= f_val]
            y_right = y[att > f_val]
        return X_left, X_right, y_left, y_right
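    # Illustrative example: with a discrete column ['Sunny', 'Rain', 'Sunny'] and
    # f_val = 'Sunny', rows 0 and 2 go left and row 1 goes right; with a continuous
    # column [2.0, 5.0, 7.0] and f_val = 5.0, rows 0 and 1 (<= 5.0) go left.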
    def __bestSplit(self, X, y):
        '''
        Find the best split feature and split value.
        :param X: feature data
        :param y: target data
        :return: best_f_idx (best split feature, or None if no valid split), best_f_val
        '''
        best_gain = 0
        n_sample, n_feature = X.shape
        best_f_idx = None
        ## Default leaf value: the majority class of y
        ## (np.unique avoids the SciPy-version-dependent stats.mode API)
        vals, counts = np.unique(y, return_counts=True)
        best_f_val = vals[np.argmax(counts)]
        ## Stopping rule 1: too few samples to split, or the node is already pure
        if n_sample < self.min_samples_split or len(np.unique(y)) == 1:
            return best_f_idx, best_f_val
        Gini_before = self.__Gini(X, y)
        ##----------- for each feature, scan all binary splits for the best cut point -----------
for f_idx in range(n_feature):
            ##----------- fewer than 10 distinct values: discrete feature (type_feature == 0) -----------
            if self.type_feature[f_idx] == 0:
                for f_val in np.unique(X[:, f_idx]):
                    ## A feature with exactly two values yields the same partition either way,
                    ## so only one of the two candidate splits needs to be evaluated
                    if len(np.unique(X[:, f_idx])) == 2 and f_val == np.unique(X[:, f_idx])[0]:
                        continue
                    else:
                        X_left, X_right, y_left, y_right = self.__binSplitData(X, y, f_idx, f_val)
                        ## Stopping rule 2: reject splits that leave a child below min_samples_leaf
                        if len(X_left) < self.min_samples_leaf or len(X_right) < self.min_samples_leaf:
                            continue
                        ## Otherwise compute the weighted Gini after the split
                        Gini_after = (len(X_left) / len(X) * self.__Gini(X_left, y_left)
                                      + len(X_right) / len(X) * self.__Gini(X_right, y_right))
                        gain = Gini_before - Gini_after
                        ## Stopping rule 3: skip if the gain is below the threshold
                        ## or does not beat the current best
                        if gain < self.min_impurity_decrease or gain < best_gain:
                            continue
                        else:
                            ## Update the best gain and best split position
                            best_gain = gain
                            best_f_idx, best_f_val = f_idx, f_val
            ##----------- continuous feature (type_feature == 1): split on att <= f_val -----------
            else:
                for f_val in np.unique(X[:, f_idx]):
                    X_left, X_right, y_left, y_right = self.__binSplitData(X, y, f_idx, f_val)
                    ## Stopping rule 2: reject splits that leave a child below min_samples_leaf
                    if len(X_left) < self.min_samples_leaf or len(X_right) < self.min_samples_leaf:
                        continue
                    ## Otherwise compute the weighted Gini after the split
                    Gini_after = (len(X_left) / len(X) * self.__Gini(X_left, y_left)
                                  + len(X_right) / len(X) * self.__Gini(X_right, y_right))
                    gain = Gini_before - Gini_after
                    ## Stopping rule 3: skip if the gain is below the threshold
                    ## or does not beat the current best
                    if gain < self.min_impurity_decrease or gain < best_gain:
                        continue
                    else:
                        ## Update the best gain and best split position
                        best_gain = gain
                        best_f_idx, best_f_val = f_idx, f_val
return best_f_idx,best_f_val
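    # Return contract (used by __CART below): (f_idx, split_value) for the best valid
    # split, or (None, majority_class) when every candidate fails the stopping rules,
    # in which case the caller turns the current node into a leaf.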
    def __CART(self, X, y):
        '''
        Build the CART tree recursively.
        :param X: feature data
        :param y: target data
        :return: CART tree (a nested dict) or a leaf label
        '''
        best_f_idx, best_f_val = self.__bestSplit(X, y)
        self.nodes += 1
        # best_f_idx is None means no further split is possible:
        # return the leaf value best_f_val (the majority class)
        if best_f_idx is None:
            return best_f_val
        # If the node count exceeds the limit implied by max_depth (node count is used
        # here as a rough proxy for depth), also return a leaf whose value is the
        # majority class of the current data
        if self.max_depth:
            if self.nodes >= 2 ** self.max_depth:
                vals, counts = np.unique(y, return_counts=True)
                return vals[np.argmax(counts)]
tree = dict()
tree['cut_f'] = best_f_idx
tree['cut_val'] = best_f_val
X_left, X_right, y_left, y_right = self.__binSplitData(X,y,best_f_idx,best_f_val)
tree['left'] = self.__CART(X_left,y_left)
tree['right'] = self.__CART(X_right,y_right)
return tree
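    # The fitted tree is a nested dict; an illustrative shape (hypothetical values):
    # {'cut_f': 0, 'cut_val': 'Sunny', 'left': 1, 'right': {'cut_f': 3, ...}}
    # Internal nodes are dicts; leaves are plain class labels (see __predict_one).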
    def fit(self, X, y, sample_weight=None):
        '''
        Fit the model. X should be an ndarray (convert a DataFrame via df.values,
        which raises no errors) and y an ndarray or Series.
        :param X: feature data
        :param y: target data
        :param sample_weight: accepted for API compatibility; not yet used in splitting
        :return: None
        '''
        if sample_weight is None:
            ## Give every sample the same weight 1/len(X)
            sample_weight = np.array([1 / len(X)] * len(X))
        # Mark each feature as discrete or continuous so the right split rule is used
        self.type_feature = self.__typeFeature(X)
self.tree = self.__CART(X,y)
    def predict(self, X_test):
        '''
        Predict class labels.
        :param X_test: data to predict
        :return: y_: array of predicted class labels
        '''
return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])
def __predict_one(self,x_test,tree):
        if isinstance(tree, dict):  # only internal nodes need a left/right decision
cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
if self.type_feature[cut_f_idx] == 0:
sub_tree = tree['left'] if x_test[cut_f_idx] == cut_val else tree['right']
else:
sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_val else tree['right']
return self.__predict_one(x_test, sub_tree)
else:
return tree
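As a quick sanity check of the class above, a minimal sketch on synthetic, perfectly separable data (the values are illustrative, not from the article's datasets):

```python
import numpy as np

# 20 distinct values, so __typeFeature treats the single column as continuous
X_toy = np.arange(20, dtype=float).reshape(-1, 1)
y_toy = (X_toy[:, 0] >= 10).astype(int)   # classes separate cleanly between 9 and 10

clf = DecisionTreeClassifier()
clf.fit(X_toy, y_toy)
print(clf.tree)    # expected: {'cut_f': 0, 'cut_val': 9.0, 'left': 0, 'right': 1}
print(clf.predict(np.array([[1.5], [12.0]])))   # expected: [0 1]
```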
To make the results comparable, the discrete data is label-encoded below. The encoding is optional for this implementation, but without it there is no way to compare against Python's built-in (sklearn) model.
if __name__ == '__main__':
    from sklearn import datasets
    import pandas as pd
    from sklearn.model_selection import train_test_split
    ############ data 1: continuous features #######
    print('Continuous features, test results:')
data = datasets.load_breast_cancer()
X, Y = data.data, data.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, Y_train)
Y_pred = tree_clf.predict(X_test)
del tree_clf
print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
    # Alias the sklearn class so it does not shadow our DecisionTreeClassifier;
    # without the alias, the second test below would silently use sklearn's tree
    from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTree
    tree_clf = SklearnDecisionTree()
tree_clf.fit(X_train, Y_train)
Y_pred = tree_clf.predict(X_test)
print('sklearn acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
    #################### data 2: discrete features #################
    del tree_clf
    print('Discrete features, test results:')
data = pd.read_csv('./DecisionTreeData.csv',index_col = 'Day')
X = data.iloc[:,:-1]
Y = data.iloc[:,-1].map({'No': 0,'Yes' : 1})
del data
label = {'Sunny':0,'Overcast':1,'Rain':2,'Hot':0,'Mild':1,'Cool':2,'High':0,'Normal':1,'Weak':0,'Strong':1}
X['Outlook']=X['Outlook'].map(label)
X['Temp.']=X['Temp.'].map(label)
X['Humidtity']=X['Humidtity'].map(label)
X['Wind']=X['Wind'].map(label)
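    # Alternative (yields different integer codes): sklearn's LabelEncoder encodes
    # each column alphabetically, e.g. X[col] = LabelEncoder().fit_transform(X[col])
    # with from sklearn.preprocessing import LabelEncoder.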
    X = X.values  ## convert the DataFrame to an ndarray
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, Y_train)
Y_pred = tree_clf.predict(X_test)
del tree_clf
print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
    tree_clf = SklearnDecisionTree()
tree_clf.fit(X_train, Y_train)
Y_pred = tree_clf.predict(X_test)
print('sklearn acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
Continuous features, test results:
acc:0.9736842105263158
sklearn acc:0.9649122807017544
Discrete features, test results:
acc:0.3333333333333333
sklearn acc:0.3333333333333333
The poor result on the discrete data is because the training and test sets are far too small; adding more data fixes it. The dataset is shown in the figure below.