本篇文章主要学习《机器学习实战》中利用 CART 决策树进行分类的方法:以基尼系数作为数据切分的依据,最后利用递归函数构建决策树。
测试数据
# Toy dataset from the article: 11 loan applicants.
# marry / house are categorical, income is continuous, loan is the target.
marry = ['single','single','married','single','married','divorced','married','divorced','single','married','single']
income = [125,125,100,70,120,95,60,220,85,75,90]
house = ['yes','yes','no','no','yes','no','no','yes','no','no','no']
loan = ['yes','no','no','no','no','yes','no','no','yes','no','yes']
# NOTE: np.array over mixed lists coerces every column to strings;
# the full listing below converts income back to float before fitting.
data = np.array([marry,income,house,loan]).T
1、计算基尼系数
公式:$Gini(D) = 1 - \sum_{k=1}^{K} p_k^2$,其中 $p_k$ 为第 $k$ 类样本在数据集 $D$ 中所占的比例。
@staticmethod
def calc_gini(arr):
    """Gini impurity of the labels stored in the last column of `arr`:
    1 - sum_k p_k^2 over the class proportions p_k."""
    # (fixed: the article snippet had its indentation stripped and was
    # not valid Python)
    y = arr[:, -1]
    num = len(y)
    gini = 1.
    for cnt in Counter(y).values():
        gini -= (cnt / num) ** 2
    return gini
2、切分数据,对分类变量分为等于和不等于两种,连续变量分为小于和大于或等于两种
@staticmethod
def split_data(data, feat, val, data_type='classifier'):
    """Split rows of `data` on column `feat` at `val`.

    'classifier' (categorical): left = rows == val, right = rows != val.
    otherwise   (continuous):   left = rows <  val, right = rows >= val.
    """
    # (fixed: the article snippet was collapsed onto a single line and was
    # not valid Python)
    if data_type == 'classifier':
        arr1 = data[np.nonzero(data[:, feat] == val)]
        arr2 = data[np.nonzero(data[:, feat] != val)]
    else:
        arr1 = data[np.nonzero(data[:, feat].astype(float) < val)]
        arr2 = data[np.nonzero(data[:, feat].astype(float) >= val)]
    return arr1, arr2
3、对于连续变量,先进行按顺序排序然后计算相邻两个数之间的平均数作为特征值
@staticmethod
def continuity_params_process(arr, feat):
    """Candidate split points for continuous column `feat`: the midpoints
    between consecutive sorted distinct values."""
    # (fixed: the article snippet was collapsed onto a single line and was
    # not valid Python)
    c = arr[:, feat].astype(float)
    c_sort = sorted(set(c))
    return [(c_sort[i] + c_sort[i + 1]) / 2 for i in range(len(c_sort) - 1)]
4、选择最好的切分点,按照切分后的数据的得到的基尼值最小来进行数据切分
# Choose the best split point: the one that minimises the weighted Gini
# impurity of the two children (i.e. maximises the impurity decrease).
def select_split(self, data):
    """Return (feat_idx, value, left, right) for the best admissible split,
    or None when every candidate violates the pre-pruning constraints."""
    # (fixed: the article snippet had its indentation stripped)
    min_gini = math.inf
    best_feat = best_val = left = right = None
    found = False
    base_gini = self.calc_gini(data)  # parent impurity, for the gain test
    for i in range(data.shape[1] - 1):  # last column holds the label
        if self.__columns[i] in self.__cat_var:
            candidates = set(data[:, i])
            data_type = 'classifier'
        else:
            candidates = self.continuity_params_process(data, i)
            data_type = 'continuity'
        for val in candidates:
            arr1, arr2 = self.split_data(data, i, val, data_type)
            # both children must keep at least min_samples_leaf rows
            if (len(arr1) < self.__min_samples_leaf) or (len(arr2) < self.__min_samples_leaf):
                continue
            g1 = self.calc_gini(arr1)
            g2 = self.calc_gini(arr2)
            g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2
            # BUG FIX: compare the impurity *decrease* with the threshold;
            # the original compared the raw child impurity, which skipped
            # the best splits whenever the threshold was non-zero.
            # Behaviour is identical at the default threshold of 0.
            if base_gini - g < self.__min_impurity_decrease:
                continue
            if g < min_gini:
                found = True
                min_gini = g
                best_feat, best_val = i, val
                left, right = arr1, arr2
    if not found:
        return None
    return best_feat, best_val, left, right
5、利用递归函数构建递归树。这里需要注意的是:如果样本的特征完全相同而标签不同,CART 则无法进一步切分数据。预测时,分类变量等于切分值则进入左子树,连续变量小于切分值则进入左子树,其余情况进入右子树。
6、本次实验设置了预剪枝:最大深度,最小叶子节点以及最小切分增益
完整代码:
import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.datasets import make_moons
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
class CARTClassifier:
def __init__(self,max_depth,min_samples_leaf=1,min_samples_split=2,min_impurity_decrease=0):
self.__max_depth = max_depth
self.__min_samples_leaf = min_samples_leaf # 最小叶子节点个数
self.__min_samples_split = max(min_samples_split,min_samples_leaf*2) # 用于分裂的最小节点个数
self.__min_impurity_decrease = min_impurity_decrease # 最小增益
self.__cont_var = None
self.__cat_var = None
self.__columns = None
self.__tree = None
@property
def max_depth(self):
return self.__max_depth
@max_depth.setter
def max_depth(self,value):
self.__max_depth = value
@property
def min_samples_leaf(self):
return self.__min_samples_leaf
@min_samples_leaf.setter
def min_samples_leaf(self,values):
self.__min_samples_leaf = values
@property
def min_samples_split(self):
return self.__min_samples_split
@min_samples_leaf.setter
def min_samples_leaf(self,value):
self.__min_samples_split = value
@property
def tree(self):
return self.__tree
@staticmethod
# 计算基尼指数
def calc_gini(arr):
y = arr[:,-1]
num = len(y)
gini = 1.
c = Counter(y)
for k in c:
gini -= (c[k] / num) ** 2
return gini
@staticmethod
# 切分数据
def split_data(data, feat, val, data_type='classifier'):
if data_type == 'classifier':
arr1 = data[np.nonzero(data[:, feat] == val)]
arr2 = data[np.nonzero(data[:, feat] != val)]
else:
arr1 = data[np.nonzero(data[:, feat].astype(float) < val)]
arr2 = data[np.nonzero(data[:, feat].astype(float) >= val)]
return arr1, arr2
@staticmethod
# 连续变量的切分点处理
def continuity_params_process(arr, feat):
c = arr[:, feat].astype(float)
c_sort = sorted(set(c))
new_c = []
for i in range(len(c_sort) - 1):
val = (c_sort[i] + c_sort[i + 1]) / 2
new_c.append(val)
return new_c
# 选择最好的切分点
# 满足基尼系数减少最快的方向
def select_split(self,data):
min_gini = math.inf
best_feat = None
best_val = None
left = None
right = None
flag = 0
for i in range(data.shape[1] - 1):
if self.__columns[i] in self.__cat_var:
c_set = set(data[:, i])
data_type = 'classifier'
else:
c_set = self.continuity_params_process(data, i)
data_type = 'continuity'
for val in c_set:
arr1, arr2 = self.split_data(data, i, val, data_type)
if (len(arr1) < self.__min_samples_leaf) or (len(arr2) < self.__min_samples_leaf):
continue
g1 = self.calc_gini(arr1)
g2 = self.calc_gini(arr2)
g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2
if g < self.__min_impurity_decrease:
continue
if min_gini > g:
flag = 1
min_gini = g
best_feat = i
best_val = val
left = arr1
right = arr2
if flag == 0:
return
return best_feat, best_val, left, right
def fit(self,X,y,cat_var = None):
self.__cat_var = cat_var if cat_var != None else []
if not isinstance(X,pd.DataFrame):
X = pd.DataFrame(X,columns=range(X.shape[1]))
self.__columns = X.columns
X['y'] = y
X = X.drop_duplicates()
self.__tree = self.create_tree(X.values)
print(self.__tree)
def create_tree(self,data,n=0):
# 构建递归树
tree = {}
if len(set(data[:, -1])) <= 1:
return data[:, -1][0]
# 如果数据的特征一模一样,则无法进一步切分
# 返回
dd = data[:,:-1].copy()
dd = dd.astype(np.dtype("<U21"))
dd = np.unique(dd,axis=0)
if len(dd) == 1:
return data[:,-1].mean()
rr = self.select_split(data)
if rr is None:
return tree
best_feat, best_val, left, right = rr
n += 1
if n >= self.__max_depth:
tree[(best_feat, best_val, 'left')] = round(left[:,-1].mean(),4)
tree[(best_feat, best_val, 'right')] = round(right[:,-1].mean(),4)
else:
if len(left) >= self.__min_samples_split:
tree[(best_feat, best_val, 'left')] = self.create_tree(left)
else:
tree[(best_feat, best_val, 'left')] = round(left[:,-1].mean(),4)
if len(right) >= self.__min_samples_split:
tree[(best_feat, best_val, 'right')] = self.create_tree(right)
else:
tree[(best_feat, best_val, 'right')] = round(right[:,-1].mean(),4)
return tree
def predict_prob(self,X):
tree = self.__tree
pred = []
xx = pd.DataFrame(X,columns=self.__columns)
for i in range(len(xx)):
pred.append(self.__predict(tree,xx.iloc[i:i+1]))
return np.array(pred)
def __predict(self,tree,X):
if type(tree) != dict:
return tree
for key in tree:
col = self.__columns[key[0]]
if col in self.__cat_var:
if X[col].iloc[0] == key[1]:
r = tree[(key[0], key[1], 'left')]
else:
r = tree[(key[0], key[1], 'right')]
else:
if X[col].iloc[0] < key[1]:
r = tree[(key[0], key[1], 'left')]
else:
r = tree[(key[0], key[1], 'right')]
return self.__predict(r, X)
def calc_ks(ytrue, ypred):
    """Kolmogorov-Smirnov statistic: the maximum |TPR - FPR| over all ROC
    thresholds of the score `ypred` against the labels `ytrue`."""
    fpr, tpr, _ = roc_curve(ytrue, ypred)
    return np.abs(tpr - fpr).max()
# Toy loan-default dataset (11 applicants): marry / house categorical,
# income continuous, loan is the binary target.
marry = ['single','single','married','single','married','divorced','married','divorced','single','married','single']
income = [125,125,100,70,120,95,60,220,85,75,90]
house = ['yes','yes','no','no','yes','no','no','yes','no','no','no']
loan = ['yes','no','no','no','no','yes','no','no','yes','no','yes']
# Stack into an (11, 4) matrix (everything becomes strings), then rebuild
# typed columns in a DataFrame.
data = np.array([marry, income, house, loan]).T
df = pd.DataFrame(data, columns=['marry', 'income', 'house', 'loan'])
df['loan'] = df['loan'].map({'yes': 1, 'no': 0})  # binary target
df['income'] = df['income'].astype(float)         # back to numeric
X = df.iloc[:, :-1]  # features: marry, income, house
y = df.iloc[:, -1]   # target: loan
if __name__ == '__main__':
    # Reference comparison against sklearn (kept from the article;
    # uncomment to run it side by side):
    # X, y = make_moons(n_samples=400,noise=0.2)
    # tree = DecisionTreeClassifier(max_depth=3,min_samples_leaf=2)
    # tree.fit(X,y)
    # pred = tree.predict_proba(X)[:,-1]
    # ks = calc_ks(y,pred)
    # print("sklearn:",ks)
    model = CARTClassifier(max_depth=3, min_samples_leaf=2)
    # set cat_var to None or [] when every feature is continuous
    model.fit(X, y, cat_var=['marry', 'house'])
    scores = model.predict_prob(X)
    print("mytree:", calc_ks(y, scores))