CART Decision Tree Classification

This post works through the CART classification tree from "Machine Learning in Action" (《机器学习实战》): the Gini index is used as the criterion for splitting the data, and the tree itself is built with a recursive function.

Test data

import numpy as np

marry = ['single','single','married','single','married','divorced','married','divorced','single','married','single']
income = [125,125,100,70,120,95,60,220,85,75,90]
house = ['yes','yes','no','no','yes','no','no','yes','no','no','no']
loan = ['yes','no','no','no','no','yes','no','no','yes','no','yes']
data = np.array([marry,income,house,loan]).T
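Because np.array has to pick a single dtype for the mixed columns, every cell in data becomes a string, and the income column must be cast back to float before any numeric comparison (the code below does this with astype(float)). A quick sanity check (the exact string width numpy picks is an implementation detail):

print(data.shape)   # (11, 4)
print(data.dtype)   # a string dtype such as <U21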

1. Computing the Gini index

Formula:

$Gini(D) = 1 - \sum_{k=1}^{K} p_k^2$

where $p_k$ is the proportion of samples in $D$ that belong to class $k$, and $K$ is the number of classes.

@staticmethod
# compute the Gini index; the labels sit in the last column
def calc_gini(arr):
    y = arr[:,-1]
    num = len(y)
    gini = 1.
    c = Counter(y)   # class -> count
    for k in c:
        gini -= (c[k] / num) ** 2
    return gini
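As a quick check (assuming calc_gini is attached to the CARTClassifier class given in the full code below): the loan column holds 4 'yes' and 7 'no' labels, so the Gini index of the whole data set is 1 - (4/11)^2 - (7/11)^2 = 56/121 ≈ 0.4628.

print(CARTClassifier.calc_gini(data))   # ≈ 0.4628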

 

2. Splitting the data: a categorical variable splits into equal / not-equal branches, a continuous variable into less-than / greater-or-equal branches

@staticmethod
# split the data on one feature: equal / not-equal branches for a
# categorical variable, less-than / greater-or-equal for a continuous one
def split_data(data, feat, val, data_type='classifier'):
    if data_type == 'classifier':
        arr1 = data[np.nonzero(data[:, feat] == val)]
        arr2 = data[np.nonzero(data[:, feat] != val)]
    else:
        arr1 = data[np.nonzero(data[:, feat].astype(float) < val)]
        arr2 = data[np.nonzero(data[:, feat].astype(float) >= val)]
    return arr1, arr2
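For example, splitting the sample data on the house column (feature index 2) should put the four 'yes' rows in the first array and the remaining seven in the second:

left, right = CARTClassifier.split_data(data, 2, 'yes')
print(len(left), len(right))   # 4 7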

 

3. For a continuous variable, sort the unique values and take the average of each pair of adjacent values as a candidate split point

@staticmethod
# candidate split points for a continuous variable:
# the midpoints between adjacent sorted unique values
def continuity_params_process(arr, feat):
    c = arr[:, feat].astype(float)
    c_sort = sorted(set(c))
    new_c = []
    for i in range(len(c_sort) - 1):
        val = (c_sort[i] + c_sort[i + 1]) / 2
        new_c.append(val)
    return new_c
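On the income column (feature index 1) the ten distinct values 60, 70, 75, 85, 90, 95, 100, 120, 125, 220 give nine candidate split points:

print(CARTClassifier.continuity_params_process(data, 1))
# [65.0, 72.5, 80.0, 87.5, 92.5, 97.5, 110.0, 122.5, 172.5]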

 

4. Choosing the best split point: among all candidate splits, pick the one whose weighted Gini index after the split is smallest

# choose the best split point, i.e. the split that
# decreases the Gini index the most
def select_split(self,data):
    base_gini = self.calc_gini(data)   # impurity before the split
    min_gini = math.inf
    best_feat = None
    best_val = None
    left = None
    right = None
    flag = 0
    for i in range(data.shape[1] - 1):
        if self.__columns[i] in self.__cat_var:
            c_set = set(data[:, i])
            data_type = 'classifier'
        else:
            c_set = self.continuity_params_process(data, i)
            data_type = 'continuity'
        for val in c_set:
            arr1, arr2 = self.split_data(data, i, val, data_type)
            if (len(arr1) < self.__min_samples_leaf) or (len(arr2) < self.__min_samples_leaf):
                continue
            g1 = self.calc_gini(arr1)
            g2 = self.calc_gini(arr2)
            g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2
            if base_gini - g < self.__min_impurity_decrease:   # gain too small: skip this split
                continue
            if min_gini > g:
                flag = 1
                min_gini = g
                best_feat = i
                best_val = val
                left = arr1
                right = arr2
    if flag == 0:
        return
    return best_feat, best_val, left, right
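Working the sample data through by hand (with min_samples_leaf=2), the winning first split should be marry == 'married': all four married samples have loan = no (Gini 0), while the other seven samples hold 4 yes / 3 no (Gini 24/49 ≈ 0.490), for a weighted Gini of 7/11 × 0.490 ≈ 0.312, lower than any split on house or income.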

5. Build the tree with a recursive function. One thing to watch out for: if samples have identical features but different labels, CART cannot split the data any further, so such a node becomes a leaf holding the mean label. For a categorical variable, samples equal to the split value go to the left subtree; for a continuous variable, samples less than the split value go left; everything else goes to the right subtree. The fitted tree is stored as a nested dict, sketched below.
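An illustrative (not actual output) sketch of that nested dict, keyed by (feature index, split value, branch):

{(0, 'married', 'left'): 0.0,        # leaf: mean label on that branch
 (0, 'married', 'right'): { ... }}   # internal node: another nested dict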

 

6. This experiment uses three pre-pruning controls: maximum depth, minimum leaf size, and minimum split gain, all set through the constructor as shown below.
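For instance (the min_impurity_decrease value here is purely illustrative):

clf = CARTClassifier(max_depth=3, min_samples_leaf=2, min_impurity_decrease=0.01)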

Full code:

import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.datasets import make_moons
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier


class CARTClassifier:
    def __init__(self,max_depth,min_samples_leaf=1,min_samples_split=2,min_impurity_decrease=0):
        self.__max_depth = max_depth
        self.__min_samples_leaf = min_samples_leaf # minimum number of samples in a leaf
        self.__min_samples_split = max(min_samples_split,min_samples_leaf*2) # minimum number of samples a node needs before it may be split
        self.__min_impurity_decrease = min_impurity_decrease # minimum gain required for a split
        self.__cont_var = None
        self.__cat_var = None
        self.__columns = None
        self.__tree = None

    @property
    def max_depth(self):
        return self.__max_depth

    @max_depth.setter
    def max_depth(self,value):
        self.__max_depth = value

    @property
    def min_samples_leaf(self):
        return self.__min_samples_leaf

    @min_samples_leaf.setter
    def min_samples_leaf(self,values):
        self.__min_samples_leaf = values

    @property
    def min_samples_split(self):
        return self.__min_samples_split

    @min_samples_split.setter
    def min_samples_split(self,value):
        self.__min_samples_split = value

    @property
    def tree(self):
        return self.__tree

    @staticmethod
    # compute the Gini index; the labels sit in the last column
    def calc_gini(arr):
        y = arr[:,-1]
        num = len(y)
        gini = 1.
        c = Counter(y)   # class -> count
        for k in c:
            gini -= (c[k] / num) ** 2
        return gini

    @staticmethod
    # split the data on one feature: equal / not-equal branches for a
    # categorical variable, less-than / greater-or-equal for a continuous one
    def split_data(data, feat, val, data_type='classifier'):
        if data_type == 'classifier':
            arr1 = data[np.nonzero(data[:, feat] == val)]
            arr2 = data[np.nonzero(data[:, feat] != val)]
        else:
            arr1 = data[np.nonzero(data[:, feat].astype(float) < val)]
            arr2 = data[np.nonzero(data[:, feat].astype(float) >= val)]
        return arr1, arr2

    @staticmethod
    # candidate split points for a continuous variable:
    # the midpoints between adjacent sorted unique values
    def continuity_params_process(arr, feat):
        c = arr[:, feat].astype(float)
        c_sort = sorted(set(c))
        new_c = []
        for i in range(len(c_sort) - 1):
            val = (c_sort[i] + c_sort[i + 1]) / 2
            new_c.append(val)
        return new_c

    # choose the best split point, i.e. the split that
    # decreases the Gini index the most
    def select_split(self,data):
        base_gini = self.calc_gini(data)   # impurity before the split
        min_gini = math.inf
        best_feat = None
        best_val = None
        left = None
        right = None
        flag = 0
        for i in range(data.shape[1] - 1):
            if self.__columns[i] in self.__cat_var:
                c_set = set(data[:, i])
                data_type = 'classifier'
            else:
                c_set = self.continuity_params_process(data, i)
                data_type = 'continuity'
            for val in c_set:
                arr1, arr2 = self.split_data(data, i, val, data_type)
                if (len(arr1) < self.__min_samples_leaf) or (len(arr2) < self.__min_samples_leaf):
                    continue
                g1 = self.calc_gini(arr1)
                g2 = self.calc_gini(arr2)
                g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2
                if base_gini - g < self.__min_impurity_decrease:   # gain too small: skip this split
                    continue
                if min_gini > g:
                    flag = 1
                    min_gini = g
                    best_feat = i
                    best_val = val
                    left = arr1
                    right = arr2
        if flag == 0:
            return
        return best_feat, best_val, left, right

    def fit(self,X,y,cat_var = None):
        self.__cat_var = cat_var if cat_var is not None else []
        if not isinstance(X,pd.DataFrame):
            X = pd.DataFrame(X,columns=range(X.shape[1]))
        else:
            X = X.copy()   # avoid mutating the caller's DataFrame
        self.__columns = X.columns
        X['y'] = y
        X = X.drop_duplicates()   # identical rows (features + label) add no information
        self.__tree = self.create_tree(X.values)
        print(self.__tree)

    def create_tree(self,data,n=0):
        # build the tree recursively; n tracks the current depth
        tree = {}
        if len(set(data[:, -1])) <= 1:   # pure node: only one label left
            return data[:, -1][0]
        # if every sample has exactly the same features (but different labels),
        # the data cannot be split any further: return the mean label
        dd = data[:,:-1].copy()
        dd = dd.astype(np.dtype("<U21"))
        dd = np.unique(dd,axis=0)
        if len(dd) == 1:
            return data[:,-1].mean()
        rr = self.select_split(data)
        if rr is None:   # no valid split found: make this node a leaf
            return round(data[:,-1].mean(),4)
        best_feat, best_val, left, right = rr
        n += 1
        if n >= self.__max_depth:
            tree[(best_feat, best_val, 'left')] = round(left[:,-1].mean(),4)
            tree[(best_feat, best_val, 'right')] = round(right[:,-1].mean(),4)
        else:
            if len(left) >= self.__min_samples_split:
                tree[(best_feat, best_val, 'left')] = self.create_tree(left, n)   # pass the depth down
            else:
                tree[(best_feat, best_val, 'left')] = round(left[:,-1].mean(),4)
            if len(right) >= self.__min_samples_split:
                tree[(best_feat, best_val, 'right')] = self.create_tree(right, n)
            else:
                tree[(best_feat, best_val, 'right')] = round(right[:,-1].mean(),4)
        return tree

    def predict_prob(self,X):
        tree = self.__tree
        pred = []
        xx = pd.DataFrame(X,columns=self.__columns)
        for i in range(len(xx)):
            pred.append(self.__predict(tree,xx.iloc[i:i+1]))
        return np.array(pred)

    def __predict(self,tree,X):
        if type(tree) != dict:   # reached a leaf: return the stored mean label
            return tree
        # both keys of the dict share the same (feature, value) pair,
        # so routing on the first key we see is enough
        for key in tree:
            col = self.__columns[key[0]]
            if col in self.__cat_var:
                if X[col].iloc[0] == key[1]:
                    r = tree[(key[0], key[1], 'left')]
                else:
                    r = tree[(key[0], key[1], 'right')]
            else:
                if X[col].iloc[0] < key[1]:
                    r = tree[(key[0], key[1], 'left')]
                else:
                    r = tree[(key[0], key[1], 'right')]
            return self.__predict(r, X)


def calc_ks(ytrue,ypred):
    # KS statistic: the largest gap between TPR and FPR along the ROC curve
    fpr, tpr, thresholds = roc_curve(ytrue, ypred)
    ks = max(np.abs(tpr - fpr))
    return ks


marry = ['single','single','married','single','married','divorced','married','divorced','single','married','single']
income = [125,125,100,70,120,95,60,220,85,75,90]
house = ['yes','yes','no','no','yes','no','no','yes','no','no','no']
loan = ['yes','no','no','no','no','yes','no','no','yes','no','yes']
data = np.array([marry,income,house,loan]).T
df = pd.DataFrame(data,columns=['marry','income','house','loan'])
df['loan'] = df['loan'].apply(lambda x:1 if x == 'yes' else 0)
df['income'] = df['income'].astype(float)

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

if __name__ == '__main__':
    # X, y = make_moons(n_samples=400,noise=0.2)
    # tree = DecisionTreeClassifier(max_depth=3,min_samples_leaf=2)
    # tree.fit(X,y)
    # pred = tree.predict_proba(X)[:,-1]
    # ks = calc_ks(y,pred)
    # print("sklearn:",ks)

    mytree = CARTClassifier(max_depth=3,min_samples_leaf=2)
    mytree.fit(X,y,cat_var=['marry','house']) # set cat_var to None or [] if every feature is continuous
    r_pred = mytree.predict_prob(X)

    ks = calc_ks(y,r_pred)
    print("mytree:",ks)