决策树实现(CART生成及剪枝)

这个代码可以直接运行:数据集为 breast_cancer.csv(见原文的 GitHub 链接,转载中链接已丢失),把代码和数据集放到同一目录下即可。

或者:百度云链接(原始链接在转载中丢失),提取码:eu8q

详细分析和完善下周日再写吧,感觉复杂度好高啊。。。。。。

import numpy as np
import copy as copy
class Node(object):
    """A node of a CART decision tree.

    Internal nodes carry the split (feature index, threshold); leaf nodes
    (both children ``None``) predict the majority class of the labels
    stored on them.

    Attributes:
        left, right: child nodes, or None for a leaf.
        parent: parent node (None for the root).
        items: ``[index_list, label_list]`` of the training samples
            routed to this node.
        feature: index of the feature this node splits on (internal nodes).
        feature_value: split threshold; samples with
            ``x[feature] <= feature_value`` go left.
    """

    def __init__(self):
        self.left = None
        self.right = None
        self.parent = None
        self.items = []
        self.feature = None
        self.feature_value = None

    @property
    def predict(self):
        """Majority label among this node's samples.

        Iterates labels in sorted order (``np.unique`` sorts), so ties
        break in favour of the smallest label, as in the original code.
        """
        labels = self.items[1]
        max_count = 0
        best = None
        for label in np.unique(labels):
            count = labels.count(label)
            if count > max_count:
                max_count = count
                best = label
        return best

    def __str__(self):
        if self.left is None and self.right is None:
            # Bug fix: report the number of samples at the leaf.  The
            # original used len(self.items), which is always 2 because
            # items is the pair [indices, labels].
            return "size:%d predict:%s" % (len(self.items[0]), str(self.predict))
        else:
            return "feature:%s feature_value:%s" % (self.feature, self.feature_value)

    def get_leafEntropy(self):
        """Gini impurity of this node's labels: 1 - sum_k p_k^2."""
        labels = self.items[1]
        n = len(labels)
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return 1 - sum((c / n) ** 2 for c in counts.values())

    def get_leaf_num(self):
        """Number of leaves in the subtree rooted at this node."""
        if self.left is not None and self.right is not None:
            return self.left.get_leaf_num() + self.right.get_leaf_num()
        else:
            return 1

class Dtree(object):
    """CART decision tree on the Gini index, with support for
    cost-complexity pruning (locating the node with minimal g(t)).

    Splits are binary on ``x[feature] <= feature_value``, i.e. features
    are treated as continuous; for discrete features the test in
    ``split`` would have to become an equality check.
    """

    def __init__(self):
        self.root = Node()

    def __str__(self):
        """Breadth-first dump, one ``level -> level+1: node`` line per node."""
        queue = [(self.root, -1)]
        lines = []
        while queue:
            node, prelevel = queue.pop(0)
            lines.append("%d -> %d: %s" % (prelevel, prelevel + 1, str(node)))
            if node.left:
                queue.append((node.left, prelevel + 1))
            if node.right:
                queue.append((node.right, prelevel + 1))
        return "\n".join(lines)

    def get_nodeEntropy(self, node):
        """Weighted Gini impurity of ``node``'s two children.

        NOTE(review): this approximates C(T_t) by the node's immediate
        children rather than all leaves of the subtree — confirm against
        the pruning reference before relying on it.
        """
        nl = len(node.left.items[0])
        nr = len(node.right.items[0])
        return (nl * node.left.get_leafEntropy() + nr * node.right.get_leafEntropy()) / (nl + nr)

    def split(self, feature, feature_value, idx, X):
        """Partition indices ``idx`` on ``X[i][feature] <= feature_value``.

        Returns ``[left_indices, right_indices]``.  For discrete features,
        replace the ``<=`` test with an equality test.
        """
        left, right = [], []
        for i in idx:
            (left if X[i][feature] <= feature_value else right).append(i)
        return [left, right]

    def get_G(self, idx, X, y):
        """Gini impurity of the label multiset {y[i] : i in idx}."""
        n = len(idx)
        counts = {}
        for i in idx:
            counts[y[i]] = counts.get(y[i], 0) + 1
        return 1 - sum((c / n) ** 2 for c in counts.values())

    def get_bestFeatureValue_forAFeature(self, X, y, idx, feature, best_feature, best_feature_value, minG):
        """Try every distinct value of ``feature`` as a threshold and keep
        the one whose weighted child Gini beats the running best ``minG``."""
        for feature_v in np.unique([X[i][feature] for i in idx]):
            div = self.split(feature, feature_v, idx, X)
            ll, lr = len(div[0]), len(div[1])
            if ll == 0 or lr == 0:
                # Bug fix: the largest candidate value sends everything
                # left, producing a "split" that makes no progress (and,
                # when chosen, an infinite growth loop).  Skip it.
                continue
            curG = (ll * self.get_G(div[0], X, y) + lr * self.get_G(div[1], X, y)) / (ll + lr)
            if curG < minG:
                minG = curG
                best_feature = feature
                best_feature_value = feature_v
        return best_feature, best_feature_value, minG

    def get_bestFeatureAndValue(self, X, y, idx):
        """Exhaustive search over (feature, threshold) pairs minimising the
        weighted Gini impurity of the resulting binary partition."""
        best_feature = 0
        # Fallback initial value; any genuine split has Gini < 1 and replaces it.
        best_feature_value = X[idx[0]][best_feature]
        minG = 1
        for feature in range(len(X[0])):
            best_feature, best_feature_value, minG = self.get_bestFeatureValue_forAFeature(
                X, y, idx, feature, best_feature, best_feature_value, minG)
        return best_feature, best_feature_value

    def create_Dtree(self, X, y):
        """Grow the tree breadth-first until every leaf is pure or unsplittable."""
        queue = [(self.root, range(len(X)))]
        while queue:
            node, idx = queue.pop(0)
            labels = [y[i] for i in idx]
            # Labels are stored on every node (not only leaves) so pruning
            # can compare the single-node-tree cost with the subtree cost.
            node.items = [copy.deepcopy(idx), labels]
            if len(np.unique(labels)) == 1:
                continue  # pure leaf
            best_feature, best_feature_value = self.get_bestFeatureAndValue(X, y, idx)
            div = self.split(best_feature, best_feature_value, idx, X)
            if not div[0] or not div[1]:
                # Bug fix: no threshold separates these samples (identical
                # feature rows with mixed labels).  Keep the node as a leaf
                # instead of re-enqueueing the same index set forever.
                continue

            print("bestFeature: %s, bestFeatureValue: %s" % (str(best_feature), str(best_feature_value)))

            node.feature = best_feature
            node.feature_value = best_feature_value
            # Both children are created together, so every internal node is
            # guaranteed to have exactly two children.
            node.left = Node()
            node.left.parent = node
            queue.append((node.left, div[0]))
            node.right = Node()
            node.right.parent = node
            queue.append((node.right, div[1]))

    def predict(self, xi):
        """Route sample ``xi`` from the root to a leaf and return the
        leaf's majority label."""
        node = self.root
        # Bug fix: use `and` — with `or`, a hypothetical single-child node
        # could step onto a None child and crash.
        while node.left is not None and node.right is not None:
            node = node.left if xi[node.feature] <= node.feature_value else node.right
        return node.predict

    def get_min_gt(self):
        """Return ``(node, g(t))`` for the internal node with the smallest
        g(t) = (C(t) - C(T_t)) / (|T_t| - 1) — the next candidate to
        collapse in cost-complexity pruning (cf. Li Hang, "Statistical
        Learning Methods", p. 86).  Returns (None, None) when no internal
        node exists.
        """
        minGt = None
        targetNode = None
        queue = [self.root]
        while queue:
            node = queue.pop(0)
            if node.left is None or node.right is None:
                # Bug fix: the original dereferenced node.left/.right
                # unconditionally and crashed on leaves.
                continue
            leafnum = node.get_leaf_num()  # >= 2 here, so no zero division
            Ct = node.get_leafEntropy()
            CTt = self.get_nodeEntropy(node)
            curGt = (Ct - CTt) / (leafnum - 1)
            # Bug fix: a None sentinel instead of `minGt == 0`, which kept
            # overwriting the target whenever g(t) values tied at zero.
            if minGt is None or curGt < minGt:
                minGt = curGt
                targetNode = node
            if node.left.left is not None and node.left.right is not None:
                queue.append(node.left)
            if node.right.left is not None and node.right.right is not None:
                queue.append(node.right)
        return targetNode, minGt

    def merge_subTree(self, node):
        """Collapse ``node``'s subtree into a leaf; the labels already
        stored on the node then drive its prediction."""
        node.left = None
        node.right = None


# 用统计学习方法第五章贷款申请样本测试一下:
# 注意这些特征都是离散的,所以要在split函数里把划分标准改成是否相等
# def test_create():
#     dt = Dtree()
#     X = [
#         ['young', 'no', 'no', 'normal'],
#         ['young', 'no', 'no', 'good'],
#         ['young', 'yes', 'no', 'good'],
#         ['young', 'yes', 'yes', 'normal'],
#         ['young', 'no', 'no', 'normal'],
#
#         ['midage', 'no', 'no', 'normal'],
#         ['midage', 'no', 'no', 'good'],
#         ['midage', 'yes', 'yes', 'good'],
#         ['midage', 'no', 'yes', 'verygood'],
#         ['midage', 'no', 'yes', 'verygood'],
#
#         ['old', 'no', 'yes', 'verygood'],
#         ['old', 'no', 'yes', 'good'],
#         ['old', 'yes', 'no', 'good'],
#         ['old', 'yes', 'no', 'verygood'],
#         ['old', 'no', 'no', 'normal'],
#     ]
#     y = ['no', 'no', 'yes', 'yes', 'no',
#          'no', 'no', 'yes', 'yes', 'yes',
#          'yes', 'yes', 'yes', 'yes', 'no']
#     dt.create_Dtree(X,y)
#     dt2 = copy.deepcopy(dt)
#     print("dt:\n",dt)
#     print("dt2:\n",dt2)
#     # print(dt.get_C_multiNode(dt.root))  #准确分类时,任意节点的基尼系数为零
#     dt2.root.left.feature = 100
#     print("dt:\n", dt)
#     print("dt2:\n", dt2)
#
#     print(dt.root.items)
#
#     # print(dt.get_min_gt())
# test_create()


# ---------------------------------------------------------------------------
# Driver: load breast_cancer.csv, grow a CART tree on a random 60% training
# split, prune it step by step while cross-validating on 20% of the data,
# then report the best pruned tree's accuracy on the remaining 20%.
# (The duplicate `import numpy as np` was removed; numpy is imported at the
# top of the file.)
# ---------------------------------------------------------------------------
data = np.loadtxt('breast_cancer.csv', delimiter=',', dtype=float)
n = data.shape[0]

from numpy.random import choice

X = data[:, :-1]
y = data[:, -1]

train_size = int(0.6 * n)   # training-set size
cv_size = int(0.2 * n)      # cross-validation-set size
test_size = int(0.2 * n)    # test-set size (informational; test takes the remainder)

# Random split.  Membership tests go through sets: `i in numpy_array` is a
# linear scan, which made the original index filtering quadratic in n.
train_rows = choice(range(n), size=train_size, replace=False)
train_set = {int(i) for i in train_rows}
X_train = [X[i] for i in train_rows]
y_train = [y[i] for i in train_rows]

remains = [i for i in range(n) if i not in train_set]
cv_rows = choice(remains, size=cv_size, replace=False)
cv_set = {int(i) for i in cv_rows}
X_cv = [X[i] for i in cv_rows]
y_cv = [y[i] for i in cv_rows]

X_test = [X[i] for i in remains if i not in cv_set]
y_test = [y[i] for i in remains if i not in cv_set]

t1 = Dtree()
t1.create_Dtree(X_train, y_train)

queue = [t1]
bestTree = t1       # tree with the best cross-validation accuracy so far
maxAcc = 0
alpha = []          # g(t) values generated along the pruning path

while queue:   # prune step by step and cross-validate each intermediate tree
    curTree = queue.pop(0)
    predictTrue = sum(1 for xi, yi in zip(X_cv, y_cv) if curTree.predict(xi) == yi)
    curAcc = predictTrue / len(y_cv)

    print("交叉验证准确率: ", curAcc)

    if curAcc > maxAcc:
        bestTree = curTree
        maxAcc = curAcc
    if curTree.root.left and curTree.root.right:
        nextTree = copy.deepcopy(curTree)
        # The tree with the highest cross-validation accuracy along the
        # pruning path is kept as the final model.
        bestNode, ai = nextTree.get_min_gt()

        print("每次生成的参数alpha: ", ai)

        if bestNode is not None:  # defensive: nothing prunable was found
            alpha.append(ai)
            nextTree.merge_subTree(bestNode)
            queue.append(nextTree)

predictTrue = sum(1 for xi, yi in zip(X_test, y_test) if bestTree.predict(xi) == yi)
acc = predictTrue / len(y_test)
print(acc)

  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值