Decision Tree (ID3, C4.5, CART)

First, a look at the differences and connections between the ID3 and C4.5 splitting criteria:

ID3 (information gain): biased toward attributes with many possible values.
C4.5 (gain ratio): biased toward attributes with few possible values.
In practice a compromise heuristic is used: first keep the attributes whose information gain is above the average (the ID3 side), then among those pick the one with the highest gain ratio (the C4.5 side) as the splitting criterion; a sketch follows the C4.5 code below.

CART: uses the Gini index, Gini(D) = 1 - sum_k p_k^2, and picks the attribute that minimizes the weighted Gini index after the split.

Below is an implementation of the entropy calculation used by ID3:

from math import log

# Shannon entropy of a dataset; the class label is the last element of each row.
def ID3Ent(data):
    m = len(data)
    labelCount = {}
    for vect in data:
        currentLabel = vect[-1]
        labelCount[currentLabel] = labelCount.get(currentLabel, 0) + 1
    ent = 0.0
    for key in labelCount:
        p = labelCount[key] / m
        ent -= p * log(p, 2)
    return ent

# Return the rows whose feature at `axis` equals `value`, with that column removed.
def splitDataSet(data, axis, value):
    reductData = []
    for feat in data:
        if feat[axis] == value:
            reductFeat = feat[:axis]
            reductFeat.extend(feat[axis + 1:])
            reductData.append(reductFeat)
    return reductData

# Information gain of splitting on `axis`: base entropy minus the
# weighted entropy of the resulting subsets.
def cla_info_gain(data, axis):
    base_Ent = ID3Ent(data)
    new_Ent = 0.0
    feat = set([example[axis] for example in data])
    for value in feat:
        reductData = splitDataSet(data, axis, value)
        p = len(reductData) / len(data)
        new_Ent += p * ID3Ent(reductData)
    return base_Ent - new_Ent
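
A quick sanity check on a small toy dataset (the data below is illustrative, not part of the original post):

# toy dataset: two binary features, last column is the class label
myDat = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
         [0, 1, 'no'], [0, 1, 'no']]
print(ID3Ent(myDat))            # 0.971
print(cla_info_gain(myDat, 0))  # 0.420 -- feature 0 wins under ID3
print(cla_info_gain(myDat, 1))  # 0.171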

Below is the code for the C4.5 gain ratio:

# Gain ratio (C4.5): information gain divided by the intrinsic value (IV)
# of the attribute, which penalizes attributes with many values.
def Gain_ratio(data, axis):
    feat = set([example[axis] for example in data])
    info_Gain = cla_info_gain(data, axis)
    IV = 0.0
    for value in feat:
        reductData = splitDataSet(data, axis, value)
        p = len(reductData) / len(data)
        IV += -p * log(p, 2)
    if IV == 0.0:  # attribute takes a single value: no split, no gain
        return 0.0
    return info_Gain / IV
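
A minimal sketch of the compromise heuristic mentioned at the top (the name chooseBestFeature and this exact formulation are my own, built on the helpers above):

# Keep attributes with above-average information gain (the ID3 side),
# then pick the highest gain ratio among them (the C4.5 side).
def chooseBestFeature(data):
    n_feat = len(data[0]) - 1
    gains = [cla_info_gain(data, axis) for axis in range(n_feat)]
    avg_gain = sum(gains) / n_feat
    candidates = [axis for axis in range(n_feat) if gains[axis] >= avg_gain]
    return max(candidates, key=lambda axis: Gain_ratio(data, axis))

print(chooseBestFeature(myDat))  # 0 -- feature 0 is selected on the toy data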

Below is the CART (Gini index) part:

#################  CART  ###################################
# Gini impurity of a dataset: 1 - sum of squared class proportions.
def Gini(data):
    labelCount = {}
    for vec in data:
        label = vec[-1]
        labelCount[label] = labelCount.get(label, 0) + 1
    Gini_value = 1.0
    for key in labelCount:
        p = labelCount[key] / len(data)
        Gini_value -= p ** 2
    return Gini_value

# Gini index of splitting on `axis`: the weighted Gini impurity of the subsets.
def Gini_index(data, axis):
    feat = set([example[axis] for example in data])
    Gini_index_value = 0.0
    for value in feat:
        reducData = splitDataSet(data, axis, value)
        p = len(reducData) / len(data)
        Gini_index_value += p * Gini(reducData)
    return Gini_index_value
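
CART picks the attribute with the smallest post-split Gini index; on the same toy data from above:

print(Gini(myDat))           # 0.48
print(Gini_index(myDat, 0))  # 0.267 -- feature 0 wins under CART as well
print(Gini_index(myDat, 1))  # 0.400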

Below is the hands-on part: building a CART regression tree on continuous data.

# -*- coding: utf-8 -*-

import numpy as np
     

def loadData(file):
    data = []
    with open(file) as fr:
        for line in fr:
            lineArr = [float(i) for i in line.strip().split('\t')]
            data.append(lineArr)
    return data

# test
#import pandas as pd
#file = r"C:\Users\Administrator\Desktop\python\data\ex0.txt"
#data = loadData(file)
#pd.DataFrame(data).plot(x = 0, y= 1, kind = 'scatter')

# Three arguments: <dataset, feature to split on, a value of that feature>
def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[: , feature] <= value)[0], :]
    return mat0, mat1

# Four arguments: <dataset>, plus three optional ones <leafType: function that
# builds leaf nodes; errType: error-calculation function; ops: tuple of extra
# parameters for tree construction>
# chooseBestSplit() -- the split-finding function
'''Pseudocode:
Find the best feature to split on:
    If the node cannot be split further, store it as a leaf node
    Otherwise perform the binary split
    Call creatTree() on the right subtree
    Call creatTree() on the left subtree
'''
# Generate a leaf node: the mean of the target values
def regLeaf(dataSet):
    return np.mean(dataSet[:,-1])

# Total squared error of the target: variance times the number of samples
def regErr(dataSet):
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]
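
For instance, following the commented-test style used in this post (illustrative data):
#tiny = np.array([[0.0, 1.0], [0.0, 2.0], [0.0, 3.0]])
#regLeaf(tiny)  # 2.0 -- mean of the last column
#regErr(tiny)   # 2.0 -- variance (2/3) times 3 samples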

'''
chooseBestSplit(): find the best position to split the dataset, or generate a
leaf node. It scans every feature and every possible value to find the split
threshold that minimizes the error. Pseudocode:
For every feature:
    For every feature value:
        Split the dataset in two
        Compute the error of the split
        If the error is lower than the current minimum, take this split as the
        best one and update the minimum error
Return the feature and threshold of the best split
'''
def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    tolS = ops[0] # minimum error reduction required to accept a split
    tolN = ops[1] # minimum number of samples in each split
    # if all target values are equal, quit and return a leaf
    if len(set(dataSet[:, -1])) == 1:
        return None, leafType(dataSet)
    m, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = float('inf') # 'inf' stands for infinity
    bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set(dataSet[:, featIndex]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # skip splits that leave too few samples on either side
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # if the error reduction is too small, quit and return a leaf
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # if either resulting split is too small, quit and return a leaf
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
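
A quick illustration (toy data of my own): on a piecewise-constant dataset the function should pick the threshold between the two plateaus.
#toy = np.array([[0.1, 1.0], [0.2, 1.1], [0.3, 0.9], [0.4, 1.0],
#                [0.6, 5.0], [0.7, 5.1], [0.8, 4.9], [0.9, 5.0]])
#chooseBestSplit(toy)  # (0, 0.4): split feature 0 at the jump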


# Recursively build the tree; when chooseBestSplit cannot find a worthwhile
# split it returns a leaf value, which ends the recursion.
def creatTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None: return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = creatTree(lSet, leafType, errType, ops)
    retTree['right'] = creatTree(rSet, leafType, errType, ops)
    return retTree


# test
#testMat = np.mat(np.eye(4))
#mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)

if __name__ == '__main__':
    import matplotlib.pyplot as plt
    file = r"C:\Users\Administrator\Desktop\python\data\ex0.txt"
    dataSet = loadData(file)
    dataSet = np.array(dataSet)
    plt.scatter(dataSet[:, 0], dataSet[:, 1])
    re = creatTree(dataSet)


'''
Postpruning: split the test data using the already-built tree:
    If either subset is itself a tree, recurse into it and prune
    Compute the error after merging the two leaf nodes
    Compute the error without merging
    If merging lowers the error, merge the leaf nodes
'''
# A subtree is stored as a dict; a leaf is a plain value.
def isTree(obj):
    return isinstance(obj, dict)

# Recursively collapse a tree: when both children are leaves, replace them with their mean
def getMean(tree):
    if isTree(tree['right']): 
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): 
        tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right'])/2.0
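
For instance, on a hand-built stub (illustrative values):
#tree = {'spInd': 0, 'spVal': 0.5, 'left': 1.0, 'right': 3.0}
#getMean(tree)  # 2.0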

def prune(tree, testData):
    '''
    tree: the tree to prune
    testData: test data used for pruning
    '''
    # if the test data is empty, collapse the tree to the mean of its leaves
    if np.shape(testData)[0] == 0:
        return getMean(tree)
    if (isTree(tree['right']) or isTree(tree['left'])):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']) :
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']) :
        tree['right']  = prune(tree['right'] , rSet)
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = np.sum(np.power(lSet[:, -1] - tree['left'] , 2)) +\
                        np.sum(np.power(rSet[:, -1] - tree['right'] , 2))
        treeMean = (tree['left'] + tree['right'])/2.0
        errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        else:
            return tree
    else: 
        return tree

# test
if __name__ == '__main__':    
    ex2 = r'C:\Users\Administrator\Desktop\python\data\ex2.txt'
    myDat2 = loadData(ex2)
    myDat2 = np.array(myDat2)
    plt.scatter(myDat2[:, 0], myDat2[:, 1])
    myTree = creatTree(myDat2, ops= (0, 1))
    
    ex2test = r'C:\Users\Administrator\Desktop\python\data\ex2test.txt'
    myDataTest = loadData(ex2test)
    myDataTest = np.array(myDataTest)
    plt.scatter(myDataTest[:, 0], myDataTest[:, 1])
    testData = myDataTest.copy()
    prune(myTree, myDataTest)

# Format the dataset into the target variable Y and independent variables X
def linearSolve(dataSet):
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n)))   # first column stays 1 for the intercept
    X[:, 1:n] = dataSet[:, 0: n-1]
    Y = np.mat(dataSet[:, -1]).reshape(m, 1)
    xTx = X.T * X
    # if the determinant is 0, the matrix is singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
                         try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)   # normal equations: ws = (X^T X)^-1 X^T Y
    return ws, X, Y
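
A quick sanity check against numpy's built-in least squares (illustrative data; both solve the same normal equations):
#demo = np.array([[1.0, 3.1], [2.0, 5.05], [3.0, 6.9], [4.0, 9.0]])
#ws, X, Y = linearSolve(demo)          # ws ~ [[1.13], [1.96]] (intercept, slope)
#np.linalg.lstsq(X, Y, rcond=None)[0]  # should match ws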

# Generate a model-tree leaf: the linear-regression coefficients for this subset
def modelLeaf(dataSet):
    ws, X, Y = linearSolve(dataSet)
    return ws

# Compute the squared error of the linear model on the given dataset
def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X*ws
    return np.sum(np.power(Y - yHat, 2))

# test: build a model tree whose leaves are linear models
if __name__ == '__main__':
    ex2 = r'C:\Users\Administrator\Desktop\python\data\ex2.txt'
    myDat2 = loadData(ex2)
    myDat2 = np.array(myDat2)
    plt.scatter(myDat2[:, 0], myDat2[:, 1])
    myTree = creatTree(myDat2, modelLeaf, modelErr, ops = (1, 10))
    ex2test = r'C:\Users\Administrator\Desktop\python\data\ex2test.txt'
    myDataTest = loadData(ex2test)
    myDataTest = np.array(myDataTest)
    plt.scatter(myDataTest[:, 0], myDataTest[:, 1])
    # note: prune() averages scalar leaves, so it does not apply to a model tree


# Evaluate a regression-tree leaf: the leaf stores a constant prediction
def regTreeEval(model, inDat):
    return float(model)

# Evaluate a model-tree leaf: prepend a 1 for the intercept, then apply the linear model
def modelTreeEval(model, inDat):
    n = np.shape(inDat)[1]
    X = np.mat(np.ones((1, n+1)))
    X[:, 1: n+1] = inDat
    return float(X*model)
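
For instance, a leaf holding intercept 1 and slope 2 predicts 7.0 at x = 3 (illustrative):
#ws = np.mat([[1.0], [2.0]])          # intercept 1, slope 2
#modelTreeEval(ws, np.mat([[3.0]]))   # 7.0 = 1 + 2*3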

# Walk the tree top-down until a leaf is reached, then call modelEval() on the input data; the default is regTreeEval()
def treeForeCast(tree, inData, modelEval = regTreeEval):
    if not isTree(tree):
        return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']):
            return treeForeCast(tree['left'], inData, modelEval)
        else:
            return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']):
            return treeForeCast(tree['right'], inData, modelEval)
        else:
            return modelEval(tree['right'], inData)
        
# Run the tree over an entire test set, returning a column vector of predictions.
def creatForeCast(tree, testData, modelEval = regTreeEval):
    m = len(testData)
    yHat = np.mat(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, np.mat(testData[i]), modelEval)
    return yHat
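
Finally, a common way to judge the fit is the correlation between predictions and actual targets (a sketch, reusing myTree and myDataTest from the model-tree test above; since the leaves hold regression coefficients, modelTreeEval must be passed explicitly):

if __name__ == '__main__':
    yHat = creatForeCast(myTree, myDataTest[:, 0], modelTreeEval)
    # correlation closer to 1 means a better fit
    print(np.corrcoef(np.asarray(yHat).ravel(), myDataTest[:, 1])[0, 1])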
