西瓜书 习题4.4 编程实现基尼指数决策树

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qdbszsj/article/details/79096767

数据及代码地址:https://github.com/qdbszsj/decisionTreeGini

这里的代码在信息熵决策树的基础上稍加修改就可以,之前是根据熵增的最大值来确定用哪个属性划分,现在是根据基尼指数(表现数据集D的纯度)的最小值来建树。

网上很多人问为什么建出来的树和书上的不一样:本来就不应该一样。书上P81页的那棵树是根据信息熵(entropy)建的,而我们这棵是根据基尼指数(Gini index)建的。而且经常会遇到不同属性的基尼指数相同的情况,比如在根节点处,"脐部"和"色泽"这两个属性的基尼指数都是0.875,用哪个都可以,这里就需要加入随机因素。所以这份代码每次跑出来的结果都可能不同——从理论上讲,这棵树本来就有多种等价的建法。

具体的代码过程解释,请看我的上一篇博客,习题4.3,这里改动不大。

然后有一点要注意的,一个bug我调了好久。我传入的参数A,应该是一个list,但是我用np.ones初始化的,然后想用newA=A[ : ]深copy的时候,会出问题,numpy是不支持B=A[ : ]这样子深拷贝的,这样子copy出来的是浅拷贝,把B改了,A也跟着改了,这个问题出现在了我的建树的递归函数里。解决这个问题,要么一开始你就list(A)一下强制转换,后面就可以用B=A[ : ]深拷贝了,或者你写的时候就用numpy,然后import copy,用B=copy.deepcopy(A)来深拷贝。

程序最下面,我用建好的树预测了7个测试用例,并输出了混淆矩阵。由于每次运行建出的树都不一样,混淆矩阵的结果也各不相同,差别很大;偶然的一次运行得到了很棒的结果,正确率100%。在实际操作中,我们也应该挑选这种在测试集上表现好的模型,作为我们的最终模型。

剪枝,这里没有做,后面有时间再写。

#make the data watermelon_2 from 3
# import numpy as np
# import pandas as pd
# dataset = pd.read_csv('/home/parker/watermelonData/watermelon_3.csv', delimiter=",")
# del dataset['密度']
# del dataset['含糖率']
# dataset.to_csv('/home/parker/watermelonData/watermelon_2.csv',header=True,index=False)

# import numpy as np
# import pandas as pd
# dataset = pd.read_csv('/home/parker/watermelonData/watermelon_2.csv', delimiter=",")
# #print(dataset)
# trainID=[0,1,2,5,6,9,13,14,15,16]
# testID=[3,4,7,8,10,11,12]
# trainData=dataset.iloc[trainID,range(8)]
# testData=dataset.iloc[testID,range(8)]
# print(trainData)
# print(testData)
# trainData.to_csv('/home/parker/watermelonData/watermelon_2train.csv', header=True, index=False)
# testData.to_csv('/home/parker/watermelonData/watermelon_2test.csv', header=True, index=False)

import numpy as np
import pandas as pd
# Load the pre-split train/test halves of the watermelon 2.0 dataset.
dataset = pd.read_csv('/home/parker/watermelonData/watermelon_2train.csv', delimiter=",")
testData = pd.read_csv('/home/parker/watermelonData/watermelon_2test.csv', delimiter=",")
print(dataset)

Attributes=dataset.columns  # column names; the last column is the class label
m,n=np.shape(dataset)  # m samples, n columns (ID + attributes + label)
# print(m,n)

dataset=np.matrix(dataset)
# attributeSet[i] is the set of distinct values attribute i takes in the training data;
# used by treeGenerate to enumerate branches, including values absent from a subset D.
attributeSet=[]
for i in range(n):
    curSet=set()
    for j in range(m):
        curSet.add(dataset[j,i])
    attributeSet.append(curSet)

DD=np.arange(0,m,1)  # row indices of all training samples
AA=np.ones(n)
AA=list(AA)  # keep A as a plain list so A[:] behaves as a real copy in the recursion
AA[0]=AA[n-1]=-1  # mark the ID column and the label column as unusable attributes
EPS=1

import random
import copy
def treeGenerate(D, A, title):
    """Recursively build a Gini-index decision tree (watermelon book, p74 algorithm).

    D: list/array of row indices into the global `dataset` (current sample subset).
    A: list of attribute flags; A[i] > 0 means attribute i is still available.
    title: label of the edge leading to this node (the attribute value taken).
    Returns the root Node of the (sub)tree.
    """
    node = Node(title)
    if isSameY(D):  # p74 condition (1): all samples share one class
        node.v = dataset[D[0], n - 1]
        return node
    if isBlankA(A) or isSameAinD(D, A):  # condition (2): A empty, or D identical on remaining attributes
        node.v = mostCommonY(D)
        return node

    # Choose the attribute with the smallest Gini index (formula 4.6).
    # Ties are broken at random, so the resulting tree differs between runs.
    giniGain = n  # upper bound; any real Gini index is smaller
    sameValue = []  # (attribute index, divide value) pairs sharing the current minimum
    for i in range(len(A)):
        if A[i] > 0:
            curGini, divideV = giniIndex(D, i)  # formula 4.6
            if curGini <= giniGain:
                if curGini < giniGain:
                    giniGain = curGini
                    sameValue = [(i, divideV)]
                else:  # exact tie: remember this candidate too
                    sameValue.append((i, divideV))

    # BUGFIX: pick the divide value together with the attribute.  Previously
    # floatV was only updated on a strict improvement, so a randomly chosen
    # tied attribute could be paired with another attribute's divide value.
    p, floatV = sameValue[random.randint(0, len(sameValue) - 1)]

    if isSameValue(-1000, floatV, EPS):  # sentinel -1000: categorical attribute
        node.v = Attributes[p] + "=?"
        curSet = attributeSet[p]
        for i in curSet:
            Dv = [j for j in D if dataset[j, p] == i]
            if Dv == []:  # condition (3): no sample in D carries this value
                nextNode = Node(i)
                nextNode.v = mostCommonY(D)
                node.children.append(nextNode)
                # the book says return here, but continuing covers the remaining values
            else:
                newA = copy.deepcopy(A)  # A[:] is not a deep copy for numpy arrays
                newA[p] = -1
                node.children.append(treeGenerate(Dv, newA, i))
    else:  # continuous attribute: floatV is the split boundary
        node.v = Attributes[p] + "<=" + str(floatV) + "?"
        Dleft = [d for d in D if dataset[d, p] <= floatV]
        Dright = [d for d in D if dataset[d, p] > floatV]
        # A is a plain list here, so A[:] is a safe copy of the flag values
        node.children.append(treeGenerate(Dleft, A[:], "yes"))
        node.children.append(treeGenerate(Dright, A[:], "no"))
    return node

class Node(object):
    """One decision-tree node; leaves carry a class label in `v`."""

    def __init__(self, title):
        self.title = title   # label of the edge from the parent (attribute value)
        self.v = 1           # node text: "attr=?" for splits, class label for leaves
        self.children = []   # child Node objects, one per branch
        self.deep = 0        # depth in the tree, filled in later for plotting
        self.ID = -1         # leaf index for plotting; -1 means internal node

def isSameY(D):
    """Return True when every sample in D has the same class label (last column)."""
    firstLabel = dataset[D[0], n - 1]
    return all(dataset[idx, n - 1] == firstLabel for idx in D[1:])

def isBlankA(A):
    """Return True when no attribute in A is still available (all flags <= 0).

    Iterates over A itself instead of the global ``n``; behavior is identical
    for the normal case len(A) == n, but the function no longer breaks (or
    reads out of range) when given a list of a different length.
    """
    return all(flag <= 0 for flag in A)

def isSameAinD(D, A):
    """Return True when all samples in D agree on every still-available attribute."""
    for col in range(n):
        if A[col] <= 0:
            continue  # attribute no longer usable on this path
        reference = dataset[D[0], col]
        for k in range(1, len(D)):
            if not isSameValue(reference, dataset[D[k], col], EPS):
                return False
    return True

def isSameValue(v1, v2, EPS):
    """Equality test for two attribute values.

    EPS is currently unused; it is reserved for tolerance-based comparison of
    continuous (float) attributes, which this categorical dataset does not have.
    """
    return v1 == v2

def mostCommonY(D):
    """Return the most frequent class label among the samples in D.

    Ties keep whichever label *reached* the winning count first while scanning
    D in order (same tie behavior as the original running-max implementation).
    """
    tally = {}
    winner = dataset[D[0], n - 1]
    best = 0
    for idx in D:
        label = dataset[idx, n - 1]
        tally[label] = tally.get(label, 0) + 1
        if tally[label] > best:
            best = tally[label]
            winner = label
    return winner

def gini(D):
    """Gini value of the sample subset D (formula 4.5): 1 - sum_k p_k^2."""
    counts = {}
    for idx in D:
        label = dataset[idx, n - 1]
        counts[label] = counts.get(label, 0) + 1
    total = len(D)
    ans = 1
    for cnt_k in counts.values():
        ans -= cnt_k / total * cnt_k / total  # subtract p_k^2 per class
    return ans

def giniIndex(D, p):
    """Gini index of splitting sample subset D on attribute column p (formula 4.6).

    Returns (giniIndex, divideV); divideV is the sentinel -1000 because only
    categorical attributes are handled here (the disabled float branch that
    remained as dead `if False` code has been removed).
    """
    # Group the sample indices in D by their value of attribute p.
    groups = {}
    for idx in D:
        value = dataset[idx, p]
        groups.setdefault(value, []).append(idx)
    # BUGFIX: formula 4.6 is sum over values v of |Dv|/|D| * Gini(Dv); the
    # previous code summed the raw Gini(Dv) values without the |Dv|/|D|
    # weight, biasing attribute selection.
    total = len(D)
    res = 0
    for Dv in groups.values():
        res += len(Dv) / total * gini(Dv)
    divideV = -1000  # categorical split: no numeric boundary
    return res, divideV


# Build the decision tree over all training samples with all usable attributes.
myDecisionTreeRoot=treeGenerate(DD,AA,"root")


def countLeaf(root, deep):
    """Assign depths and measure the subtree below `root`.

    Sets root.deep = deep, then returns (leaf count, maximum depth) of the
    subtree.  A node counts as a leaf when its text is a class label
    ('是' / '否').
    """
    root.deep = deep
    if root.v in ('是', '否'):
        return 1, deep
    leaves = 0
    maxDeep = deep
    for child in root.children:
        subLeaves, subDeep = countLeaf(child, deep + 1)
        leaves += subLeaves
        maxDeep = max(maxDeep, subDeep)
    return leaves, maxDeep
cnt,deep=countLeaf(myDecisionTreeRoot,0)  # leaf count and tree depth, used for plot layout
def giveLeafID(root, ID):
    """Number the leaves left-to-right starting at ID; return the next free ID.

    Internal nodes keep ID == -1; leaves are the nodes whose text is a class
    label ('是' / '否').
    """
    if root.v in ('是', '否'):
        root.ID = ID
        return ID + 1
    for child in root.children:
        ID = giveLeafID(child, ID)
    return ID
giveLeafID(myDecisionTreeRoot,0)  # number leaves left-to-right for x placement in the plot

import matplotlib.pyplot as plt
# Node/arrow styles for the tree plot.
decisionNode=dict(boxstyle="sawtooth",fc="0.8")
leafNode=dict(boxstyle="round4",fc="0.8")
arrow_args=dict(arrowstyle="<-")

def plotNode(nodeTxt,centerPt,parentPt,nodeType):
    # Draw one node box at centerPt with an arrow from parentPt (axes-fraction coordinates).
    plt.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',xytext=centerPt,
                                textcoords='axes fraction',va="center",ha="center",bbox=nodeType,arrowprops=arrow_args)
fig=plt.figure(1,facecolor='white')

import matplotlib as  mpl
# Use a font with CJK glyphs so Chinese labels render, and keep the minus sign ASCII.
mpl.rcParams[u'font.sans-serif'] = ['simhei']
mpl.rcParams['axes.unicode_minus'] = False


def dfsPlot(root):
    """Plot the subtree rooted at `root`; return its x coordinate.

    Leaves (ID >= 0) are spread evenly over x in [0.1, 0.9]; an internal node
    sits at the mean x of its children.  Depths map linearly onto y via the
    global tree depth `deep`.
    """
    if root.ID != -1:
        # Leaf: x position determined by its left-to-right leaf index.
        return 0.1 + root.ID * 0.8 / (cnt - 1)
    childrenPx = [dfsPlot(child) for child in root.children]
    meanPx = sum(childrenPx) / len(root.children)
    for px, child in zip(childrenPx, root.children):
        nodetype = decisionNode if child.ID < 0 else leafNode
        plotNode(child.v, (px, 0.9 - child.deep * 0.8 / deep),
                 (meanPx, 0.9 - root.deep * 0.8 / deep), nodetype)
        # Edge label (attribute value) halfway between parent and child.
        plt.text((px + meanPx) / 2,
                 (0.9 - child.deep * 0.8 / deep + 0.9 - root.deep * 0.8 / deep) / 2,
                 child.title)
    return meanPx
rootX=dfsPlot(myDecisionTreeRoot)  # plot all non-root nodes; returns the root's x position
plotNode(myDecisionTreeRoot.v,(rootX,0.9),(rootX,0.9),decisionNode)  # draw the root itself


testData=np.matrix(testData)  # matrix form so testData[i, j] indexing matches the tree code

def treePredictSet(root, testSet):
    """Run every row of testSet through the tree; return a 2x2 confusion matrix.

    The matrix is indexed [actual, predicted] with '否' mapped to 0 and the
    other label ('是') mapped to 1.
    """
    rows, cols = np.shape(testSet)
    confusionMatrix = np.zeros((2, 2))
    for r in range(rows):
        predicted = treePredictOne(root, testSet, r)
        actual = testSet[r, cols - 1]
        actualIdx = 0 if actual == '否' else 1
        # Correct predictions land on the diagonal, mistakes off it.
        predictedIdx = actualIdx if predicted == actual else 1 - actualIdx
        confusionMatrix[actualIdx, predictedIdx] += 1
    return confusionMatrix

def treePredictOne(root, testSet, p):
    """Predict the class label for row p of testSet by walking the tree.

    Continuous (float) splits are not supported.  The attribute column is
    recovered from the node text ("attr=?") by substring match against the
    global `Attributes`.
    """
    while True:
        if root.children == []:  # leaf node (ID != -1): v holds the class label
            return root.v
        curAttribute = root.v
        for i in range(len(Attributes)):
            if Attributes[i] in curAttribute:
                curAttribute = i
                break
        title = testSet[p, curAttribute]
        for child in root.children:
            if child.title == title:
                root = child
                break
        else:
            # BUGFIX: an attribute value never seen in training matches no
            # child edge; previously `root` never advanced and the while-loop
            # spun forever.  Fall back to the first branch instead.
            root = root.children[0]

# Evaluate the tree on the held-out test rows; rows/cols of the printed
# matrix are [actual, predicted] over the labels ('否', '是').
print(treePredictSet(myDecisionTreeRoot,testData))

# fuck=np.ones(3)
# fuck=list(fuck)
# fuck2=fuck[:]
# fuck2[0]=999
# print(fuck)


plt.show()

best answer:



other1:


other2:


other3:


other4:这个也是满分,但是树长得跟第一个不一样


还有好多种组合方式,就先这样吧


展开阅读全文

没有更多推荐了,返回首页