Machine Learning in Action: Decision Tree Code

I'm not sure whether I just got into this field too late (sob). Why is all the material I can find so old? Oh well, it will have to do.

1. Chapter 3 code: trees.py

# -*- coding: utf-8 -*-
"""
Created on Sat Mar 21 11:27:43 2020

@author: 29033
"""

''' script 3-1 '''

from math import log
import operator

# Compute the Shannon entropy of a given dataset
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of rows == number of samples
    labelCounts = {}  # dictionary mapping each label to its count
    for featVec in dataSet:  # process one row at a time
        currentLabel = featVec[-1]  # the label is the last element
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0  # first occurrence of this label
        labelCounts[currentLabel] += 1  # otherwise just bump the count

    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob*log(prob,2)
    return shannonEnt

# Build the toy dataset
def createDataSet():
    dataSet = [[1,1,'yes'],  # the last element of each row is the class label
               [1,1,'yes'],
               [1,0,'no'],
               [0,1,'no'],
               [0,0,'no']]
    labels = ['no surfacing','flippers']  # the feature names; they become the keys of the tree dict
    return dataSet,labels
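
# Quick sanity check (my addition, not from the book): two of the five samples
# are 'yes', so the entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5) ~ 0.971.
if __name__ == '__main__':
    demoData, demoLabels = createDataSet()
    print(calcShannonEnt(demoData))  # expected: ~0.9709505944546686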


# Split the dataset: keep rows where the given feature (axis) equals value
def splitDataSet(dataSet,axis,value)->list:
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
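
# Example (my addition): splitting on feature 0 with value 1 keeps the three
# samples whose first feature equals 1 and strips that feature column.
if __name__ == '__main__':
    demoData, demoLabels = createDataSet()
    print(splitDataSet(demoData, 0, 1))  # expected: [[1, 'yes'], [1, 'yes'], [0, 'no']]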


# Choose the best way to split the dataset
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0;bestFeature = -1
    for i in range(numFeatures):
        feaList = [example[i] for example in dataSet]  # all values taken by the i-th feature
        uniqueVals = set(feaList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet,i,value)  # the subset where feature i equals this value
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob*calcShannonEnt(subDataSet)  # weighted average entropy
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain  # track the largest information gain
            bestFeature = i
    return bestFeature
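
# Example (my addition): on the toy dataset the first feature is chosen; its
# information gain (~0.420) is at least as large as any other feature's.
if __name__ == '__main__':
    demoData, demoLabels = createDataSet()
    print(chooseBestFeatureToSplit(demoData))  # expected: 0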

# Recursively building the decision tree
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),  # items(), not Python 2's iteritems(); sorted() returns a new list
                              key = operator.itemgetter(1),  # sort by field 1, i.e. the count
                              reverse = True)  # descending order

    # note: list.sort() would modify the list in place and return None
    return sortedClassCount[0][0]  # return the majority label itself, not the whole sorted list
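
# Example (my addition): two 'no' votes beat one 'yes'.
if __name__ == '__main__':
    print(majorityCnt(['yes', 'no', 'no']))  # expected: no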

# The recursive tree-building function
def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):  # count() returns how often an element occurs; stop if all labels agree
        return classList[0]
    if len(dataSet[0]) == 1:  # every feature has been consumed; only the label column is left
        return majorityCnt(classList)  # fall back to the most frequent label
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]  # name of the best feature
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])  # del removes the reference, not the object; the consumed feature name must not reach the recursive calls
    featValues = [example[bestFeat] for example in dataSet]  # all values taken by the best feature
    uniqueVals = set(featValues)  # keep only the distinct values
    for value in uniqueVals:
        subLabels = labels[:]  # copy so the recursive calls do not interfere with one another
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet,bestFeat,value),
              subLabels)
    return myTree
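
# Example (my addition): on the toy data the learned tree matches the book's
# result. createTree consumes the labels list via del, so pass a copy if you
# still need the original afterwards.
if __name__ == '__main__':
    demoData, demoLabels = createDataSet()
    print(createTree(demoData, demoLabels[:]))
    # expected: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}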

# Classifying with the decision tree
def classify(inputTree,featLabels,testVec):  # inputTree: tree learned from the training set; featLabels: feature names; testVec: sample to classify
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]  # the branches leaving this node
    featIndex = featLabels.index(firstStr)  # map the feature name at this node to its index, so we can read the sample's value
    for key in secondDict.keys():
        if testVec[featIndex] == key:  # does the sample's value match this branch?
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key],featLabels,testVec)  # internal node: recurse down this branch
            else:
                classLabel = secondDict[key]  # leaf node: this is the prediction
    return classLabel  # note: if no branch matched, classLabel is unbound and a NameError is raised
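
# Example (my addition): classifying two samples with the tree from the book.
# [1,0] has flippers == 0 and lands on the 'no' leaf; [1,1] reaches 'yes'.
if __name__ == '__main__':
    demoTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    demoLabels = ['no surfacing', 'flippers']
    print(classify(demoTree, demoLabels, [1, 0]))  # expected: no
    print(classify(demoTree, demoLabels, [1, 1]))  # expected: yes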

# Persisting the decision tree with pickle
def storeTree(inputTree,filename):
    import pickle
    fw = open(filename,'wb')
    pickle.dump(inputTree,fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename,'rb')
    return pickle.load(fr)
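
# Example (my addition): a pickle round trip should return an identical tree.
# 'demoTree.txt' is just a throwaway file name.
if __name__ == '__main__':
    demoTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    storeTree(demoTree, 'demoTree.txt')
    print(grabTree('demoTree.txt') == demoTree)  # expected: True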
    

treePlotter code: treePlotter.py

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 08:49:44 2020

@author: 29033
"""
import matplotlib
import matplotlib.pyplot as plt


decisionNode = dict(boxstyle = "sawtooth",fc = "0.8")
leafNode = dict(boxstyle = "round4",fc = "0.8")
arrow_args = dict(arrowstyle = "<-")
matplotlib.rcParams['font.family'] = 'STSong'  # CJK-capable font (the original used Chinese node labels)

def plotNode(nodeTxt,centerPt,parentPt,nodeType):
    createPlot.axl.annotate(nodeTxt,xy = parentPt,\
                            xycoords = "axes fraction",
                            xytext = centerPt,textcoords = "axes fraction",\
                            va = "center",ha = "center",bbox = nodeType,
                            arrowprops = arrow_args)
    

def createPlot():  # demo version; redefined further below with the full tree layout
    fig = plt.figure(1,facecolor = "white")
    fig.clf()
    createPlot.axl = plt.subplot(111,frameon = False)
    plotNode('decision node',(0.5,0.1),(0.1,0.5),decisionNode)
    plotNode('leaf node',(0.8,0.1),(0.3,0.8),leafNode)
    plt.show()
    

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]  # in Python 3, myTree.keys() returns a dict_keys view, hence the list() call
    # this yields the root key of the (sub)tree, i.e. the node we start from
    secondDict = myTree[firstStr]  # the value under that key: the branches that can be followed
    # for example, successive recursive calls see:
    '''
    no surfacing                                      # key
    {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}   # value
    flippers
    {0: 'no', 1: 'yes'}
    '''
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])  # internal node: add the leaves of the whole subtree (nice recursion)
        else:
            numLeafs += 1  # leaf node
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])  # bug fix: count this level and assign to thisDepth, not maxDepth
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def retrieveTree(i):
    listOfTree = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                  {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head':{0: 'no', 1: 'yes'}}, 1: 'no'}}}}
            ]
    
    return listOfTree[i]
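
# Example (my addition): tree 0 above has three leaves ('no', 'no', 'yes')
# spread over two levels of decision nodes, so expect 3 and 2.
if __name__ == '__main__':
    demoTree = retrieveTree(0)
    print(getNumLeafs(demoTree))   # expected: 3
    print(getTreeDepth(demoTree))  # expected: 2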


def plotMidText(cntrPt,parentPt,txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0+cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0+cntrPt[1]
    createPlot.axl.text(xMid,yMid,txtString)
    
def plotTree(myTree,parentPt,nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0+float(numLeafs))/2.0/plotTree.totalW,\
              plotTree.yOff)
    plotMidText(cntrPt,parentPt,nodeTxt)
    plotNode(firstStr,cntrPt,parentPt,decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    
    for key in secondDict.keys():
        if type (secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key],cntrPt,str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key],(plotTree.xOff,plotTree.yOff),\
                     cntrPt,leafNode)
            plotMidText((plotTree.xOff,plotTree.yOff),cntrPt,str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

# Main plotting function (redefines the demo createPlot above)
def createPlot(inTree):
    fig = plt.figure(1,facecolor = 'white')
    fig.clf()
    axprops = dict(xticks = [],yticks = [])
    createPlot.axl = plt.subplot(111,frameon = False,**axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree,(0.5,1.0),'')
    plt.show()

Driver script

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 23 12:52:29 2020

@author: 29033
"""

# Build the tree
import trees
import treePlotter
'''
# tests
myDat,labels = trees.createDataSet()
#myDat[0][-1] = 'maybe'
#print(trees.calcShannonEnt(myDat))

#print(trees.chooseBestFeatureToSplit(myDat))

#myTree = trees.createTree(myDat,labels)
#print(myTree)

myTree = treePlotter.retrieveTree(0)
print(trees.classify(myTree,labels,[1,0]))
#print(trees.classify(myTree,labels,[1,1]))
'''

'''
import treePlotter
treePlotter.createPlot()
myTree = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree)
#print(treePlotter.getTreeDepth(myTree))

'''

#trees.storeTree(myTree,'cl.txt')
myTree = trees.grabTree('cl.txt')
print(myTree)

Chapter 9: Tree Regression

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 10:40:34 2020

@author: 29033
"""

'''
from numpy import *

def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float,curLine))
        dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet,feature,value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1



def regLeaf(dataSet):
    return mean(dataSet[:,-1])

def regErr(dataSet):
    return var(dataSet[:,-1])*shape(dataSet)[0]

def chooseBestSplit(dataSet,leafType = regLeaf,errType = regErr,ops=(1,4)):
    # prepruning is built into this function
    tolS = ops[0];tolN = ops[1]
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        return None,leafType(dataSet)
    m,n = shape(dataSet)
    S = errType(dataSet)
    bestS = inf;bestIndex = 0;bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):
            mat0,mat1 = binSplitDataSet(dataSet,featIndex,splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0)+errType(mat1)
            if newS  < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S-bestS) < tolS:  # the split must reduce the error by at least tolS, otherwise don't split
        return None,leafType(dataSet)
    mat0,mat1 = binSplitDataSet(dataSet,bestIndex,bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # each side must keep at least tolN samples; too few leads to overfitting
        return None,leafType(dataSet)
    return bestIndex,bestValue
        

def createTree(dataSet,leafType = regLeaf,errType = regErr,ops = (1,4)):
    feat,val = chooseBestSplit(dataSet,leafType,errType,ops)
    if feat == None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet,rSet = binSplitDataSet(dataSet,feat,val)
    retTree['left'] = createTree(lSet,leafType,errType,ops)
    retTree['right'] = createTree(rSet,leafType,errType,ops)
    return retTree

# Postpruning
def isTree(obj):
    return (type(obj).__name__ == 'dict')

def getMean(tree):
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0

def prune(tree,testData):  # the tree to prune, plus what should properly be called a validation set
    if shape(testData)[0] == 0:
        return getMean(tree)
    if isTree(tree['left']) or isTree(tree['right']):
        lSet,rSet = binSplitDataSet(testData,tree['spInd'],tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'],lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'],rSet)
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet,rSet = binSplitDataSet(testData,tree['spInd'],tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2)) + sum(power(rSet[:,-1] - tree['right'],2))  # error if the split is kept
        treeMean = (tree['left'] + tree['right'])/2.0
        errorMerge = sum(power(testData[:,-1] - treeMean,2))  # error if the two leaves are merged (bug fix: measured over all of testData, not just lSet)
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean  # collapse the two leaves into their mean
        else:
            return tree
    else:
        return tree  # don't prune
    
# Model trees
def linearSolve(dataSet):
    m,n = shape(dataSet)
    X = mat(ones((m,n)));Y = mat(ones((m,1)))
    X[:,1:n] = dataSet[:,0:n-1]  # counting from 0; the first column of X stays at 1 for the constant term
    Y = dataSet[:,-1]
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
                        try increasing the second value of ops')  # xTx is not invertible
    ws = xTx.I*(X.T*Y)
    return ws,X,Y

def modelLeaf(dataSet):
    ws,X,Y =linearSolve(dataSet)
    return ws

def modelErr(dataSet):
    ws,X,Y = linearSolve(dataSet)
    yHat = X*ws
    return sum(power(Y-yHat,2))

# Forecasting with tree regression
def regTreeEval(model,inDat):  # evaluate a regression leaf, which is just a constant
    return float(model)

def modelTreeEval(model,inMat):
    n=inMat.shape[1]
    X=mat(ones((1,n)))
    X[:,1:n]=inMat[:,:-1]
    return float(X*model)

def treeForeCast(tree,inData,modelEval = regTreeEval):
    if not isTree(tree):
        return modelEval(tree,inData)
    if float(inData[:,tree['spInd']])>tree['spVal']:  # bug fix: the parameter here is named inData, not inMat
        if isTree(tree['left']):
            return treeForeCast(tree['left'],inData,modelEval)
        else:
            return modelEval(tree['left'],inData)
    else:
        if isTree(tree['right']):
            return treeForeCast(tree['right'],inData,modelEval)
        else:
            return modelEval(tree['right'],inData)

def createForeCast(tree,testData,modelEval = regTreeEval):  # forecast over an entire test set
    m = len(testData)
    yHat = mat(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree,mat(testData[i]),modelEval)
    return yHat
# The (commented-out) block above is my own version; what follows was written by a more experienced author.
'''

# -*- coding:utf-8 -*-
import math
from numpy import *
import matplotlib.pyplot as plt

def loadDataSet(fileName):
    fr=open(fileName)
    dataSet=[]
    for line in fr.readlines():
        items=line.strip().split('\t')
        dataSet.append(list(map(float,items)))
    return dataSet

def regLeaf(dataMat):
    return mean(dataMat[:,-1])

def regErr(dataMat):
    return var(dataMat[:,-1])*dataMat.shape[0]

def modelLeaf(dataMat):
    ws,X,Y=linearSolve(dataMat)
    return ws

def modelErr(dataMat):
    ws,X,Y=linearSolve(dataMat)
    YHat=X*ws
    return sum(power(YHat-Y,2))

def binSplitDataSet(dataMat,feature,value):
    mat0=dataMat[nonzero(dataMat[:,feature]>value)[0]]
    mat1=dataMat[nonzero(dataMat[:,feature]<=value)[0]]
    return mat0,mat1
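
# Quick check (my addition, mirroring the book's own test): splitting an
# identity matrix on feature 1 at 0.5 sends the one row whose second entry
# is 1 into mat0 and the other three rows into mat1.
if __name__ == '__main__':
    testMat = mat(eye(4))
    demoMat0, demoMat1 = binSplitDataSet(testMat, 1, 0.5)
    print(demoMat0)  # expected: [[0. 1. 0. 0.]]
    print(demoMat1)  # the remaining three rows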

def chooseBestFeature(dataMat,leafType,errType,ops):
    tolS=ops[0];tolN=ops[1]
    if len(set(dataMat[:,-1].T.tolist()[0]))==1:
        return None,leafType(dataMat)
    m,n=shape(dataMat);S=errType(dataMat)
    bestS=inf;bestVal=0;bestFeature=0
    for feat in range(n-1):
        for splitValue in set(dataMat[:,feat].T.tolist()[0]):
            mat0,mat1=binSplitDataSet(dataMat,feat,splitValue)
            if (mat0.shape[0]<tolN) or (mat1.shape[0]<tolN):
                continue
            nowErr=errType(mat0)+errType(mat1)
            if nowErr<bestS:
                bestS=nowErr
                bestFeature=feat
                bestVal=splitValue
    if (S-bestS)<tolS:  # not abs(): if no admissible split was found, bestS stays inf and we must return a leaf
        return None,leafType(dataMat)
    mat0,mat1=binSplitDataSet(dataMat,bestFeature,bestVal)
    if (mat0.shape[0]<tolN) or (mat1.shape[0]<tolN):
        return None,leafType(dataMat)
    return bestFeature,bestVal
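
# Example (my addition) on hypothetical toy data: y jumps from 0 to 10 at
# x = 0.5, so the best split is on feature 0 at the last x before the jump.
# With tolN = 4, the 4-vs-4 split at 0.4 is the only admissible one.
if __name__ == '__main__':
    toyMat = mat([[0.1, 0.0], [0.2, 0.0], [0.3, 0.0], [0.4, 0.0],
                  [0.6, 10.0], [0.7, 10.0], [0.8, 10.0], [0.9, 10.0]])
    print(chooseBestFeature(toyMat, regLeaf, regErr, (1, 4)))  # expected: (0, 0.4)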

def isTree(obj):
    return (type(obj).__name__=='dict')

def createTree(dataMat,leafType=modelLeaf,errType=modelErr,ops=(1,4)):
    feat,val=chooseBestFeature(dataMat,leafType,errType,ops)
    if feat==None:
        return val
    retTree={}
    retTree['spInd']=feat
    retTree['spVal']=val
    leftMat,rightMat=binSplitDataSet(dataMat,feat,val)
    retTree['lTree']=createTree(leftMat,leafType,errType,ops)
    retTree['rTree']=createTree(rightMat,leafType,errType,ops)
    # While building the tree, record the number of leaves under each node and the
    # sum of their leaf values, so that postpruning can collapse a subtree quickly.
    # This already departs quite a bit from the book's original code.
    if isTree(retTree['lTree']) and isTree(retTree['rTree']):
        retTree['leafN']=retTree['lTree']['leafN']+retTree['rTree']['leafN']
        retTree['total']=retTree['lTree']['total']+retTree['rTree']['total']
    elif (not isTree(retTree['lTree'])) and isTree(retTree['rTree']):
        retTree['leafN']=1+retTree['rTree']['leafN']
        retTree['total']=retTree['lTree']+retTree['rTree']['total']
    elif isTree(retTree['lTree']) and (not isTree(retTree['rTree'])):
        retTree['leafN']=retTree['lTree']['leafN']+1
        retTree['total']=retTree['lTree']['total']+retTree['rTree']
    else:
        retTree['leafN']=2
        retTree['total']=retTree['lTree']+retTree['rTree']
    return retTree
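
# Example (my addition): on the same toy step data, the tree is one split with
# two constant leaves, so the bookkeeping fields give leafN == 2 and
# total == 0.0 + 10.0.
if __name__ == '__main__':
    toyMat = mat([[0.1, 0.0], [0.2, 0.0], [0.3, 0.0], [0.4, 0.0],
                  [0.6, 10.0], [0.7, 10.0], [0.8, 10.0], [0.9, 10.0]])
    toyTree = createTree(toyMat, regLeaf, regErr, (1, 4))
    print(toyTree['leafN'], toyTree['total'])  # expected: 2 10.0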

def getMean(tree):
    if isTree(tree):
        if isTree(tree['lTree']):
            tree['lTree']=tree['lTree']['total']
        if isTree(tree['rTree']):
            tree['rTree']=tree['rTree']['total']
        return tree['total']*1.0/tree['leafN']
    else:
        return tree

def prune(tree,testData):
    if testData.shape[0]==0:
        return getMean(tree)
    if isTree(tree['lTree']) or isTree(tree['rTree']):
        lSet,rSet=binSplitDataSet(testData,tree['spInd'],tree['spVal'])
    if isTree(tree['lTree']):
        tree['lTree']=prune(tree['lTree'],lSet)
    if isTree(tree['rTree']):
        tree['rTree']=prune(tree['rTree'],rSet)
    if not isTree(tree['lTree']) and not isTree(tree['rTree']):
        lSet,rSet=binSplitDataSet(testData,tree['spInd'],tree['spVal'])
        errNoMerge=sum(power(lSet[:,-1]-tree['lTree'],2))+sum(power(rSet[:,-1]-tree['rTree'],2))
        treeMean=tree['total']/tree['leafN']
        errMerge=sum(power(testData[:,-1]-treeMean,2))
        if errMerge<errNoMerge:  # bug fix: merge only when merging lowers the error
            print ("merging")
            return treeMean
        else:
            return tree
    else:
        return tree

def linearSolve(dataSet):
    m,n = shape(dataSet)
    X = mat(ones((m,n)));Y = mat(ones((m,1)))
    X[:,1:n] = dataSet[:,0:n-1]  # counting from 0; the first column of X stays at 1 for the constant term
    Y = dataSet[:,-1]
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
                        try increasing the second value of ops')  # xTx is not invertible
    ws = xTx.I*(X.T*Y)
    return ws,X,Y
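
# Example (my addition): points lying exactly on y = 1 + 2x, so the solver
# should recover intercept 1 and slope 2.
if __name__ == '__main__':
    lineMat = mat([[0.0, 1.0], [1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
    demoWs, demoX, demoY = linearSolve(lineMat)
    print(demoWs.T)  # expected: approximately [[1. 2.]]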

# Regression-tree prediction: a leaf stores a constant
def regTreeEval(model,inMat):
    return float(model)

# Model-tree prediction: a leaf stores a weight vector
def modelTreeEval(model,inMat):
    n=inMat.shape[1]
    X=mat(ones((1,n)))
    X[:,1:n]=inMat[:,:-1]
    return float(X*model)

def treeForeCast(tree,inMat,modelEval=modelTreeEval):
    if not isTree(tree):
        return modelEval(tree,inMat)
    if float(inMat[:,tree['spInd']])>tree['spVal']:
        if not isTree(tree['lTree']):
            return modelEval(tree['lTree'],inMat)
        else:
            return treeForeCast(tree['lTree'],inMat,modelEval)
    else:
        if not isTree(tree['rTree']):
            return modelEval(tree['rTree'],inMat)
        else:
            return treeForeCast(tree['rTree'],inMat,modelEval)

def createForeCast(tree,testMat,modelEval=modelTreeEval):
    m=testMat.shape[0]
    yHat=mat(zeros((m,1)))
    for i in range(m):
        yHat[i]=treeForeCast(tree,testMat[i],modelEval)
    return yHat
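
# Example (my addition): forecasting with a hand-built regression stump. Since
# binSplitDataSet puts the "greater than" side into the left subtree, 'lTree'
# holds the prediction for x > 0.5 here.
if __name__ == '__main__':
    demoStump = {'spInd': 0, 'spVal': 0.5, 'lTree': 10.0, 'rTree': 0.0}
    print(createForeCast(demoStump, mat([[0.2], [0.8]]), regTreeEval).T)  # expected: [[ 0. 10.]]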

def run():
    dataSet=loadDataSet('bikeSpeedVsIq_train.txt')
    testSet=loadDataSet('bikeSpeedVsIq_test.txt')
    tree=createTree(mat(dataSet),ops=(1,20))
    yHat=createForeCast(tree,mat(testSet))
    print (corrcoef(yHat.T,mat(testSet)[:,1].T))
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(array(dataSet)[:,0],array(dataSet)[:,1],c='cyan',marker='o')
    plt.show()

run()