Machine Learning Algorithms: Decision Trees

This section mainly re-implements the decision tree code from Chapter 3 of Machine Learning in Action, adds a few supplements, compares ID3 with C4.5, and then downloads a larger data set to test the decision tree's accuracy.

First, on the theory behind decision trees, there are several good blog posts worth reading:

From Decision Tree Learning to Bayesian Classification, EM, and HMM (从决策树学习谈到贝叶斯分类算法、EM、HMM)

A Summary of Decision Tree Algorithms (决策树算法总结)

These two cover essentially everything about decision trees; take your time with them.

Below is the book's code plus my additions. First, the contents of tree.py:

import operator
from math import log

# calculate the data's entropy
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]#load data's label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob,2)
    return shannonEnt

# split dataSet on the feature at index `axis`, keeping samples whose value equals `value`
def spiltDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:]) # one sample with the split column removed
            retDataSet.append(reducedFeatVec)       # collect every matching sample
    return retDataSet

# choose the best feature to split on: ID3 (information gain)
def chooseBestFeatureToSplit_ID3(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0;bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = spiltDataSet(dataSet,i,value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

# choose the best feature to split on: C4.5 (gain ratio)
def chooseBestFeatureToSplit_C45(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0;bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        splitInfo = 0.0
        for value in uniqueVals:
            subDataSet = spiltDataSet(dataSet,i,value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo -= prob * log(prob,2) # accumulate the split information
        infoGain = baseEntropy - newEntropy
        if splitInfo == 0.0: # feature has only one value here; skip it to avoid division by zero
            continue
        gainRatio = infoGain / splitInfo # gain ratio
        if (gainRatio > bestInfoGain):
            bestInfoGain = gainRatio
            bestFeature = i
    return bestFeature

# majority vote: used when every feature is exhausted but the class list is still mixed
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys(): 
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# create tree
# method = 'ID3'  or  'C45'
def createTree(dataSet,labels,method):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    if method == 'ID3':
        bestFeat = chooseBestFeatureToSplit_ID3(dataSet)
    else:
        bestFeat = chooseBestFeatureToSplit_C45(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat]) # delete the label of the feature just used
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            spiltDataSet(dataSet, bestFeat, value), subLabels, method)
    return myTree
# classify a single test vector by walking the trained tree
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None # returned unchanged if the test value matches no branch
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__=='dict':
                classLabel = classify(secondDict[key],featLabels,testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

The book explains this code in great detail; my addition is the C4.5 variant. Moving from ID3 to C4.5 does not change much: as the theory suggests, in terms of the algorithm alone it simply replaces information gain with the gain ratio. The effect of this change will show up in an experiment shortly.
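In formulas, with H the Shannon entropy computed by calcShannonEnt and D_v the subset of D whose feature A takes value v, the two criteria differ only in whether the information gain is normalized by the split information:

$$\mathrm{Gain}(D,A)=H(D)-\sum_{v}\frac{|D_v|}{|D|}\,H(D_v),\qquad \mathrm{SplitInfo}(D,A)=-\sum_{v}\frac{|D_v|}{|D|}\log_2\frac{|D_v|}{|D|},\qquad \mathrm{GainRatio}(D,A)=\frac{\mathrm{Gain}(D,A)}{\mathrm{SplitInfo}(D,A)}$$

ID3 picks the feature with the largest Gain, C4.5 the one with the largest GainRatio; the denominator grows with the number of distinct values of A, which is what penalizes many-valued features.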
To visualize the results, the book also provides some plotting functions. I'll copy them here directly and save them as treePlotter.py:

import matplotlib.pyplot as plt
#
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
# get the number of leaf nodes and the depth of the tree
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth: 
            maxDepth = thisDepth
    return maxDepth
#
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',
             xytext=centerPt, textcoords='axes fraction',
             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )

def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on
    numLeafs = getNumLeafs(myTree)  #this determines the x width of this tree
    #depth = getTreeDepth(myTree)
    firstStr = myTree.keys()[0]     #the text label for this node should be this
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictionaries; if not they are leaf nodes
            plotTree(secondDict[key],cntrPt,str(key))        #recursion
        else:   #it's a leaf node print the leaf node
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    #no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo purposes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
    plotTree(inTree, (0.5,1.0), '')
    plt.show()

Import this module, and once the decision tree has been trained simply call treePlotter.createPlot(Tree). The intermediate layout calculations are worth a closer look if you're interested.
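If you want to sanity-check the plotting code before training anything, you can hand createPlot a small hand-written tree; it expects the same nested-dict structure that tree.createTree returns (the tree below mirrors the book's toy fish example):

import treePlotter

sampleTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
treePlotter.createPlot(sampleTree)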

With all of this in place we can run some experiments. First, a decision tree on a small data set (the same one used in the book), available here:
http://archive.ics.uci.edu/ml/datasets/Lenses

For comparison with what comes later, the data looks like this (24 samples, 4 features, and 3 classes given in the last column):
[Figure: the lenses data set]
The main program looks like this:

import tree
import treePlotter

fr = open('lenses.txt')
lenses=[inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels=['age','prescript','astigmatic','tearRate']

lensesTree = tree.createTree(lenses,lensesLabels,'ID3')
treePlotter.createPlot(lensesTree)

The resulting tree looks like this:
[Figure: decision tree built by ID3 on the lenses data]
Now, why use C4.5 at all? The goal is to avoid splits that carve the data up too finely into trivially pure subsets. Choosing the split attribute by information gain is biased toward attributes with many distinct values. Take a customer-ID attribute: every customer ID corresponds to a single sample, so splitting on it makes every partition pure (each contains only one class), and its information gain reaches the maximum possible value (the entropy of the whole data set). Yet splitting on every value of such an attribute is useless for prediction. To overcome this weakness, C4.5 selects the split attribute by the gain ratio instead.
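A quick, made-up illustration of this bias (the tiny data set below is invented only for this check, it is not from the book): the first column is a unique per-sample ID and the second is an ordinary feature that already separates the two classes perfectly. Both columns reach the maximum information gain, so ID3 keeps the ID column it encountered first, while the large split information of the ID column drags its gain ratio down and C4.5 picks the ordinary feature:

import tree

toyData = [['1', 'sunny', 'no'],
           ['2', 'sunny', 'no'],
           ['3', 'rainy', 'yes'],
           ['4', 'rainy', 'yes']]

print tree.chooseBestFeatureToSplit_ID3(toyData)  # 0 -> the unique ID column
print tree.chooseBestFeatureToSplit_C45(toyData)  # 1 -> the 'weather' column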

For example, let's add one more feature column to the data set above, one whose value is different for every sample; we'll simply use the numbers 1-24. Save the augmented data as lensesC.txt, then run ID3 and C4.5 on it separately. The augmented data looks like this:
[Figure: the lenses data set with an extra unique-ID column]

import tree
import treePlotter

fr = open('lensesC.txt')
lenses=[inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels=['age','prescript','astigmatic','tearRate','num']

lensesTree = tree.createTree(lenses,lensesLabels,'ID3')
treePlotter.createPlot(lensesTree)

The ID3 result:
[Figure: decision tree built by ID3 on lensesC.txt]
Changing the call to lensesTree = tree.createTree(lenses,lensesLabels,'C45') gives the C4.5 result:
[Figure: decision tree built by C4.5 on lensesC.txt]
Clearly, the C4.5 tree is much closer to what makes sense in practice.

Next, let's use the decision tree for an actual classification task, this time on a larger data set with more features. One caveat: the algorithm here only handles features that are already discrete; it cannot deal with continuous ones. If a feature is continuous, either drop that column or discretize it yourself. There are many ways to discretize; for instance, you can pick a threshold and map every value above it to one symbol and every value below it to another, turning the continuous variable into a discrete one with two values (a minimal sketch of this threshold trick follows the download link below). The data set used here has 8 features and more than 12,000 samples, all of them discrete, which spares us the conversion. Download it from:
http://archive.ics.uci.edu/ml/datasets/Nursery
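For completeness, here is a minimal sketch of the threshold trick mentioned above; the column index, the "age" reading and the threshold of 30 are made-up placeholders, not something taken from this data set:

def discretize(dataSet, axis, threshold):
    # map a continuous column to the two symbols 'high'/'low' around a threshold
    newData = []
    for featVec in dataSet:
        newVec = featVec[:]
        newVec[axis] = 'high' if float(featVec[axis]) > threshold else 'low'
        newData.append(newVec)
    return newData

# hypothetical usage: binaryData = discretize(rawData, 2, 30.0)  # split an "age" column at 30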
After downloading, save it as a txt file in the working directory and we can start. Now we also get to decide how to split the data into a training set and a test set. To keep things random, we randomly pick part of the data as the training set to build the tree, and use the remaining samples as the test set to measure accuracy; the classification itself is handled by the classify function in tree.py. Let's try 2000 training samples:

import tree
import treePlotter
from numpy import *
# make string handling consistent, otherwise encoding errors can occur (Python 2)
import sys
reload(sys)
sys.setdefaultencoding('utf8')

fr = open('data_nursery.txt')
data = [inst.strip().split(',') for inst in fr.readlines()]
choose = random.permutation(len(data)) # random permutation of the sample indices (numpy)
test1 = []
test2 = []
# number of samples used for training
choose_num = 2000
for i in range(choose_num):
    temp = data[choose[i]]
    test1.append(temp)
for i in range(len(data)-choose_num):    
    temp = data[choose[choose_num+i]]
    test2.append(temp)

lensesLabels=['parents','has_nurs','form','children','housing','finance','social','health']
lensesTree = tree.createTree(test1,lensesLabels,'ID3')
#calculate the accuracy
true_num = 0
lensesLabels=['parents','has_nurs','form','children','housing','finance','social','health']
for test_vector in test2:
    classLabel = tree.classify(lensesTree,lensesLabels,test_vector)
    if classLabel == test_vector[-1]:
        true_num += 1
accuracy = float(true_num)/len(test2)

print 'The classify accuracy is: %.2f%%'%(accuracy * 100)
treePlotter.createPlot(lensesTree)

One run gives:
The classify accuracy is: 91.09%
With this many features the plotted tree is too cluttered to read, so I won't include it here. Because the training samples are chosen at random, the accuracy differs slightly from run to run. There is still plenty of room to improve this decision tree; pruning, for example, would keep it from overfitting.
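As a pointer in that direction, below is a rough sketch of reduced-error post-pruning for the nested-dict trees built above. It is my own addition rather than the book's code; it assumes a held-out pruning set in the same row format as the training data and a featLabels list that has not been modified by createTree:

import tree

def treeError(subTree, featLabels, dataSet):
    # count the samples that the (sub)tree misclassifies;
    # a feature value with no matching branch counts as an error
    errors = 0
    for vec in dataSet:
        node = subTree
        while isinstance(node, dict):
            feat = list(node.keys())[0]
            value = vec[featLabels.index(feat)]
            if value in node[feat]:
                node = node[feat][value]
            else:
                node = None
                break
        if node != vec[-1]:
            errors += 1
    return errors

def pruneTree(subTree, featLabels, pruneSet):
    # bottom-up: prune the children first, then collapse this node into a
    # majority-class leaf if that is no worse on the pruning samples routed here
    if not isinstance(subTree, dict) or len(pruneSet) == 0:
        return subTree
    feat = list(subTree.keys())[0]
    featIndex = featLabels.index(feat)
    for value in list(subTree[feat].keys()):
        subset = [vec for vec in pruneSet if vec[featIndex] == value]
        subTree[feat][value] = pruneTree(subTree[feat][value], featLabels, subset)
    majority = tree.majorityCnt([vec[-1] for vec in pruneSet])
    errLeaf = sum(1 for vec in pruneSet if vec[-1] != majority)
    if errLeaf <= treeError(subTree, featLabels, pruneSet):
        return majority
    return subTree

# hypothetical usage, with part of the data held back as pruneSet:
# prunedTree = pruneTree(lensesTree, freshLabels, pruneSet)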
