# 机器学习实战 决策树 python3实现 R语言实现（1）

1.python3实现

 Python实现：ID3算法1.决策树构造：trees.pyfrom math import logimport operator#简单的鉴定函数def createDataSet():    dataSet = [[1, 1, 'yes'],               [1, 1, 'yes'],               [1, 0, 'no'],               [0, 1, 'no'],               [0, 1, 'no']]    labels = ['no surfacing','flippers']    return dataSet, labels#计算给定的数据集的香农熵def calcShannonEnt(dataSet):    numEntries = len(dataSet)    labelCounts = {}    for featVec in dataSet:        currentLabel = featVec[-1]        if currentLabel not in labelCounts.keys():            labelCounts[currentLabel] = 0        labelCounts[currentLabel] += 1    shannonEnt = 0.0    for key in labelCounts:        prob = float(labelCounts[key])/numEntries        shannonEnt -= prob * log(prob,2)    return shannonEnt #按照给定特征划分数据集def splitDataSet(dataSet, axis, value):    retDataSet = []    for featVec in dataSet:        if featVec[axis] == value:            reducedFeatVec = featVec[:axis]            reducedFeatVec.extend(featVec[axis+1:])            retDataSet.append(reducedFeatVec)    return retDataSet #选择最好的数据划分方式def chooseBestFeatureToSplit(dataSet):    numFeatures = len(dataSet[0]) - 1    baseEntropy = calcShannonEnt(dataSet)    bestInfoGain = 0.0; bestFeature = -1    for i in range(numFeatures):        featList = [example[i] for example in dataSet]        uniqueVals = set(featList)        newEntropy = 0.0        for value in uniqueVals:            subDataSet = splitDataSet(dataSet, i, value)            prob = len(subDataSet)/float(len(dataSet))            newEntropy += prob * calcShannonEnt(subDataSet)             infoGain = baseEntropy - newEntropy        if (infoGain >= bestInfoGain):            bestInfoGain = infoGain            bestFeature = i    return bestFeature#字典对象存储了classList中每个类标签出现的频率，最后利用operator操作键值排序，返回出现次数最多的分类名称。def majorityCnt(classList):    classCount={}    for vote in classList:        if vote not in classCount.keys(): classCount[vote] = 0        classCount[vote] += 1    sortedClassCount = sorted(classCount.iteritems(), 
key=operator.itemgetter(1), reverse=True)    return sortedClassCount[0][0]#创建树的函数代码def createTree(dataSet,labels):    classList = [example[-1] for example in dataSet]    if classList.count(classList[0]) == len(classList):         return classList[0]#类别完全相同时停止继续划分    if len(dataSet[0]) == 1:        return majorityCnt(classList)#遍历完所有特征值时返回出现次数最多的类别    bestFeat = chooseBestFeatureToSplit(dataSet)    bestFeatLabel = labels[bestFeat]    myTree = {bestFeatLabel:{}}    subLabels = labels[:]    del(subLabels[bestFeat])    featValues = [example[bestFeat] for example in dataSet]    uniqueVals = set(featValues)#得到列表包含的所有属性    for value in uniqueVals:        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)    return myTree                                def classify(inputTree,featLabels,testVec):    firstStr = list(inputTree.keys())[0]    secondDict = inputTree[firstStr]    featIndex = featLabels.index(firstStr)    key = testVec[featIndex]    valueOfFeat = secondDict[key]    if isinstance(valueOfFeat, dict):         classLabel = classify(valueOfFeat, featLabels, testVec)    else: classLabel = valueOfFeat    return classLabel#使用pickle模块存储决策树def storeTree(inputTree,filename):    import pickle    fw = open(filename,'wb')    pickle.dump(inputTree,fw)    fw.close()    def grabTree(filename):    import pickle    fr = open(filename,"rb")    return pickle.load(fr)     2.绘制树型图：treePlotter.pyimport matplotlib.pyplot as plt#使用文本注解绘制树节点#定义文本框和箭头格式decisionNode = dict(boxstyle="sawtooth", fc="0.8")leafNode = dict(boxstyle="round4", fc="0.8")arrow_args = dict(arrowstyle="<-")#获取叶节点的数目和树的层数def getNumLeafs(myTree):    numLeafs = 0    firstStr = list(myTree.keys())[0]#这里书中有错误。用list转变为列表后才能用【】提取键值    secondDict = myTree[firstStr]    for key in secondDict.keys():        if type(secondDict[key]).__name__=='dict':            numLeafs += getNumLeafs(secondDict[key])        else:            numLeafs +=1    return numLeafsdef getTreeDepth(myTree):    
maxDepth = 0    firstStr =list(myTree.keys())[0]    secondDict = myTree[firstStr]    for key in secondDict.keys():        if type(secondDict[key]).__name__=='dict':            thisDepth = 1 + getTreeDepth(secondDict[key])        else:            thisDepth = 1        if thisDepth > maxDepth:            maxDepth = thisDepth    return maxDepth#绘制带箭头的注解def plotNode(nodeTxt, centerPt, parentPt, nodeType):    createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',             xytext=centerPt, textcoords='axes fraction',             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )#在父子节点间填充文本信息    def plotMidText(cntrPt, parentPt, txtString):    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)#计算宽与高def plotTree(myTree, parentPt, nodeTxt):    numLeafs = getNumLeafs(myTree)    depth = getTreeDepth(myTree)    firstStr = list(myTree.keys())[0]    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)    plotMidText(cntrPt, parentPt, nodeTxt)  #标记子节点的属性    plotNode(firstStr, cntrPt, parentPt, decisionNode)    secondDict = myTree[firstStr]    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD  #减少y偏移    for key in secondDict.keys():        if type(secondDict[key]).__name__=='dict':            plotTree(secondDict[key],cntrPt,str(key))        else:            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD#主函数def createPlot(inTree):    fig = plt.figure(1, facecolor= 'white')    fig.clf()    axprops = dict(xticks=[], yticks=[])    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    plotTree.totalW = float(getNumLeafs(inTree))    plotTree.totalD = 
float(getTreeDepth(inTree))    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0    plotTree(inTree, (0.5,1.0), '')    plt.show()def retrieveTree(i):    listOfTrees =[{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},                  {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}                  ]    return listOfTrees[i]

3.测试：使用决策树执行分类。预测隐形眼镜类型

1. 收集数据；《机器学习实战》提供的文本文件

2. 准备数据：解析tab键分隔的数据行

3. 分析数据：快速检查数据，确保正确地解析数据内容，使用createPlot()函数绘制最终的树形图。

4. 训练算法：使用createTree()函数

5. 测试算法：编写测试函数验证决策树可以正确分类给定的数据实例。

6. 使用算法：存储树的数据结构，以便下次使用时无需重新构造。

2.R语言实现

 方法1：library(party)  #用于实现决策树算法library（sampling）  #用于实现数据分层随机抽样，构造训练集和测试集data(iris)str(iris)'data.frame':150 obs. of  5 variables: $Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...$ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... $Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...$ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... $Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... dim(iris)[1] 150 5 sub_train = strata(iris,stratanames = "Species",size = rep(35,3),method = "srswor") #strata:分层抽样在包sampling中，Strata(data,stratanames = NULL,size,method=c(“srswor”,””srswr”,”poisson”,”systematic”),pik,description = FALSE)Data:待抽样数据Stratanames：进行分层所依变量名Size：各层中要抽出的观测样本数Method：选择四种抽样方法，分别为无放回，有放回，泊松，系统抽样，默认srsworPik：设置各层中样本的抽样概率Description：选择是否输出含有各层基本信息的结果。data_train = iris[sub_train$ID_unit,]data_train = iris[-sub_train$ID_unit,] iris_tree = ctree(Species~.,data = data_train) #ctree：条件推理树是一种比较常用的基于树的分类算法，与传统决策树（rpart）不同之处在于条件推理树是选择分类变量时依据的是显著性测量的结果，而不是采用信息最大化法。Rpart采用的是基尼系数。 print(iris_tree) Conditional inference tree with 3 terminal nodes Response: Species Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width Number of observations: 45 1) Petal.Length <= 1.7; criterion = 1, statistic = 40.933 2)* weights = 15 1) Petal.Length > 1.7 3) Petal.Width <= 1.6; criterion = 1, statistic = 19.182 4)* weights = 15 3) Petal.Width > 1.6 5)* weights = 15 plot(iris_tree)  plot(iriis_tree,type = "simple")  方法2：用RWeka实现C4.5算法该过程需要安装java。(或者不用？把变量定义为因子,还没试过。)library(RWeka)library(grid)library(mvtnorm)library(modeltools)library(stats4)library(strucchange)library(zoo)library(partykit)library(rJava)data(iris)str(iris) m1 <- J48(Species~.,data=iris)m1J48 pruned tree------------------ Petal.Width <= 0.6: setosa (50.0)Petal.Width > 0.6| Petal.Width <= 1.7| | Petal.Length <= 4.9: versicolor (48.0/1.0)| | Petal.Length > 4.9| | | Petal.Width <= 1.5: virginica (3.0)| | | Petal.Width > 1.5: versicolor (3.0/1.0)| 
Petal.Width > 1.7: virginica (46.0/1.0) Number of Leaves : 5 Size of the tree : 9 table(iris$Species,predict(m1))       setosa versicolor virginica  setosa         50          0         0  versicolor      0         49         1  virginica       0          2        48plot(m1)