# 机器学习笔记（4）——ID3决策树算法及其Python实现

### 1. 利用信息增益选择最优划分属性

• 信息熵（information entropy）

$Ent(D)=-\sum_{k=1}^{|Y|}p_klog_2p_k$

Ent(D)的值越小，则D的纯度越高。直观理解一下：假设样本集合有2个分类，每类样本的比例为1/2，Ent(D)=1；只有一个分类，Ent(D)=0，显然后者比前者的纯度高。

$Ent(D)=-\sum_{k=1}^{2}p_klog_2p_k=-(\frac{8}{17}log_2\frac{8}{17}+\frac{9}{17}log_2\frac{9}{17})=0.998$

• 信息增益（information gain）

$Gain(D,a)=Ent(D)-\sum_{v=1}^{V}\frac{|D^v|}{|D|}Ent(D^v)$

$Ent(D^1)=-\left ( \frac{3}{6}log_2\frac{3}{6}+\frac{3}{6}log_2\frac{3}{6} \right )=1$

$Ent(D^2)=-\left ( \frac{4}{6}log_2\frac{4}{6}+\frac{2}{6}log_2\frac{2}{6} \right )=0.918$

$Ent(D^3)=-\left ( \frac{1}{5}log_2\frac{1}{5}+\frac{4}{5}log_2\frac{4}{5} \right )=0.722$

$Gain(D,a)=Ent(D)-\sum_{v=1}^{3}\frac{|D^v|}{|D|}Ent(D^v)$

$=0.998-\left ( \frac{6}{17} \times 1 +\frac{6}{17}\times 0.918+ \frac{5}{17}\times0.722\right) =0.109$

from math import log

# 计算信息熵
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in dataSet.

    Each sample is a sequence whose last element is its class label.
    Returns 0.0 for an empty dataset or a single-class dataset.
    """
    total = len(dataSet)
    # Tally how many samples fall into each class.
    counts = {}
    for sample in dataSet:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1
    # Ent(D) = -sum(p_k * log2(p_k)) over the class proportions.
    entropy = 0.0
    for count in counts.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy

# 划分数据集，axis:按第几个属性划分，value:要返回的子集对应的属性值
def splitDataSet(dataSet, axis, value):
    """Return the subset of samples whose attribute `axis` equals `value`,
    with that attribute column removed from each returned sample.

    Fix: removed the pointless `featVec = []` pre-initialization (the for
    loop immediately rebinds the name), and replaced slice+extend with a
    single concatenation.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Drop column `axis` so the attribute is not reused downstream.
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet

# 选择最好的数据集划分方式
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the attribute with the largest information gain.

    Gain(D, a) = Ent(D) - sum(|D^v|/|D| * Ent(D^v)) over the values v of
    attribute a.  Returns -1 when no attribute yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestFeature = 0.0, -1
    for featIndex in range(featureCount):
        # Conditional entropy of the class given this attribute.
        values = {sample[featIndex] for sample in dataSet}
        condEntropy = 0.0
        for val in values:
            subset = splitDataSet(dataSet, featIndex, val)
            weight = len(subset) / float(len(dataSet))
            condEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - condEntropy
        if gain > bestGain:  # keep the attribute with the highest gain
            bestGain, bestFeature = gain, featIndex
    return bestFeature

### 2. 递归构建决策树

import operator  # 此行加在文件顶部

# 通过排序返回出现次数最多的类别
def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Used when all attributes are exhausted but the leaf is still impure.
    Fix: `dict.iteritems()` exists only in Python 2; `dict.items()` behaves
    identically here and also works on Python 3.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort label/count pairs by count, descending; take the top label.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# 递归构建决策树
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: list of samples, each ending with its class label.
    labels:  attribute names aligned with the sample columns.
    Returns either a class label (leaf) or {attrLabel: {value: subtree}}.

    Fix: the original did `del labels[bestFeat]`, destructively mutating
    the caller's label list; we now build the reduced list without
    touching the argument.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: single class remains
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # attributes exhausted: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of best attribute
    bestFeatLabel = labels[bestFeat]
    # Reduced label list for the subtrees; caller's `labels` is untouched.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):  # one branch per observed value
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


# -*- coding: cp936 -*-
# Driver script (Python 2): build an ID3 tree from the watermelon dataset
# and print it as JSON.
import trees
import json

# Tab-separated dataset; each row's last column is the class label.
fr = open(r'C:\Python27\py\DecisionTree\watermalon.txt')

listWm = [inst.strip().split('\t') for inst in fr.readlines()]
# Attribute names: color, root, knock sound, texture, navel, touch.
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
Trees = trees.createTree(listWm, labels)

# Python 2 print statement; the `encoding` kwarg of json.dumps is Py2-only.
print json.dumps(Trees, encoding="cp936", ensure_ascii=False)

{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}

### 3. 使用Matplotlib绘制决策树

# -*- coding: cp936 -*-
import matplotlib.pyplot as plt

# Box style, padding, and fill color for decision (internal) and leaf
# nodes, plus the arrow style used to connect a child to its parent.
decisionNode = dict(boxstyle="square,pad=0.5", fc="0.9")
leafNode = dict(boxstyle="round4, pad=0.5", fc="0.9")
arrow_args = dict(arrowstyle="<-", connectionstyle="arc3", shrinkA=0,
                  shrinkB=16)

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node at centerPt with an arrow from parentPt.

    nodeTxt is a cp936-encoded byte string (Python 2 `unicode` call);
    nodeType is one of the box-style dicts (decisionNode / leafNode).
    Relies on createPlot having set createPlot.ax1 beforehand.
    """
    createPlot.ax1.annotate(unicode(nodeTxt, 'cp936'), xy=parentPt,
                            xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="top", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)

def getNumLeafs(myTree):
    """Count the leaf nodes of a decision tree in nested-dict form.

    Fix: `myTree.keys()[0]` only works on Python 2 where keys() returns a
    list; `next(iter(myTree))` is equivalent and works on both versions.
    """
    numLeafs = 0
    firstStr = next(iter(myTree))  # root attribute label
    secondDict = myTree[firstStr]
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])  # internal node: recurse
        else:
            numLeafs += 1  # leaf
    return numLeafs

def getTreeDepth(myTree):
    """Return the depth (number of decision levels) of a nested-dict tree.

    Fix: `myTree.keys()[0]` only works on Python 2 where keys() returns a
    list; `next(iter(myTree))` is equivalent and works on both versions.
    """
    maxDepth = 0
    firstStr = next(iter(myTree))  # root attribute label
    secondDict = myTree[firstStr]
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1  # leaf terminates this branch
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString (cp936 bytes, a branch value) at the midpoint of
    the edge between parentPt and cntrPt."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, unicode(txtString, 'cp936'))

def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw the subtree `myTree`, attached to parentPt.

    Shared layout state lives in function attributes set by createPlot:
    plotTree.totalW / totalD fix the grid (total leaves / total depth),
    plotTree.xOff tracks the x position of the last plotted leaf, and
    plotTree.yOff tracks the current depth level.
    NOTE(review): Python 2 only — myTree.keys()[0] needs keys() to be a list.
    """
    numLeafs = getNumLeafs(myTree)  # width of this subtree, in leaves
    depth = getTreeDepth(myTree)
    firstStr = myTree.keys()[0]  # attribute label at this subtree's root
    # Center this decision node horizontally over its leaves.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # branch value on the incoming edge
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # descend one level
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            # Internal node: recurse with this node as the parent.
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # Leaf: advance the x cursor one slot and draw it.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # restore on unwind

def createPlot(inTree):
    """Create a figure and draw the whole decision tree `inTree`."""
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide axis ticks
    # Axes shared with plotNode/plotMidText via a function attribute.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Layout state consumed by plotTree (must be set before the call).
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW  # cursor starts half a slot left
    plotTree.yOff = 1.0  # root sits at the top of the axes
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

# -*- coding: cp936 -*-
# Driver script (Python 2): build the tree, print it as JSON, then draw it.
import trees
import treePlotter
import json

# Tab-separated dataset; each row's last column is the class label.
fr = open(r'C:\Python27\py\DecisionTree\watermalon.txt')

listWm = [inst.strip().split('\t') for inst in fr.readlines()]
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
Trees = trees.createTree(listWm, labels)

print json.dumps(Trees, encoding="cp936", ensure_ascii=False)

# Render the tree with matplotlib.
treePlotter.createPlot(Trees)

### 4. 测试算法

# 测试算法
def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking the decision tree.

    inputTree:  nested-dict tree from createTree.
    featLabels: attribute names aligned with testVec's columns.
    Returns the class label, or None when testVec's value at the split
    attribute matches no branch.

    Fix: `inputTree.keys()[0]` only works on Python 2 where keys() returns
    a list; `next(iter(inputTree))` works on both versions.
    """
    firstStr = next(iter(inputTree))  # root attribute label
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # testVec column for that attribute
    classLabel = None
    for key in secondDict:  # find the branch matching the sample's value
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # Internal node: keep descending.
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # Leaf: this is the predicted class.
                classLabel = secondDict[key]
    return classLabel

# Classify one unseen sample (Python 2 script; Trees was built above).
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
testData = ['浅白', '蜷缩', '浊响', '稍糊', '凹陷', '硬滑']
testClass = trees.classify(Trees, labels, testData)
print json.dumps(testClass, encoding="cp936", ensure_ascii=False)

### 5. 存储决策树

# 存储决策树
def storeTree(inputTree, filename):
import pickle
fw = open(filename, 'w')
pickle.dump(inputTree, fw)
fw.close()

# 读取决策树, 文件不存在返回None
def grabTree(filename):
import pickle
if os.path.isfile(filename):
fr = open(filename)
else:
return None
>>> import WaterMalonTree
>>> import trees
>>> fileName = r'C:\Python27\py\DecisionTree\TreeFile.txt'
>>> trees.storeTree(WaterMalonTree.Trees, fileName)
>>> readTrees = trees.grabTree(fileName)
>>> import json
>>> print json.dumps(readTrees, encoding="cp936", ensure_ascii=False)
{"纹理": {"清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "模糊": "否", "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}


### 6. 总结

ID3决策树利用信息增益来选择最优划分属性，它可以处理标称型数据，无法处理连续和缺失值。后续我们会持续学习C4.5算法，它可以处理连续值和缺失值，而且增加了剪枝过程来应对过拟合现象。

Peter Harrington 《机器学习实战》