0417学习笔记：3.1决策树的构造-划分数据集

def splitDataSet(dataSet, axis, value): #创建新的list对象（为了不修改原始数据集）
retDataSet = []
for featVec in dataSet:
if featVec[axis] == value: #抽取符合特征的数据
reducedFeatVec = featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet

append和extend函数：

a=[1,2,3]
b=[4,5,6]
a.append(b)
a
》》》[1, 2, 3, [4, 5, 6]] #第四个元素也是个列表

a=[1,2,3]
a.extend(b)
a
》》》[1, 2, 3, 4, 5, 6]

P38

def chooseBestFeatureToSplit(dataSet): #选取特征，划分数据集，计算得最好的划分数据集的特征
numFeatures = len(dataSet[0]) - 1
baseEntropy = calcShannonEnt(dataSet) #计算整个数据集的原始熵，用于比较
bestInfoGain = 0.0; bestFeature = -1
for i in range(numFeatures):
featList = [example[i] for example in dataSet] #创建新列表
uniqueVals = set(featList) #变为集合类型（去重）
newEntropy = 0.0
for value in uniqueVals: #计算各种划分方式的信息熵
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain): #计算最好的信息增益
bestInfoGain = infoGain
bestFeature = I
return bestFeature

P39

after度量信息熵，有效划分数据集，

then将这些函数功能放一起，构建决策树

3.1.3递归构建决策树

import operator
def majorityCnt(classList):
classCount={} #用来存储classList中每个类标签出现的频率
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(),key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0] #返回出现次数最多的分类名称

def createTree(dataSet,labels):
classList = [example[-1] for example in dataSet]
if classList.count(classList[0]) == len(classList):
return classList[0] #所有类标签完全相同则停止划分，返回该类标签
if len(dataSet[0]) == 1:
return majorityCnt(classList) #遍历完所有特征时返回出现次数最多的
bestFeat = chooseBestFeatureToSplit(dataSet) #存当前数据集选取的最好特征
bestFeatLabel = labels[bestFeat]
myTree = {bestFeatLabel:{}} #得到列表的所有属性值
del(labels[bestFeat])
featValues = [example[bestFeat] for example in dataSet]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:] #复制类标签
myTree[bestFeatLabel][value] = createTree(splitDataSet\
(dataSet, bestFeat, value),subLabels)
return myTree

3.2绘制树形图

31.Matplotlib注解工具：annotations

使用文本注解绘制树节点

# Author：J.Kong
import matplotlib.pyplot as plt
decisionNode = dict(boxstyle="sawtooth", fc="0.8") #定义文本框和箭头格式
leafNode = dict(boxstyle="round4", fc="0.8") #
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType): #执行实际的绘图功能，绘制带箭头的注释
createPlot.ax1.annotate(nodeTxt, xy=parentPt,xycoords='axes fraction',
xytext=centerPt, textcoords='axes fraction',
va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def createPlot(): #创建新图形并清空绘图区，并画两个代表不同类型的树节点
fig = plt.figure(1, facecolor='white')
fig.clf()
createPlot.ax1 = plt.subplot(111, frameon=False)
plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode) #决策节点
plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode) #叶节点
plt.show()

import sys
sys.path.append("F:\k18\swhite")
import treePlotter
Backend Qt4Agg is interactive backend. Turning interactive mode on.
treePlotter.createPlot()

3.2.2构造注解树