# 决策树(Decision Trees)

(1)收集数据：可以使用任何方法。

(2)准备数据：树构造算法只适用于标称型数据，因此数值型数据必须离散化。

(3)分析数据：可以使用任何方法，构造树完成后，我们应该检查是否符合预期。

(4)训练算法：构造树的数据结构。

(5)测试算法：使用经验树计算错误率。

(6)使用算法：此步骤可以适用于任何监督学习算法，而使用决策树可以更好地理解数据的内在含义。

创建分支的伪代码函数 createBranch()：

检测数据集中的每个子项是否属于同一分类:
    if so: return 类标签
    else:
        寻找划分数据集的最好特征
        划分数据集
        创建分支节点
        for 每个划分的子集:
            调用函数 createBranch 并增加返回结果到分支节点中
        return 分支节点

# function to calculate the Shannon entropy of dataset
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of dataSet.

    params:
    dataSet: list of records; the class label is assumed to be the
             last element of each record
    """
    numEntries = len(dataSet)
    # tally the occurrences of each class label;
    # dict.get replaces the original's separate membership test
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # H = -sum(p * log2(p)) over all class labels
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# dataset splitting on a given feature
def splitDataSet(dataSet, axis, value):
    """
    Return the records of dataSet whose feature at index `axis` equals
    `value`, with that feature column removed from each record.

    params:
    dataSet: the dataset we will split
    axis: the feature we will split on
    value: the value of the feature to return
    """
    # concatenating the two slices drops the feature column and leaves
    # the original records untouched
    return [record[:axis] + record[axis + 1:]
            for record in dataSet
            if record[axis] == value]

# choosing the best feature to split on
def chooseBestFeatureToSplit(dataSet):
    """
    Return the index of the feature whose split gives the highest
    information gain, or -1 if no split beats zero gain.
    """
    numFeatures = len(dataSet[0]) - 1
    # entropy of the whole dataset before any splitting has occured
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for featIndex in range(numFeatures):
        # unique values this feature takes across the dataset
        uniqueVals = set(example[featIndex] for example in dataSet)
        # expected entropy after splitting on this feature:
        # weighted sum of the entropies of the resulting subsets
        newEntropy = 0.0
        for val in uniqueVals:
            subset = splitDataSet(dataSet, featIndex, val)
            weight = len(subset) / float(len(dataSet))
            newEntropy += weight * calcShannonEnt(subset)
        # keep the feature with the largest information gain
        gain = baseEntropy - newEntropy
        if gain > bestInfoGain:
            bestInfoGain = gain
            bestFeature = featIndex
    return bestFeature


def majorityCnt(classList):
    """
    If our dataset has run out of attributes but the class labels
    are not all the same, we must decide what to call that leaf node.
    In this situation, we will take a majority vote.

    params:
    classList: list of class labels
    returns: the most frequent label in classList
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.items() works on both Python 2 and 3; the original used
    # .iteritems(), which raises AttributeError on Python 3
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# tree building code
def createTree(dataSet, labels):
    """
    Recursively build a decision tree as nested dicts.

    params:
    dataSet: the dataSet we will use to create decision-tree
    labels: the list of labels contains a label for each of
            the features in the dataset
    returns: a class label (leaf) or a dict of the form
             {featureLabel: {featureValue: subtree}}
    """
    classList = [example[-1] for example in dataSet]
    # stop when all classes are equal
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # when no more features, return majority
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # build the label list for the subtrees WITHOUT mutating the
    # caller's list -- the original del()'d labels[bestFeat] in place,
    # corrupting the caller's labels for any subsequent use
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    # get list of unique values of the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels[:])
    return myTree

# classification function for an existing decision tree
def classify(inputTree, featLabels, testVec):
    """
    params:
    inputTree: nested-dict tree produced by createTree
    featLabels: list of feature label strings (maps labels to indices)
    testVec: list of feature values to classify
    returns: the predicted class label, or None if testVec carries a
             feature value the tree never saw during training
    """
    # list() makes the lookup work on Python 3, where dict.keys()
    # returns a view that does not support indexing
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # translate label string to index
    featIndex = featLabels.index(firstStr)
    # default None: the original left classLabel unbound (and raised
    # UnboundLocalError) when no branch matched the test value
    classLabel = None
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # internal node: keep descending
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf node
                classLabel = secondDict[key]
    return classLabel

# methods for persisting the decision tree with pickle
def storeTree(inputTree, filename):
    """Serialize inputTree to `filename` with pickle.

    Binary mode 'wb' (the original used text mode 'w') is required for
    pickle on Python 3 and harmless on Python 2; the with-statement
    guarantees the file is closed even if dump() raises.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Deserialize and return a tree pickled by storeTree.

    Binary mode 'rb' matches the pickle stream, and the with-statement
    closes the file handle the original leaked.
    NOTE(review): pickle.load on untrusted files can execute arbitrary
    code -- only load trees this program itself stored.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)