决策树
构造决策树需要解决的第一个问题就是:当前数据集上哪个特征在划分数据分类时起决定性作用。为了找到决定性的特征,划分出最好的结果,需要对每个特征进行评估,常用的评估方式是信息增益(基于香农熵计算)
,还有基尼系数。原始数据集会被划分为几个数据子集。这些数据子集会分布在第一个决策点的所有分支上。如果某个分支下的数据属于同一类型,则当前无需对数据进一步划分。如果数据子集不属于同一类型,需要重复划分数据子集。对子集的划分和原始数据划分相同(递归)
创建分支的伪代码如下
if so return 类标签;
else
寻找划分数据集的最好特征
划分数据集
创建分支结点
for 每个划分子集
调用本函数进行递归
return 分支结点
其中寻找最好特征时,熵的计算公式为 $H = -\sum_{i=1}^{n} p(x_i)\log_2 p(x_i)$,其中 $H$ 为熵,$p(x_i)$ 为第 $i$ 类的概率
from math import log
import operator
# 计算数据集信息增益
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of a data set.

    Each record in `dataSet` is a list whose last element is the class
    label; H = -sum(p * log2(p)) over every class probability p.
    """
    total = len(dataSet)
    # Tally how many records carry each class label (last column).
    labelCounts = {}
    for record in dataSet:
        label = record[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    # Accumulate -p*log2(p) for each observed class.
    entropy = 0.0
    for count in labelCounts.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy
# 创建数据集, 标签
def createDataSet():
    """Return the toy fish data set and its two feature names.

    Each sample is [no-surfacing, flippers, class-label].
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    featureNames = ['no surfacing', 'flippers']
    return samples, featureNames
# 数据集属性分离
# dataSet: 原始数据集
# axis: 对应属性索引
# value: 对应特征值
# return retDataSet:分离属性后数据集
def splitDataSet(dataSet, axis, value):
    """Select records whose feature at index `axis` equals `value`.

    dataSet: original data set (list of feature lists)
    axis:    index of the feature to filter on
    value:   required feature value
    return:  new list of records with that feature column removed
    """
    # Concatenating the two slices drops column `axis` from each match.
    return [vec[:axis] + vec[axis + 1:] for vec in dataSet if vec[axis] == value]
# 数据集分离最优特征属性, 通过信息增益进行比较
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split maximizes information gain.

    Returns -1 when no feature yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestIndex = 0.0, -1
    for idx in range(featureCount):
        distinctValues = {row[idx] for row in dataSet}
        # Weighted entropy of the partition induced by feature `idx`.
        splitEntropy = 0.0
        for val in distinctValues:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, idx
    return bestIndex
# 数据集中标签投票,适用于特征值单一,标签不单一
# Majority vote over class labels; used when features are exhausted but
# the remaining labels are still mixed.
def majorityCnt(classList):
    """Return the most frequent label in `classList`.

    classList: non-empty list of class labels.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUG FIX: sorted() takes `reverse=`, not `reversed=` — the original
    # keyword raised TypeError on every call.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# 创建决策树
# Recursively build a decision tree as nested dicts:
# {featureName: {featureValue: subtree-or-label, ...}}
def createTree(dataSet, labels):
    """Build and return a decision tree for `dataSet`.

    dataSet: list of records, class label in the last column
    labels:  feature names, parallel to the feature columns
    return:  nested dict tree, or a bare class label for a pure node

    BUG FIX: the original `del labels[bestFeat]` mutated the caller's
    label list; we now build a copy instead, leaving `labels` intact.
    """
    classList = [example[-1] for example in dataSet]
    # Stop when every record carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # All features consumed: fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Remaining labels for the recursive calls (caller's list untouched).
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = {example[bestFeat] for example in dataSet}
    for value in featValues:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
# 对输入向量进行分类,主要通过遍历
# Classify a feature vector by walking the nested-dict tree.
def classify(inputTree, featLabels, testVec):
    """Return the class label the tree assigns to `testVec`.

    inputTree:  nested dict tree as built by createTree
    featLabels: feature names, parallel to testVec's columns
    testVec:    feature values for the sample to classify
    raises:     KeyError if the sample's feature value never appeared
                during training (the original code hit an
                UnboundLocalError in that case).
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    # BUG FIX: an unmatched value used to leave classLabel unbound.
    if featValue not in secondDict:
        raise KeyError(
            "value %r for feature %r not present in tree" % (featValue, firstStr))
    subtree = secondDict[featValue]
    # A dict child is an internal node; anything else is a leaf label.
    if isinstance(subtree, dict):
        return classify(subtree, featLabels, testVec)
    return subtree
# 决策树对象序列化存储
# Serialize a decision tree to disk with pickle.
def storeTree(inputTree, filename):
    """Pickle `inputTree` into the file at `filename`.

    Uses `with` so the handle is closed even if pickle.dump raises
    (the original closed it only on the success path).
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
# 决策树对象序列化加载
# Load a pickled decision tree from disk.
def grabTree(filename):
    """Unpickle and return the decision tree stored at `filename`.

    BUG FIX: the original never closed the file handle; `with`
    guarantees it is closed even if pickle.load raises.
    NOTE(review): pickle.load is unsafe on untrusted files — only load
    trees this program wrote itself.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# 获取叶子节点数目
def getNumLeafs(myTree):
    """Count the leaf nodes (class labels) in a nested-dict tree."""
    root = list(myTree.keys())[0]
    leaves = 0
    for child in myTree[root].values():
        # A dict child is a subtree; anything else is one leaf.
        if type(child) is dict:
            leaves += getNumLeafs(child)
        else:
            leaves += 1
    return leaves
# 获取树的深度
def getTreeDepth(myTree):
    """Return the depth of a nested-dict tree (decision levels on the longest path)."""
    root = list(myTree.keys())[0]
    deepest = 0
    for child in myTree[root].values():
        # Each edge contributes one level; subtrees recurse further.
        depth = 1 + getTreeDepth(child) if type(child) is dict else 1
        deepest = max(deepest, depth)
    return deepest
决策树树形图:
决策树特点
优点:计算复杂度不高,输出结果易于理解,对中间值的缺失不敏感,可以处理不相关的特征数据
缺点:可能会出现过度匹配的情况
适用数据:数值型和标称型