Decision Tree Algorithms
A decision tree is a basic method for both classification and regression.
This chapter covers the ID3 and C4.5 decision trees; the CART decision tree is introduced later.
Classifying with a decision tree starts at the root node: one feature of the instance is tested, and the test result determines which child node the instance is passed to. If that child is still an internal (splitting) node, the test-and-dispatch step is repeated, until the instance reaches a leaf node and is assigned to that leaf's class.
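This traversal is easy to see in code. Here is a minimal sketch, assuming the tree is stored as nested dicts of the form {feature: {value: subtree or class}}, which is also the representation the implementation at the end of this chapter produces; the toy tree and feature name are invented for illustration:

def classify(tree, featLabels, testVec):
    featName = next(iter(tree))              # feature tested at this node
    featIndex = featLabels.index(featName)
    subtree = tree[featName][testVec[featIndex]]
    if isinstance(subtree, dict):            # still an internal node: descend
        return classify(subtree, featLabels, testVec)
    return subtree                           # leaf node: return its class

toy_tree = {'texture': {'clear': 'good', 'blurry': 'bad'}}
print(classify(toy_tree, ['texture'], ['clear']))  # -> 'good'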
The ID3 Classification Algorithm
Take a watermelon dataset of 17 samples, 8 good melons and 9 bad ones, as the running example.

The overall information entropy is:

$$\mathrm{Ent}(D) = -\sum_k p_k \log_2 p_k = -\frac{8}{17}\log_2\frac{8}{17} - \frac{9}{17}\log_2\frac{9}{17} \approx 0.998$$

Next, compute the entropy of each subset produced by the color attribute (green: 6 samples, 3 good / 3 bad; dark: 6 samples, 4 good / 2 bad; light: 5 samples, 1 good / 4 bad):

$$\mathrm{Ent}(D^{\text{green}}) = -\frac{3}{6}\log_2\frac{3}{6} - \frac{3}{6}\log_2\frac{3}{6} = 1.000$$

$$\mathrm{Ent}(D^{\text{dark}}) = -\frac{4}{6}\log_2\frac{4}{6} - \frac{2}{6}\log_2\frac{2}{6} \approx 0.918$$

$$\mathrm{Ent}(D^{\text{light}}) = -\frac{1}{5}\log_2\frac{1}{5} - \frac{4}{5}\log_2\frac{4}{5} \approx 0.722$$

Finally, the information gain of this attribute:

$$\mathrm{Gain}(D, \text{color}) = 0.998 - \left(\frac{6}{17}\times 1.000 + \frac{6}{17}\times 0.918 + \frac{5}{17}\times 0.722\right) \approx 0.109$$
Information entropy measures uncertainty: the lower the entropy, the lower the uncertainty, i.e. the purer the subset.
Applying the method above to every attribute yields all of the information gains; ID3 then selects, at each split, the attribute with the largest information gain to partition the dataset.
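As a quick numeric check of the figures above, here is a minimal sketch that recomputes them from the class counts alone (a pure set has entropy 0; a 50/50 set has entropy 1):

from math import log2

def ent(counts):
    # entropy of a list of class counts
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c)

ent_D = ent([8, 9])                                            # overall entropy
cond = 6/17 * ent([3, 3]) + 6/17 * ent([4, 2]) + 5/17 * ent([1, 4])
print(round(ent_D, 3))         # 0.998
print(round(ent_D - cond, 3))  # 0.108 at full precision; 0.109 with the rounded intermediates above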
The C4.5 Algorithm
Information gain ratio:

$$\mathrm{Gain\_ratio}(D, a) = \frac{\mathrm{Gain}(D, a)}{\mathrm{IV}(a)}, \qquad \mathrm{IV}(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|} \log_2 \frac{|D^v|}{|D|}$$

where, for the color attribute (green: 6 samples, dark: 6, light: 5):

$$\mathrm{IV}(\text{color}) = -\frac{6}{17}\log_2\frac{6}{17} - \frac{6}{17}\log_2\frac{6}{17} - \frac{5}{17}\log_2\frac{5}{17} \approx 1.580$$

The information gain was computed above, so:

$$\mathrm{Gain\_ratio}(D, \text{color}) = \frac{0.109}{1.580} \approx 0.069$$
In fact, the information gain criterion favors attributes with many distinct values. For example, if the ID column of the table were used as a feature, its information gain would be maximal, since every ID identifies exactly one sample and each resulting subset is perfectly pure; splitting on it is clearly wrong and amounts to overfitting. To reduce the harm this bias can cause, C4.5 selects features by the information gain ratio instead, which counteracts the tendency of plain information gain to prefer many-valued attributes.
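The intrinsic value IV(a) grows with the number of distinct values an attribute takes, and that is exactly what penalizes a many-valued attribute such as the ID column. A quick check, reusing the same entropy helper (an illustrative sketch only):

from math import log2

def ent(counts):
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c)

iv_color = ent([6, 6, 5])          # IV(color): value-count distribution 6/6/5
print(round(iv_color, 3))          # 1.58
print(round(0.109 / iv_color, 3))  # gain ratio of color: 0.069
print(round(ent([1] * 17), 3))     # IV of a unique ID column: log2(17) = 4.087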
The Python implementation is as follows:
import operator
import numpy as np
from math import log

# Load a tab-separated dataset; the last column holds the class label
def loadDataset(filename):
    fr = open(filename).readlines()
    dataset = []
    for line in fr:
        dataset.append(line.strip().split('\t'))
    return np.array(dataset)

# Compute the Shannon entropy of the class labels (the last column)
def calcShannonEnt(dataset):
    labels = dataset[:, -1]
    num_labels = len(labels)
    label_Counts = {}
    for i in range(num_labels):
        if labels[i] not in label_Counts.keys():
            label_Counts[labels[i]] = 0
        label_Counts[labels[i]] += 1
    shannonEnt = 0
    for key in label_Counts.keys():
        prob = float(label_Counts[key]) / num_labels
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# Split the dataset: keep the rows whose feature at FeatIndex equals value,
# then drop that feature column
def splitDataset(dataSet, FeatIndex, value):
    newdataset = []
    for line in dataSet:
        if line[FeatIndex] == value:
            newdataset.append(line)
    return np.delete(newdataset, FeatIndex, axis=1)

# Choose the feature with the largest information gain (ID3 decision tree)
def chooseBestFeature1(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for key in range(numFeatures):
        uniqueVals = set([example[key] for example in dataSet])
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataset(dataSet, key, value)
            subprob = len(subDataSet) / len(dataSet)
            newEntropy += subprob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = key
    return bestFeature

# Choose the feature with the largest information gain ratio (C4.5 decision tree)
def chooseBestFeature2(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestGainRatio = 0.0
    bestFeature = -1
    for key in range(numFeatures):
        # intrinsic value of the feature: entropy of its value distribution
        featValues = [example[key] for example in dataSet]
        featdata = [[value] for value in featValues]
        subEntropy = calcShannonEnt(np.array(featdata))
        if subEntropy == 0:  # feature takes a single value: avoid dividing by zero
            continue
        uniqueVals = set(featValues)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataset(dataSet, key, value)
            subprob = len(subDataSet) / len(dataSet)
            newEntropy += subprob * calcShannonEnt(subDataSet)
        gainRatio = (baseEntropy - newEntropy) / subEntropy
        if gainRatio > bestGainRatio:
            bestGainRatio = gainRatio
            bestFeature = key
    return bestFeature

# Return the class label that occurs most often
def majorityCnt(classList):
    classCount = {}
    for vote in classList:  # count the labels themselves, not their indices
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the decision tree recursively as nested dicts
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # all samples share one class: return it
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the label column is left: fall back to a majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeatIndex = chooseBestFeature2(dataSet)
    bestFeatLabel = labels[bestFeatIndex]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeatIndex])
    uniqueVals = set([example[bestFeatIndex] for example in dataSet])
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataset(dataSet, bestFeatIndex, value), subLabels)
    return myTree

# Test on the lenses dataset
def test():
    lenses = loadDataset('lenses.txt')
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesLabels_copy = lensesLabels[:]  # createTree mutates the label list, so pass a copy
    lensesTree = createTree(lenses, lensesLabels_copy)
    print(lensesTree)

if __name__ == '__main__':
    test()
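The listing builds the tree but never uses it for prediction. As a follow-up, the classify sketch from the top of this chapter can be applied to the generated tree; the feature values below assume the standard lenses.txt that ships with Machine Learning in Action:

# assumes classify() from the sketch near the top of the chapter
labels = ['age', 'prescript', 'astigmatic', 'tearRate']
tree = createTree(loadDataset('lenses.txt'), labels[:])  # pass a copy: createTree mutates it
print(classify(tree, labels, ['young', 'myope', 'no', 'reduced']))
# should print 'no lenses': a reduced tear rate always maps to that class in this dataset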