# Theory reference: blog notes on "Machine Learning in Action" — decision trees
"""
ID3决策树
"""
from math import log
from operator import itemgetter
def calShannonEnt(dataSet):
    """
    Compute the Shannon entropy of the class labels in dataSet.

    :param dataSet: list of rows; the last element of each row is the class label
    :return: entropy in bits, H = -sum(p * log2(p)) over label frequencies
    """
    total = len(dataSet)
    label_counts = {}
    for row in dataSet:
        label = row[-1]
        label_counts[label] = label_counts.get(label, 0) + 1
    # Sum -p*log2(p) over each label's empirical probability.
    return -sum(
        (float(count) / total) * log(float(count) / total, 2)
        for count in label_counts.values()
    )
def spiltDataSet(dataSet, axis, value):
    """
    Select the rows whose feature at column `axis` equals `value`,
    returning them with that column removed.

    :param dataSet: dataset to partition (list of list rows)
    :param axis: index of the feature column to split on
    :param value: feature value to match
    :return: matching rows, each without the `axis` column
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]
def createDataSet():
    """Return a tiny toy dataset (fish classification) and its feature names."""
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    names = ['no sufaceing', 'flippers']
    return samples, names
def chooseBestFeatureToSpilt(dataSet):
    """
    Pick the feature column with the largest information gain.

    :param dataSet: list of rows; the last element of each row is the class label
    :return: index of the best feature column; -1 if no split improves on the
             base entropy (ties keep the first/lowest index)
    """
    feature_count = len(dataSet[0]) - 1  # last column is the label, not a feature
    base_entropy = calShannonEnt(dataSet)
    best_gain = 0.0
    best_feature = -1
    for col in range(feature_count):
        # Conditional entropy after partitioning on this column's distinct values.
        values = {row[col] for row in dataSet}
        cond_entropy = 0.0
        for v in values:
            subset = spiltDataSet(dataSet, col, v)
            weight = len(subset) / float(len(dataSet))
            cond_entropy += weight * calShannonEnt(subset)
        # Information gain = entropy reduction; larger gain means a better split.
        gain = base_entropy - cond_entropy
        if gain > best_gain:
            best_gain = gain
            best_feature = col
    return best_feature
def majorityCut(classList):
    """
    Majority vote over a flat list of class labels.

    :param classList: 1-D list of hashable label values
    :return: the most frequent label (first-seen label wins a tie)
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first maximal entry in insertion order, matching a
    # stable descending sort's first element.
    return max(tally.items(), key=itemgetter(1))[0]
def creatTree(dataSet, input_feature_labels):
    """
    Recursively build an ID3 decision tree.

    :param dataSet: list of rows; the last element of each row is the class label
    :param input_feature_labels: names of the feature columns (left unmodified)
    :return: nested dict {feature_name: {feature_value: subtree_or_label}}, or a
             bare label when the branch is pure or features are exhausted
    """
    feature_label = input_feature_labels[:]  # work on a copy; caller's list is untouched
    classList = [example[-1] for example in dataSet]  # labels of the current subset
    # All samples share one label: this branch is pure, return that label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column remains (all features consumed): majority vote.
    # BUGFIX: vote over the label list, not over the raw rows — the original
    # passed dataSet, whose list rows are unhashable and crashed majorityCut.
    if len(dataSet[0]) == 1:
        return majorityCut(classList)
    bestFeature = chooseBestFeatureToSpilt(dataSet)
    bestFeatureLabel = feature_label[bestFeature]
    myTree = {bestFeatureLabel: {}}
    del feature_label[bestFeature]  # consumed; children split on the remaining features
    featureValues = [example[bestFeature] for example in dataSet]
    for value in set(featureValues):
        subLabels = feature_label[:]  # fresh copy per branch: recursion mutates its copy
        myTree[bestFeatureLabel][value] = creatTree(
            spiltDataSet(dataSet, bestFeature, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    """
    Predict the label of one sample by walking the decision tree.

    :param inputTree: nested dict tree {feature_name: {value: subtree_or_label}}
    :param featLabels: feature names, positionally aligned with testVec
    :param testVec: feature values of the sample to classify
    :return: the predicted label, or -1 if the sample's value has no branch
    """
    root_feature = next(iter(inputTree))
    branches = inputTree[root_feature]
    feature_index = featLabels.index(root_feature)
    prediction = -1  # sentinel: no branch matched the sample's value
    for branch_value, subtree in branches.items():
        if testVec[feature_index] == branch_value:
            if type(subtree) is dict:
                # Internal node: descend into the subtree.
                prediction = classify(subtree, featLabels, testVec)
            else:
                # Leaf node: the stored value is the label.
                prediction = subtree
    return prediction
def storeTree(inputTree, filename):
    """
    Serialize a decision tree to disk with pickle.

    :param inputTree: the decision tree (nested dict) to persist
    :param filename: path of the file to write
    :return: None
    """
    import pickle
    # `with` guarantees the handle is closed even if pickle.dump raises,
    # fixing the leak in the open()/close() version.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """
    Load a pickled decision tree from disk.

    :param filename: path of the file previously written by storeTree
    :return: the unpickled decision tree
    """
    import pickle
    # Original leaked the file handle (never closed); `with` closes it reliably.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
if __name__ == "__main__":
dataSet, feature_labels = createDataSet()
tree = creatTree(dataSet, feature_labels)
storeTree(tree, 'treeStorage.txt')
mytree = grabTree('treeStorage.txt')
ans = classify(mytree, feature_labels, [1, 0])
print(ans)
ans = classify(mytree, feature_labels, [1, 1])
print(ans)