# 通过长发的长短与粗细进行决策树的构造

##### 将数据集设为样本D，则熵为：

H(D)=-[3/8*log2(3/8)+5/8*log2(5/8)]=0.9544

##### 先以头发长短分计算出熵：

H(D|A)=-4/8*(1/4*log2(1/4)+3/4*log2(3/4))-4/8*(2/4*log2(2/4)+2/4*log2(2/4))=0.9057

I(D,A)=H(D)-H(D|A)=0.0487

##### 先以声音粗细分计算出熵：

H(D|B)=-6/8*(3/6*log2(3/6)+3/6*log2(3/6))-2/8*(2/2*log2(2/2))=0.75

I(D,B)=H(D)-H(D|B)=0.9544-0.75=0.2044

#### 通过python实现ID3算法：

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from asn1crypto.cms import ClassList
from audioop import reverse
from math import log
import operator

def calcShannonEnt(dataSet):
    """Compute the Shannon entropy H(D) of a data set, in bits.

    dataSet: list of samples; each sample is a list whose LAST element
             is the class label.
    Returns: float entropy, -sum(p * log2(p)) over the class labels.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # dict.get replaces the original's membership test + second lookup
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries        # true division in Python 3
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def createDataSet1():
    """Build the toy training set: [hair length, voice pitch, gender].

    Returns (samples, featureNames) where featureNames aligns with the
    first two columns of each sample and the last column is the class.
    """
    samples = [
        ['长', '粗', '男'],
        ['短', '粗', '男'],
        ['短', '粗', '男'],
        ['长', '细', '女'],
        ['短', '细', '女'],
        ['短', '粗', '女'],
        ['长', '粗', '女'],
        ['长', '粗', '女'],
    ]
    featureNames = ['头发', '声音']
    return samples, featureNames

def splitDataSet(dataSet, axis, value):
    """Select the samples whose feature at index `axis` equals `value`,
    returning them with that feature column removed."""
    return [vec[:axis] + vec[axis + 1:]
            for vec in dataSet
            if vec[axis] == value]

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    dataSet: list of samples; last element of each sample is the class label.
    Falls back to feature 0 when no split yields a positive gain — the
    original initialized bestFeature to -1, which made the caller
    del labels[-1] and split on the class-label column in that case.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    totalCount = float(len(dataSet))        # hoisted loop invariant
    bestInfoGain = 0.0
    bestFeature = 0
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        # Conditional entropy H(D|feature i): weighted entropy of the subsets.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalCount
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy     # entropy reduction of this split
        if infoGain > bestInfoGain:             # keep the largest reduction
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    """Majority vote: return the most frequent class label in classList
    (e.g. 2 male vs 1 female -> male)."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # On ties, max() returns the first-encountered label, matching the
    # stable descending sort of the original implementation.
    winner, _ = max(tally.items(), key=operator.itemgetter(1))
    return winner

def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    dataSet: list of samples; last element of each sample is the class label.
    labels:  feature names aligned with the feature columns of dataSet.
    Returns: a class label (leaf) or a nested dict of the form
             {featureName: {featureValue: subtree, ...}}.
    """
    classList = [example[-1] for example in dataSet]
    # Leaf: every remaining sample belongs to one class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Leaf: no features left to split on -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)     # best feature by info gain
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}                     # tree stored as nested dict
    # Drop the chosen feature from a COPY of the label list; the original
    # del(labels[bestFeat]) destructively mutated the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

if __name__ == "__main__":
    # Train on the toy data set and show the learned decision tree.
    trainingData, featureLabels = createDataSet1()
    model = createTree(trainingData, featureLabels)
    print(model)


• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120