from math import log
import operator
# Compute the empirical entropy H(D) of a dataset, where the class label
# is stored in the last column of each row.
def calcShannonEnt(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    # Count how many rows fall into each class label.
    for featVec in dataset:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    # H(D) = -sum(p_k * log2(p_k)) over all class labels k.
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
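# A quick sanity check (hedged, hand-computed from the loan dataset below):
# with 9 'yes' and 6 'no' labels out of 15 rows,
#   H(D) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971
# so calcShannonEnt(dataset) should return roughly 0.971.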
# Build the toy loan-applicant dataset: each row is
# [age, has-work, has-house, credit rating, approved?].
def createDataset():
    dataset = [[0, 0, 0, 0, 'no'],
               [0, 0, 0, 1, 'no'],
               [0, 1, 0, 1, 'yes'],
               [0, 1, 1, 0, 'yes'],
               [0, 0, 0, 0, 'no'],
               [1, 0, 0, 0, 'no'],
               [1, 0, 0, 1, 'no'],
               [1, 1, 1, 1, 'yes'],
               [1, 0, 1, 2, 'yes'],
               [1, 0, 1, 2, 'yes'],
               [2, 0, 1, 2, 'yes'],
               [2, 0, 1, 1, 'yes'],
               [2, 1, 0, 1, 'yes'],
               [2, 1, 0, 2, 'yes'],
               [2, 0, 0, 0, 'no']]
    labels = ['age', 'work', 'house', 'credit']
    return dataset, labels
# Return the subset of rows whose feature at index `axis` equals `value`,
# with that feature column removed.
def splitDataset(dataSet, axis, value):
    retDataset = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Copy the row without the axis-th column.
            reducedFeatVec = featVec[0:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataset.append(reducedFeatVec)
    return retDataset
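# Example (hedged): splitting on feature 0 with value 0 keeps only the
# matching rows and drops that column:
#   splitDataset([[0, 1, 'yes'], [1, 1, 'no']], 0, 0)  ->  [[1, 'yes']]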
# Pick the feature with the largest information gain
# g(D, A) = H(D) - H(D|A) and return its column index.
def chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataset = splitDataset(dataset, i, value)
            # Proportion of rows taking this value of feature i.
            prob = len(subDataset) / float(len(dataset))
            # Accumulate the empirical conditional entropy H(D|A).
            newEntropy += prob * calcShannonEnt(subDataset)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
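# For the loan dataset the gains work out to (hedged, hand-computed):
#   g(D, age) ≈ 0.083, g(D, work) ≈ 0.324,
#   g(D, house) ≈ 0.420, g(D, credit) ≈ 0.363
# so chooseBestFeatureToSplit(dataset) should return 2 (the 'house' column).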
# Majority vote: return the class label that occurs most often in classlist.
def majority(classlist):
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
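# Example (hedged): majority(['yes', 'no', 'yes']) -> 'yes'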
def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]
    # Stop if every row carries the same class label.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Only the label column is left, i.e. every feature has been used:
    # fall back to a majority vote over the remaining class labels.
    if len(dataset[0]) == 1:
        return majority(classlist)
    bestfeature = chooseBestFeatureToSplit(dataset)
    # bestfeature is a column index; look up the corresponding label name.
    bestfeaturelabel = labels[bestfeature]
    # The tree is a nested dict: {feature label: {feature value: subtree}}.
    mytree = {bestfeaturelabel: {}}
    # Remove the chosen feature so it is not split on again.
    del(labels[bestfeature])
    # Deduplicate the values this best feature takes.
    featurevalues = [example[bestfeature] for example in dataset]
    uniquevals = set(featurevalues)
    for value in uniquevals:
        # Copy the labels so sibling branches do not interfere.
        sublabels = labels[:]
        # Recurse on the rows matching this value; the recursion returns
        # either a subtree or a leaf label ('yes'/'no').
        mytree[bestfeaturelabel][value] = createtree(splitDataset(dataset, bestfeature, value), sublabels)
    return mytree
dataset, labels = createDataset()
mytree = createtree(dataset, labels)
print(mytree)
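# Expected output (hedged; this is the classic result for this loan dataset):
# {'house': {0: {'work': {0: 'no', 1: 'yes'}}, 1: 'yes'}}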