from math import log
import operator
class ClassTree:
    """ID3 decision-tree classifier for list-of-lists datasets.

    Each row of a dataset is a list whose last element is the class
    label and whose preceding elements are discrete feature values.
    """

    def __init__(self):
        pass

    def calcShanonEnt(self, dataSet):
        """Return the Shannon entropy (base 2) of the class labels.

        dataSet: list of rows; the label is each row's last element.
        """
        lenDataSet = len(dataSet)
        labelCounts = {}
        for featVec in dataSet:
            currentLabel = featVec[-1]
            labelCounts.setdefault(currentLabel, 0)
            labelCounts[currentLabel] += 1
        shannonEnt = 0.0
        for key in labelCounts:
            prob = labelCounts[key] / float(lenDataSet)
            shannonEnt -= prob * log(prob, 2)
        return shannonEnt

    def giniImpurity(self, rows):
        """Return the Gini impurity of the class labels in `rows`.

        Computed as sum over pairs of distinct labels of p1*p2, which
        equals 1 - sum(p_k^2).
        """
        total = len(rows)
        counts = {}
        # BUG FIX: the original iterated an undefined name `l` and used
        # whole rows (unhashable lists) as dict keys; count the class
        # label in each row's last column instead.
        for row in rows:
            label = row[-1]
            counts.setdefault(label, 0)
            counts[label] += 1
        imp = 0.0
        for k1 in counts:
            p1 = float(counts[k1]) / total
            for k2 in counts:
                if k1 == k2:
                    continue
                p2 = float(counts[k2]) / total
                imp += p1 * p2
        return imp

    def splitDataSet(self, dataSet, axis, value):
        """Return the rows whose feature `axis` equals `value`,
        with that feature column removed from each returned row."""
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] == value:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
        return retDataSet

    def chooseBestFeatureToSplit(self, dataSet):
        """Return the index of the feature with the highest information
        gain, or -1 if no split improves on the base entropy."""
        numFeatures = len(dataSet[0]) - 1
        lenDataSet = len(dataSet)
        baseEntropy = self.calcShanonEnt(dataSet)
        bestInfoGain = 0.0
        bestFeature = -1
        for i in range(numFeatures):
            featList = [example[i] for example in dataSet]
            uniqueVals = set(featList)
            newEntropy = 0.0
            # Weighted entropy of the partitions induced by feature i.
            for value in uniqueVals:
                subDataSet = self.splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(lenDataSet)
                newEntropy += prob * self.calcShanonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature

    def majorityCnt(self, classList):
        """Return the most frequent label in `classList` (majority vote)."""
        classCount = {}
        for vote in classList:
            classCount.setdefault(vote, 0)
            classCount[vote] += 1
        # BUG FIX: dict.iteritems() is Python 2 only; use items().
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]

    def createTree(self, dataSet, labels):
        """Recursively build an ID3 decision tree.

        dataSet: rows of feature values + trailing class label.
        labels:  human-readable feature names, parallel to the feature
                 columns (not mutated; the original code deleted from
                 the caller's list).
        Returns a nested dict {featureName: {featureValue: subtree}},
        or a bare class label at a leaf.
        """
        classList = [example[-1] for example in dataSet]
        # All rows share one label: pure leaf.
        if classList.count(classList[0]) == len(classList):
            return classList[0]
        # No features left (only the label column): majority vote.
        if len(dataSet[0]) == 1:
            # BUG FIX: original called majorityCnt without self.
            return self.majorityCnt(classList)
        bestFeat = self.chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}
        # Work on a copy so the caller's labels list is left intact.
        labels = labels[:]
        del labels[bestFeat]
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            subLabels = labels[:]
            myTree[bestFeatLabel][value] = self.createTree(
                self.splitDataSet(dataSet, bestFeat, value), subLabels)
        return myTree

    def createDataSet(self):
        """Return the canonical toy (fish classification) dataset and
        its feature names."""
        dataSet = [[1, 1, 'yes'],
                   [1, 1, 'yes'],
                   [1, 0, 'no'],
                   [0, 1, 'no'],
                   [0, 1, 'no']]
        labels = ['no surfacing', 'flippers']
        return dataSet, labels
if __name__ == '__main__':
    # Demo: build the tree for the toy dataset and show it.
    tree = ClassTree()
    myDat, labels = tree.createDataSet()
    # BUG FIX: Python 2 print statement -> Python 3 print() call.
    print(tree.createTree(myDat, labels))
# 决策树的python实现 — "Decision tree implemented in Python" (original blog title)
# Blog metadata (latest recommended article: 2024-09-29 08:42:56), kept as a
# comment so the file remains valid Python.