Continuing from the previous post, [Machine Learning] Decision Tree Algorithm - 1 (Algorithm Introduction), this post turns the merit-student selection table into a worked code example.
1. The overall decision tree module (starting with discrete data)
**Jimei University merit-student ("三好学生") selection table**

| # | Failed a course? | Scholarships won | Comprehensive evaluation | Physical fitness up to standard? | Dormitory assessment | Qualifies? |
| --- | --- | --- | --- | --- | --- | --- |
| 1 | no | 4 | excellent | yes | excellent | yes |
| 2 | no | 1 | good | no | excellent | no |
| 3 | no | 0 | excellent | yes | excellent | yes |
| 4 | no | 1 | excellent | no | excellent | no |
| 5 | no | 2 | good | yes | excellent | yes |
| 6 | no | 1 | excellent | yes | excellent | no |
| 7 | no | 1 | excellent | yes | excellent | yes |
| 8 | yes | 0 | good | yes | excellent | no |
| 9 | no | 2 | good | yes | good | no |
| 10 | no | 2 | excellent | yes | excellent | yes |
| 11 | yes | 2 | excellent | yes | excellent | no |
| 12 | yes | 0 | good | yes | good | no |
| 13 | yes | 0 | excellent | yes | pass | no |
| 14 | no | 4 | excellent | yes | excellent | yes |
| 15 | no | 2 | excellent | yes | excellent | yes |
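As a sanity check before the code, the base entropy of the table can be computed by hand: the "Qualifies?" column has 7 yes and 8 no out of 15 samples, so

$$
H(D) = -\frac{7}{15}\log_2\frac{7}{15} - \frac{8}{15}\log_2\frac{8}{15} \approx 0.997
$$

The `calcShannonEnt` helper used below should return this value for the full dataset.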
```python
import operator


def createDataSet1():
    # One row per student from the table above; the last element of each
    # sample is the class label ("qualifies?").
    dataSet = [['no',  '4', 'excellent', 'yes', 'excellent', 'yes'],
               ['no',  '1', 'good',      'no',  'excellent', 'no'],
               ['no',  '0', 'excellent', 'yes', 'excellent', 'yes'],
               ['no',  '1', 'excellent', 'no',  'excellent', 'no'],
               ['no',  '2', 'good',      'yes', 'excellent', 'yes'],
               ['no',  '1', 'excellent', 'yes', 'excellent', 'no'],
               ['no',  '1', 'excellent', 'yes', 'excellent', 'yes'],
               ['yes', '0', 'good',      'yes', 'excellent', 'no'],
               ['no',  '2', 'good',      'yes', 'good',      'no'],
               ['no',  '2', 'excellent', 'yes', 'excellent', 'yes'],
               ['yes', '2', 'excellent', 'yes', 'excellent', 'no'],
               ['yes', '0', 'good',      'yes', 'good',      'no'],
               ['yes', '0', 'excellent', 'yes', 'pass',      'no'],
               ['no',  '4', 'excellent', 'yes', 'excellent', 'yes'],
               ['no',  '2', 'excellent', 'yes', 'excellent', 'yes']]
    labels = ['Failclass', 'Scholarship-num', 'Grade-ranking',
              'Physically-fit', 'Hostel-assessment']
    return dataSet, labels
```
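The functions below call two helpers, `calcShannonEnt` and `splitDataSet`, that were introduced in the previous post and are not repeated in this listing. So that the code here runs standalone, a minimal sketch of the standard ID3-style definitions follows (my reconstruction; the versions in the earlier post may differ in detail):

```python
from math import log


def calcShannonEnt(dataSet):
    # Shannon entropy of the class-label column (last element of each sample).
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    # Rows whose feature `axis` equals `value`, with that feature removed.
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
```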
```python
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # Stop when every sample in this branch carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when only the label column is left; fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # Copy the remaining labels so sibling branches do not interfere.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def majorityCnt(classList):
    # Majority vote: return the most frequent class label.
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1   # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            # Weighted entropy of the subsets obtained by splitting on feature i.
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        print("conditional entropy: %f" % newEntropy)
        infoGain = baseEntropy - newEntropy
        # Keep the feature with the largest information gain.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
```
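A minimal driver (my own addition, not part of the original listing) to build and print the tree from the table above:

```python
if __name__ == '__main__':
    dataSet, labels = createDataSet1()
    # Pass a copy of labels: createTree deletes entries from the list it receives.
    myTree = createTree(dataSet, labels[:])
    print(myTree)
```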