note:
python3中keys()返回的不是list类型,需要用list()转换
利用.__name__可以判断它的类型,直接用type判断没结果
a = [example[0] for example in list] 可以把双重列表里的某位值取出来获得新的列表
决策树需要读取数据,需要计算香农熵模块,需要划分数据计算香农熵,最后建树
from math import log
import operator
def get_data():
    """Return the hard-coded toy dataset used by the decision-tree demo.

    Returns:
        tuple: ``(label, data)``. ``label`` is the list of the two feature
        names; ``data`` is a list of rows, each holding the two feature
        values followed by the class label (``'yes'`` or ``'no'``).
        Fresh list objects are built on every call.
    """
    feature_names = ['sb', 'handsonboy']
    samples = [
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return feature_names, samples
def caclulate_shannone(data):
    """Compute the base-2 Shannon entropy of the class labels in ``data``.

    Args:
        data: sequence of rows; the last element of each row is taken as
            that row's class label.

    Returns:
        float: the entropy of the label distribution. ``0.0`` when ``data``
        is empty or every row carries the same label.
    """
    # Tally how many rows carry each class label.
    tally = {}
    for row in data:
        cls = row[-1]
        tally[cls] = tally.get(cls, 0) + 1

    total = len(data)
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy
def split_data(data, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, dropping that column.

    Args:
        data: list of rows (each row a list).
        axis: index of the feature column to test and remove.
        value: feature value a row must have at ``axis`` to be kept.

    Returns:
        list: new rows, one per matching input row, containing every element
        of the original row except the one at ``axis``. The input rows are
        not modified.
    """
    subset = []
    for row in data:
        if row[axis] != value:
            continue
        # Copy everything before the column, then append everything after it.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset
def choosefeatrue(label, data, shannonent):
    """Pick the feature column whose split gives the largest information gain.

    Args:
        label: feature names; accepted by the signature but not used here.
        data: list of rows, each holding feature values followed by the
            class label.
        shannonent: baseline entropy from which each split's weighted
            entropy is subtracted to obtain the gain.

    Returns:
        int: index of the best feature column, or ``-1`` when no column has
        a strictly positive gain.
    """
    total_rows = float(len(data))
    winner = -1
    top_gain = 0.0
    for column in range(len(data[0]) - 1):
        # Distinct values taken by this feature (a set, not a dict).
        distinct = {row[column] for row in data}
        # Entropy after the split: sum of each partition's entropy weighted
        # by the fraction of rows that fall into it.
        conditional = 0.0
        for val in distinct:
            part = split_data(data, column, val)
            conditional += (len(part) / total_rows) * caclulate_shannone(part)
        gain = shannonent - conditional
        if gain > top_gain:
            top_gain, winner = gain, column
    return winner
def get_mytree(data, label, shannoneent):
    """Recursively build a decision tree as nested dicts.

    Args:
        data: non-empty list of rows, each holding feature values followed
            by the class label.
        label: feature names aligned with the feature columns of ``data``.
            The list is left unmodified.
        shannoneent: Shannon entropy of ``data``, used as the baseline when
            choosing the feature to split on.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label}}``.
    """
    classlist = [example[-1] for example in data]
    # Every row has the same class: nothing left to split.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Only the class column remains: fall back to a majority vote.
    if len(data[0]) == 1:
        return major(classlist)

    bestfeat = choosefeatrue(label, data, shannoneent)
    # -1 means no feature improves on the baseline. Using it as an index
    # would split on the class column itself, so settle for the majority.
    if bestfeat < 0:
        return major(classlist)

    bestfeatlabel = label[bestfeat]
    # Build the reduced label list as a copy instead of deleting from the
    # caller's list, so the caller can keep using it (e.g. for classify).
    sublabel = label[:bestfeat] + label[bestfeat + 1:]

    branches = {}
    for value in set(example[bestfeat] for example in data):
        subset = split_data(data, bestfeat, value)
        # The baseline entropy must describe the subset being split, not
        # the root dataset, so recompute it for every recursive call.
        branches[value] = get_mytree(subset, sublabel, caclulate_shannone(subset))
    return {bestfeatlabel: branches}
def major(classlist):
    """Return the most frequent class label in ``classlist``.

    Args:
        classlist: non-empty sequence of hashable class labels.

    Returns:
        The label with the highest count. With insertion-ordered dicts
        (Python 3.7+) a tie goes to the label that appears first, because
        the sort is stable.

    Raises:
        IndexError: if ``classlist`` is empty.
    """
    classcount = {}
    for vote in classlist:
        classcount[vote] = classcount.get(vote, 0) + 1
    # dict.iteritems() exists only on Python 2; items() works on Python 3.
    ranked = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def classify(inputtree, featlabels, testvec):
    """Classify ``testvec`` by walking a nested-dict decision tree.

    Args:
        inputtree: tree of the form
            ``{feature_name: {feature_value: subtree_or_label}}``.
        featlabels: feature names; the position of a name gives the index
            of that feature's value in ``testvec``.
        testvec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that the sample reaches.

    Raises:
        ValueError: if the tree's feature name is missing from
            ``featlabels`` (raised by ``list.index``), or if the sample's
            value for that feature has no branch in the tree.
    """
    # keys() is a view on Python 3, so materialise it to take the root key.
    firstStr = list(inputtree.keys())[0]
    secondDict = inputtree[firstStr]
    featindex = featlabels.index(firstStr)
    observed = testvec[featindex]

    for key, subtree in secondDict.items():
        if observed == key:
            # A dict is an inner node to descend into; anything else is a leaf.
            if isinstance(subtree, dict):
                return classify(subtree, featlabels, testvec)
            return subtree

    # No branch matched: report it explicitly instead of failing on an
    # unbound local variable.
    raise ValueError(
        "no branch for value %r of feature %r" % (observed, firstStr)
    )
if __name__ == "__main__":
    # Demo: build a tree from the toy dataset and classify one sample.
    labels, samples = get_data()
    root_entropy = caclulate_shannone(samples)
    tree = get_mytree(samples, labels, root_entropy)
    # Fetch a fresh label list for classification, since tree construction
    # may have consumed entries of the list passed to it.
    labels, _ = get_data()
    print(classify(tree, labels, [1, 1]))