import json
import operator
from collections import Counter
from math import log
# Dataset creation
def createDataSet(path=r'D:\单片机实验&JDK文档\lenses.txt'):
    """Load a tab-separated dataset file.

    Args:
        path: path to the data file. Defaults to the original hard-coded
            location (change to match your own file layout); made a raw
            string so the backslashes are taken literally, and a parameter
            so any file can be loaded.

    Returns:
        (dataSet, labels): dataSet is a list of rows (each a list of str,
        one per tab-separated field); labels are the feature column names.
    """
    # `with` guarantees the handle is closed (the original leaked it),
    # and the stray non-code token on the return line is removed.
    with open(path) as fr:
        dataSet = [rl.strip().split('\t') for rl in fr.readlines()]
    labels = ['1', '2', '3', '4', '5']
    return dataSet, labels
# Empirical (Shannon) entropy
def calShannonEnt(dataset):
    """Compute the Shannon entropy of the class labels in *dataset*.

    Args:
        dataset: list of rows; the last element of each row is the class
            label.

    Returns:
        Entropy in bits (float); 0.0 for an empty dataset.
    """
    m = len(dataset)
    # Counter replaces the original manual count-into-a-dict loop
    # (which also misspelled its local as "lableCount").
    labelCount = Counter(row[-1] for row in dataset)
    entropy = 0.0
    for count in labelCount.values():
        p = count / m
        entropy -= p * log(p, 2)
    return entropy
def splitdataset(dataset, axis, value):
    """Return the rows whose *axis*-th feature equals *value*, with that
    feature column removed from each returned row.
    """
    # Concatenating the two slices drops column `axis`, exactly as the
    # original slice + extend did.
    return [row[:axis] + row[axis + 1:] for row in dataset if row[axis] == value]
def chooseBestFeatureToSpit(dataSet):
    """Choose the feature with the largest information gain (ID3 criterion).

    Args:
        dataSet: list of rows; all but the last column are features, the
            last column is the class label.

    Returns:
        Index of the best feature, or -1 when no split yields positive
        gain. (Bug fix: the original never initialized ``best_feature``,
        so that case raised NameError.)
    """
    feature_num = len(dataSet[0]) - 1
    origin_ent = calShannonEnt(dataSet)
    best_infogain = 0.0
    best_feature = -1  # fix: was unbound when every feature's gain was 0
    for i in range(feature_num):
        # Distinct values taken by feature i.
        fi_all = {data[i] for data in dataSet}
        # Weighted entropy of the partition induced by feature i.
        subset_Ent = 0.0
        for value in fi_all:
            subset = splitdataset(dataSet, i, value)
            p = len(subset) / len(dataSet)
            subset_Ent += p * calShannonEnt(subset)
        infoGain = origin_ent - subset_Ent
        if infoGain > best_infogain:
            best_feature = i
            best_infogain = infoGain
    return best_feature
# Count classes and return the most frequent one
def majorityCnt(classList):
    """Return the most common class label in *classList*.

    Bug fix: the original called ``dict.iteritems()``, which does not
    exist in Python 3 (the rest of this file is Python 3 — it uses
    ``print(...)``), so this function raised AttributeError.
    ``Counter.most_common`` performs the same stable count-then-sort,
    so ties still resolve to the first-seen label.
    """
    return Counter(classList).most_common(1)[0][0]
# Recursively build the tree top-down
def createTree(dataSet, labels, feaLabels):
    """Build an ID3 decision tree as nested dicts.

    Args:
        dataSet: rows whose last column is the class label.
        labels: feature names; mutated — the chosen feature is deleted at
            each split (ID3 consumes features).
        feaLabels: out-parameter; chosen features are appended in split
            order.

    Returns:
        A class label (leaf) or a nested dict mapping the split feature's
        name to {feature value: subtree}.
    """
    classList = [row[-1] for row in dataSet]
    # Leaf: all samples share one class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Leaf: only the label column remains — vote for the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Recursive case: split on the feature with the best information gain.
    best_feature = chooseBestFeatureToSpit(dataSet)
    best_feaLabel = labels[best_feature]
    feaLabels.append(best_feaLabel)  # record the chosen feature
    del labels[best_feature]  # ID3: a used feature is never reused below
    tree = {best_feaLabel: {}}
    # One branch per distinct value of the chosen feature; pass a copy of
    # labels so sibling branches see an unmodified list.
    for value in {row[best_feature] for row in dataSet}:
        branch = splitdataset(dataSet, best_feature, value)
        tree[best_feaLabel][value] = createTree(branch, labels[:], feaLabels)
    return tree
if __name__ == '__main__':
    # Load the lenses data, build the tree, and print it as JSON
    # (ensure_ascii=False keeps non-ASCII labels readable).
    data, feature_names = createDataSet()
    chosen = []
    tree = createTree(data, feature_names, chosen)
    print(json.dumps(tree, ensure_ascii=False))
# 决策树的应用 (Application of decision trees)
# NOTE(review): the two lines above were stray blog-page residue ("决策树的应用" /
# a publish-date footer) pasted into the source; converted to comments so the
# file parses.