Application of Decision Trees

The script below builds an ID3 decision tree from the lenses data set: it reads tab-separated samples from lenses.txt, chooses split features by information gain, and prints the resulting tree as nested dictionaries.

import json
import operator
from math import log

'''Create the data set'''
def createDataSet():
    fr = open(r'D:\单片机实验&JDK文档\lenses.txt')  # change the path to wherever your own lenses.txt lives
    dataSet = [rl.strip().split('\t') for rl in fr.readlines()]
    fr.close()
    labels = ['1', '2', '3', '4', '5']  # placeholder names for the feature columns
    return dataSet, labels

'''Empirical (Shannon) entropy of the class labels in a data set'''
def calShannonEnt(dataset):
    m = len(dataset)
    labelCount = {}
    '''Count how often each class label (last column) appears'''
    for data in dataset:
        currentLabel = data[-1]
        if currentLabel not in labelCount.keys():
            labelCount[currentLabel] = 0
        labelCount[currentLabel] += 1
    '''Sum -p * log2(p) over the class distribution'''
    entropy = 0.0
    for label in labelCount:
        p = float(labelCount[label]) / m
        entropy -= p * log(p, 2)
    return entropy
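
calShannonEnt computes H(D) = -Σ p_k · log2(p_k), where p_k is the fraction of samples in class k. As a quick sanity check on a made-up toy set (not part of the lenses data), a 50/50 split over two classes gives exactly 1 bit, while a pure set gives 0:

toy = [['x', 'yes'], ['x', 'no'], ['x', 'yes'], ['x', 'no']]  # hypothetical rows: [feature, class]
print(calShannonEnt(toy))             # 1.0 -> two classes, half and half
print(calShannonEnt([['x', 'yes']]))  # 0.0 -> only one class, no uncertainty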

def splitdataset(dataset, axis, value):
    '''Return the rows whose feature at index `axis` equals `value`, with that column removed'''
    subSet = []
    for data in dataset:
        if data[axis] == value:
            data_x = data[:axis]
            data_x.extend(data[axis+1:])
            subSet.append(data_x)
    return subSet
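
For example, on a hypothetical two-column toy set, splitting on feature 0 with value 'young' keeps only the matching rows and drops the consumed column:

rows = [['young', 'no'], ['old', 'yes'], ['young', 'yes']]  # hypothetical rows: [feature0, class]
print(splitdataset(rows, 0, 'young'))  # [['no'], ['yes']] -- column 0 has been removed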

def chooseBestFeatureToSplit(dataSet):
    '''Pick the feature with the largest information gain (ID3 criterion)'''
    feature_num = len(dataSet[0]) - 1
    origin_ent = calShannonEnt(dataSet)
    best_infogain = 0.0
    best_feature = 0  # fall back to the first feature if no split improves the entropy
    for i in range(feature_num):
        fi_all = set(data[i] for data in dataSet)  # all values feature i can take
        subset_Ent = 0.0
        '''Conditional entropy: weighted entropy over every possible value of feature i'''
        for value in fi_all:
            subset = splitdataset(dataSet, i, value)
            p = float(len(subset)) / len(dataSet)
            subset_Ent += p * calShannonEnt(subset)
        # Information gain = entropy before the split minus conditional entropy after it
        infoGain = origin_ent - subset_Ent
        if infoGain > best_infogain:
            best_feature = i
            best_infogain = infoGain
    return best_feature
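
On a made-up set where feature 0 decides the class perfectly and feature 1 is pure noise, the function should return index 0, since splitting on feature 0 drives the conditional entropy to zero:

toy2 = [['a', 'x', 'yes'], ['a', 'y', 'yes'], ['b', 'x', 'no'], ['b', 'y', 'no']]  # hypothetical
print(chooseBestFeatureToSplit(toy2))  # 0 -- gain 1.0 for feature 0, gain 0.0 for feature 1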

'''Return the most frequent class in classList (used once the features are exhausted)'''
def majorityCnt(classList):
    classCount = {}
    for class_ in classList:
        if class_ not in classCount.keys():
            classCount[class_] = 0
        classCount[class_] += 1
    classSort = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return classSort[0][0]
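
A tiny hypothetical check: with two 'yes' labels against one 'no', the majority vote is 'yes'.

print(majorityCnt(['yes', 'no', 'yes']))  # yes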

'''Recursively build the decision tree (ID3)'''
def createTree(dataSet, labels, feaLabels):

    classList = [example[-1] for example in dataSet]
    # Stop if every remaining sample already belongs to the same class
    if len(classList) == classList.count(classList[0]):
        return classList[0]
    # Stop if all features have been consumed: fall back to the majority class
    if len(dataSet[0]) == 1:
        majorClass = majorityCnt(classList)
        return majorClass
    '''Otherwise keep splitting'''
    best_feature = chooseBestFeatureToSplit(dataSet)  # index of the best split feature

    best_feaLabel = labels[best_feature]
    feaLabels.append(best_feaLabel)  # record the chosen feature
    del(labels[best_feature])        # ID3 consumes a feature once it has been used
    feaValue = [example[best_feature] for example in dataSet]
    feaValue = set(feaValue)         # all values the chosen feature takes
    deci_tree = {best_feaLabel: {}}  # the subtree's root key is the feature name; each value maps to a recursively built subtree
    for value in feaValue:
        subLabel = labels[:]  # every branch needs its own copy so recursion cannot corrupt labels
        subset = splitdataset(dataSet, best_feature, value)
        deci_tree[best_feaLabel][value] = createTree(subset, subLabel, feaLabels)
    return deci_tree
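
The returned tree is just nested dictionaries, so classifying a new sample is a matter of walking down it key by key. The classify helper below is a minimal sketch that is not part of the original script: featLabels must be the original, unmodified label list (createTree deletes entries from the labels it is given, so keep a copy), and testVec is a row of feature values in the same column order as the data file.

def classify(tree, featLabels, testVec):
    '''Hypothetical helper: walk the nested-dict tree and return the predicted class'''
    root_feature = list(tree.keys())[0]          # feature name stored at this node
    branches = tree[root_feature]
    feat_index = featLabels.index(root_feature)  # which column of testVec this feature is
    child = branches[testVec[feat_index]]
    if isinstance(child, dict):                  # internal node: keep descending
        return classify(child, featLabels, testVec)
    return child                                 # leaf: the class label

# Assumed usage, keeping a copy of the labels before createTree mutates them:
#   dataSet, labels = createDataSet()
#   origLabels = labels[:]
#   mytree = createTree(dataSet, labels, [])
#   print(classify(mytree, origLabels, dataSet[0][:-1]))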



if __name__ == '__main__':

    dataSet, labels = createDataSet()
    feaLabels = []
    mytree = createTree(dataSet,labels,feaLabels)
    print(json.dumps(mytree,ensure_ascii=False))