Decision Tree Implementation

# -*- coding: utf-8 -*-
from math import log

def createTree(dataSet, labels):
    classList = [sample[-1] for sample in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]               # leaf node: every sample has the same class
    elif len(dataSet[0]) == 1:
        return majorityLabel(classList)   # leaf node: no features left, fall back to the majority class
    bestFeat = chooseBestFeature(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])

    featValues = [example[bestFeat] for example in dataSet]
    uniqueValues = set(featValues)
    for value in uniqueValues:
        # This must be subLabels = labels[:] rather than subLabels = labels,
        # otherwise every recursive call would mutate the same shared list.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def splitDataSet(dataSet, feat, value):
    # Keep the samples whose feature `feat` equals `value`, removing that column.
    subDataSet = []
    for line in dataSet:
        if line[feat] == value:
            lineSubData = line[:feat]
            lineSubData.extend(line[feat + 1:])
            subDataSet.append(lineSubData)
    return subDataSet
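
A quick sanity check on a made-up three-row dataset (my own example, not from the original script): rows whose feature 0 equals 1 are kept, and that column is removed.

print(splitDataSet([[1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no']], 0, 1))
# -> [[1, 'yes'], [0, 'no']]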


def majorityLabel(classList):
    counts = {}
    for label in set(classList):
        counts[label] = classList.count(label)
    # Sort by count in descending order so the most frequent label comes first.
    sortedLabels = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return sortedLabels[0][0]

def calcEntropy(dataSet):
    # Shannon entropy of the class labels: H = -sum(p * log2(p)).
    classList = [example[-1] for example in dataSet]
    numberData = len(classList)
    entropy = 0.0
    for label in set(classList):
        prob = classList.count(label) / float(numberData)
        entropy -= prob * log(prob, 2)
    return entropy
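
As a hand-worked check against the toy fish dataset defined at the end of this post (two 'yes' labels, three 'no'):

# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
# so calcEntropy(dataSet) returns roughly 0.9710.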

def chooseBestFeature(dataSet):
    # Pick the feature with the largest information gain (the ID3 criterion).
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcEntropy(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueValues = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueValues:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcEntropy(subDataSet)   # weighted entropy after the split
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
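
Continuing the hand-worked example on the toy dataset: splitting on feature 0 leaves a weighted entropy of about 0.551, versus 0.800 for feature 1, so feature 0 wins.

# gain(0) = 0.971 - (3/5 * 0.918 + 2/5 * 0.0) ≈ 0.420
# gain(1) = 0.971 - (4/5 * 1.000 + 1/5 * 0.0) ≈ 0.171
# chooseBestFeature(dataSet) therefore returns 0 ('no surfacing').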


def storeTree(inputTree, filename):
    # Pickle requires binary mode when writing.
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

dataSet = [[1, 1, 'yes'],
           [1, 1, 'yes'],
           [1, 0, 'no'],
           [0, 1, 'no'],
           [0, 1, 'no']]
labels = ['no surfacing', 'flippers']
# storeTree(createTree(dataSet, labels), 'mytree.txt')

# Build a tree from the contact-lenses dataset (Machine Learning in Action, Ch03);
# lenses.txt has four feature columns: age, prescript, astigmatic, tearRate.
fr = open('Ch03/lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
# grabTree('mytree.txt')
print(lensesTree)
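
The script builds and prints a tree but never queries it. Below is a minimal sketch of how the nested dict could be walked to classify a new sample; `classify` and its argument names are my own, not part of the original code.

def classify(inputTree, featLabels, testVec):
    # Descend the nested dict until a leaf (a plain class label) is reached.
    featLabel = list(inputTree.keys())[0]
    featIndex = featLabels.index(featLabel)
    branch = inputTree[featLabel][testVec[featIndex]]
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch

# Example on the toy dataset (pass a copy, since createTree mutates its labels argument):
# tree = createTree(dataSet, labels[:])
# classify(tree, labels, [1, 0])   # -> 'no'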