# ID3 decision-tree algorithm implemented in Python

import numpy as np

import operator

def createDataSet():
    """Build the toy fish-classification data set from "Machine Learning in Action".

    Returns:
        dataSet: list of rows; each row is [feature0, feature1, class-label],
            where the label is the string 'yes' or 'no'.
        labels: names of the two feature columns, parallel to the row prefix.

    Fix: the original used Unicode smart quotes (‘yes‘), which are a
    SyntaxError in Python; replaced with ASCII single quotes.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

# Compute the Shannon entropy of the given data set.

def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels in dataSet.

    The class label is assumed to be the last element of each row.
    Entropy = -sum(p * log2(p)) over the label distribution.
    """
    total = len(dataSet)
    # Tally how many rows carry each class label.
    counts = {}
    for row in dataSet:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # Accumulate -p*log2(p) over the label frequencies.
    entropy = 0.0
    for freq in counts.values():
        p = freq / total
        entropy -= p * np.log2(p)
    return entropy

def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature at index `axis` equals `value`.

    The matching rows are returned with that feature column removed,
    so the result is ready for the next level of tree construction.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]

# Loop over features, combining calcShannonEnt() and splitDataSet() to find the best split.

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no feature yields a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain = 0.0
    bestIdx = -1
    for idx in range(featureCount):
        # Conditional entropy after splitting on feature `idx`.
        distinctValues = {row[idx] for row in dataSet}
        condEntropy = 0.0
        for val in distinctValues:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            condEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - condEntropy
        if gain > bestGain:
            bestGain = gain
            bestIdx = idx
    return bestIdx

def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote).

    Used as the leaf decision when all features are exhausted but the
    remaining rows still carry mixed labels.

    Fixes two bugs in the original:
    - `classCount = 0` rebound the dict to an int (TypeError on the next
      line); it must be `classCount[vote] = 0`.
    - `dict.iteritems()` is Python 2 only; Python 3 uses `dict.items()`.
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Sort (label, count) pairs by count, descending; take the top label.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Parameters: the data set and the list of feature names.

def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    The tree has the form {featureName: {featureValue: subtree-or-label}}.
    NOTE: `labels` is mutated in place (the chosen feature's name is
    deleted at each level), matching the original behavior.
    """
    classes = [row[-1] for row in dataSet]
    # Stop: every remaining row has the same class -> leaf with that label.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    # Stop: only the label column is left -> leaf by majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classes)
    # Pick the feature with the highest information gain.
    bestFeat = chooseBestFeatureToSplit(dataSet)
    featName = labels[bestFeat]
    # A dict node keyed by the feature name; handy for plotting the tree.
    tree = {featName: {}}
    del labels[bestFeat]
    for val in set(row[bestFeat] for row in dataSet):
        # Copy the label list so recursion cannot corrupt this level's view.
        branchLabels = labels[:]
        subset = splitDataSet(dataSet, bestFeat, val)
        tree[featName][val] = createTree(subset, branchLabels)
    return tree

# Demo: build the toy data set, inspect it, and grow the ID3 tree.
myDat,labels = createDataSet()

print("myDat:",myDat)

# Number of feature columns (last column is the class label).
print(len(myDat[0]) -1 )

# Index of the best feature to split on first.
c = chooseBestFeatureToSplit(myDat)

# NOTE(review): createTree mutates `labels` in place, so `labels` is
# shorter after this call.
MyTree = createTree(myDat,labels)

print(MyTree)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值