本文整理并分析了其中的一些错误和不妥之处,并已在代码中修正;这些问题主要是由 Python 版本更新引起的。目前只上传了基础部分的代码,后面示例的代码待调试分析后再上传,供大家学习交流。
# -*- coding:utf-8 -*-
from math import log
import matplotlib.pyplot as plt
import operator
import pickle
# Text-box style for decision nodes: sawtooth-edged box.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}
# Text-box style for leaf nodes: rounded rectangle ("round4").
leafNode = {"boxstyle": "round4", "fc": "0.8"}
# Arrow type.
# NOTE(review): presumably passed to matplotlib's annotate() as arrowprops — usage is not visible here.
arrow_args = {"arrowstyle": "<-"}
def createDataSet():
    """Build the small hard-coded sample data set.

    Returns:
        A ``(dataSet, labels)`` tuple. Each row of ``dataSet`` holds two
        feature values followed by the class label (``'yes'`` / ``'no'``)
        in the last column; ``labels`` names the two feature columns.
        Fresh lists are created on every call.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in ``dataSet``.

    Args:
        dataSet: sequence of rows; the last element of each row is taken
            as that row's class label.

    Returns:
        The entropy as a float. An empty ``dataSet`` yields ``0.0``.
    """
    numEntries = len(dataSet)
    # Count how many rows fall into each class.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # H = -sum(p * log2(p)) over the observed classes.
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, dropping that feature.

    Args:
        dataSet: the data to split, a sequence of rows (lists or tuples).
        axis: index of the feature column to split on.
        value: the feature value a row must have to be kept.

    Returns:
        A new list of new lists: every matching row with column ``axis``
        cut out. The input rows are not modified.
    """
    # Unpacking both slices always builds a fresh list, so rows may be
    # lists or tuples and the caller's rows are never mutated.
    return [
        [*featVec[:axis], *featVec[axis + 1:]]
        for featVec in dataSet
        if featVec[axis] == value
    ]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature whose split gives the largest information gain.

    Args:
        dataSet: sequence of rows; every column except the last is a
            feature, the last column is the class label.

    Returns:
        The index of the best feature, or ``-1`` when ``dataSet`` is empty
        or no feature yields a strictly positive information gain.
    """
    if len(dataSet) == 0:
        # Nothing to split on; report "no feature" instead of failing on dataSet[0].
        return -1
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    numRows = float(len(dataSet))
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Entropy after splitting on feature i, weighted by subset size.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / numRows
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # Strict comparison: on equal gain the earlier feature is kept.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the class that occurs most often in ``classList``.

    When several classes share the highest count, the one that appears
    first in ``classList`` is returned (the sort is stable and the counts
    are stored in first-seen order).

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Order (class, count) pairs by count, highest first.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: non-empty sequence of rows; the last column of each row
            is the class label, the other columns are feature values.
        labels: feature names, one per feature column. This list is NOT
            modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_class, ...}}``.

    Raises:
        IndexError: if ``dataSet`` is empty.
    """
    classList = [example[-1] for example in dataSet]
    # Every row has the same class: this is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature gives any information gain. Indexing with -1 would
        # split on the class column itself, so take the majority class.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return myTree
'''
Attenti