本节我们将使用决策树构建分类器,我们可以将它用于实际数据的分类。
首先在第一节 trees.py 中添加:
# -*- coding:utf-8 -*-
from math import log
import operator
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of a dataset.

    The class label of each instance is assumed to be the LAST element
    of its feature vector (featVec[-1]).

    Args:
        dataSet: list of feature vectors, e.g. [[1, 1, 'yes'], ...].

    Returns:
        float: Shannon entropy, -sum(p * log2(p)) over the label
        distribution. 0.0 for an empty dataset.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # Count occurrences of every label; dict.get avoids the
        # separate membership test + initialization of the original.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
def createDataSet():
    """Return a tiny toy dataset for testing the decision-tree code.

    Returns:
        tuple: (dataSet, labels) where dataSet is a list of
        [feature1, feature2, class] rows and labels names the two
        features ('no surfacing', 'flippers').
    """
    features = [(1, 1), (1, 1), (1, 0), (0, 1), (0, 1)]
    classes = ['yes', 'yes', 'no', 'no', 'no']
    dataSet = [[f1, f2, cls] for (f1, f2), cls in zip(features, classes)]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
def splitDataSet(dataSet, axis, value):
    """Return the subset of dataSet whose feature `axis` equals `value`,
    with that feature column removed from each returned row.

    Args:
        dataSet: list of feature vectors to split.
        axis: index of the feature to split on.
        value: feature value a row must have to be kept.

    Returns:
        list: new list of NEW rows (input rows are never mutated);
        each kept row is featVec with element `axis` removed.
    """
    # Comprehension replaces the manual loop + extend/append; the slice
    # concatenation builds a fresh list, so callers' rows are untouched.
    return [featVec[:axis] + featVec[axis + 1:]
            for featVec in dataSet
            if featVec[axis] == value]
def chooseBestFeatureToSplit(dataSet):
numFeatures = len(dataSet[0]) - 1
baseEntropy = calcShannonEnt(dataSet)
bestInfoGain = 0.0; bestFeature = -1
for i in range(numFeatures): # 遍历数据集中的所有特征
# 创建唯一的分类标签列表
featList = [example[i] for example in dataSet]
uniqueVals = set(featList) # set 是一个集合
newEntropy =