# 最近在学习 Peter Harrington 的《机器学习实战》，代码与书中的略有不同，但可以顺利运行。
from math import log
import operator
# 计算熵
def calcShannonEnt(dataset):
    """Return the base-2 Shannon entropy of the class labels in *dataset*.

    Args:
        dataset: A sequence of records (e.g. lists). The last element of
            each record is used as that record's class label; the other
            elements are ignored here.

    Returns:
        The entropy as a float: ``-sum(p * log2(p))`` over the relative
        frequency ``p`` of every distinct label. An empty dataset yields
        ``0.0``.
    """
    num = len(dataset)
    if num == 0:
        # No records means no labels to be uncertain about.
        return 0.0

    # Tally how many records carry each class label.
    labelCounts = {}
    for featVec in dataset:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1

    # Convert the denominator once so the division below is always a true
    # (floating-point) division, never a truncating integer division.
    total = float(num)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / total
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
# 创建测试数据集
def createDataset():
    """Build a small hard-coded dataset for trying out the functions.

    Returns:
        A ``(dataset, labels)`` tuple. ``dataset`` is a list of five
        records, each holding two 0/1 feature values followed by a
        ``'yes'``/``'no'`` class label. ``labels`` is the list of the two
        feature names, in the same order as the feature columns.
        Fresh lists are created on every call.
    """
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    # labels are the names of the features (not the class labels).
    labels = ['no surfacing', 'flippers']
    return dataset, labels
# 测试
# mydata,labels = createDataset()
# print(mydata)
# print(calcShannonEnt(mydata))
# 修改第一个实例的分类结果为maybe
# mydata[0][-1] =