目录
前言
本文主要对书中第五章中的例5.3,ID3算法和C4.5算法进行代码实现(文末有完整代码)
一、导入需要的库
from math import log
二、读取数据
def loadDataSet():
    """Return the loan-application sample set (example 5.3) and feature names.

    Each row is [age, has_job, has_house, credit, class]; the last element
    is the class label ('是'/'否' = approve/deny).
    """
    rows = (
        ('青年', '否', '否', '一般', '否'),
        ('青年', '否', '否', '好', '否'),
        ('青年', '是', '否', '好', '是'),
        ('青年', '是', '是', '一般', '是'),
        ('青年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '好', '否'),
        ('中年', '是', '是', '好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '好', '是'),
        ('老年', '是', '否', '好', '是'),
        ('老年', '是', '否', '非常好', '是'),
        ('老年', '否', '否', '一般', '否'),
    )
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return [list(r) for r in rows], featureNames
三、创建树
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: list of rows; the last element of each row is the class label.
    labels: feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    # All samples share one class: single-node tree (leaf).
    if len(set(classList)) == 1:
        return classList[0]
    # Feature set exhausted: rows hold only the class column, so majority vote.
    # BUG FIX: the original tested len(dataSet[0]) == 0, which can never be
    # true because the class column is always present; the correct test is == 1.
    if len(dataSet[0]) == 1:
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # BUG FIX: build the reduced label list instead of `del labels[bestFeat]`,
    # which mutated the caller's list as a side effect.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = {example[bestFeat] for example in dataSet}
    for value in featValues:
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree
四、将数据集D中实例数最大的类作为该节点的类标记
def classmax(classList):
    """Return the majority class label; on a tie, the first-seen label wins."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    return max(tally, key=tally.get)
五、计算数据集D的经验熵
def entropy(dataSet):
    """Empirical entropy H(D) of the class column (last element of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # H(D) = -sum(p_k * log2(p_k)) over the class frequencies.
    return -sum((ck / total) * log(ck / total, 2) for ck in counts.values())
六、返回不同value的数据
def splitData(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed.

    FIX: removed the leftover debug print() calls, which ran on every split
    and flooded stdout during tree construction.
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
七、选择信息增益最大的特征为最优特征
ID3算法 (用信息增益选择特征)
def chooseBestFeatureSplit(dataSet):
    """Return the feature index with the largest information gain (ID3).

    Returns -1 if no feature yields a positive gain.
    """
    baseEntropy = entropy(dataSet)
    total = len(dataSet)
    bestGain, bestFeature = 0.0, -1
    for idx in range(len(dataSet[0]) - 1):
        values = {row[idx] for row in dataSet}
        # Empirical conditional entropy H(D|A) for feature idx.
        condEntropy = 0.0
        for value in values:
            subset = splitData(dataSet, idx, value)
            weight = len(subset) / total  # |Di| / |D|
            condEntropy += weight * entropy(subset)
        gain = baseEntropy - condEntropy  # information gain
        if gain > bestGain:
            bestGain, bestFeature = gain, idx
    return bestFeature
C4.5算法(用信息增益比选择特征)
def chooseBestFeatureSplit(dataSet):
    """Return the feature index with the largest information gain ratio (C4.5).

    Returns -1 if no usable feature yields a positive gain ratio.
    """
    HD = entropy(dataSet)
    total = len(dataSet)
    bestGain = 0.0
    bestFeature = -1
    for i in range(len(dataSet[0]) - 1):
        values = {row[i] for row in dataSet}
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        splitInfo = 0.0    # H_A(D), the split information of feature i
        for value in values:
            subData = splitData(dataSet, i, value)
            pi = len(subData) / total  # |Di| / |D|
            condEntropy += pi * entropy(subData)
            splitInfo -= pi * log(pi, 2)
        # BUG FIX: a feature with a single value has splitInfo == 0 and the
        # original raised ZeroDivisionError; such a feature cannot split D.
        if splitInfo == 0:
            continue
        gainRatio = (HD - condEntropy) / splitInfo  # information gain ratio
        if gainRatio > bestGain:
            bestGain = gainRatio
            bestFeature = i
    return bestFeature
八、执行
# Script entry: build the decision tree from the sample data and print the nested dict.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))
完整代码
ID3算法
from math import log
def loadDataSet():
    """Return the loan-application sample set (example 5.3) and feature names.

    Each row is [age, has_job, has_house, credit, class]; the last element
    is the class label ('是'/'否' = approve/deny).
    """
    rows = (
        ('青年', '否', '否', '一般', '否'),
        ('青年', '否', '否', '好', '否'),
        ('青年', '是', '否', '好', '是'),
        ('青年', '是', '是', '一般', '是'),
        ('青年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '好', '否'),
        ('中年', '是', '是', '好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '好', '是'),
        ('老年', '是', '否', '好', '是'),
        ('老年', '是', '否', '非常好', '是'),
        ('老年', '否', '否', '一般', '否'),
    )
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return [list(r) for r in rows], featureNames
def classmax(classList):
    """Return the majority class label; on a tie, the first-seen label wins."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    return max(tally, key=tally.get)
def entropy(dataSet):
    """Empirical entropy H(D) of the class column (last element of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # H(D) = -sum(p_k * log2(p_k)) over the class frequencies.
    return -sum((ck / total) * log(ck / total, 2) for ck in counts.values())
def splitData(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed.

    FIX: removed the leftover debug print() calls, which ran on every split
    and flooded stdout during tree construction.
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
def chooseBestFeatureSplit(dataSet):
    """Return the feature index with the largest information gain (ID3).

    Returns -1 if no feature yields a positive gain.
    """
    baseEntropy = entropy(dataSet)
    total = len(dataSet)
    bestGain, bestFeature = 0.0, -1
    for idx in range(len(dataSet[0]) - 1):
        values = {row[idx] for row in dataSet}
        # Empirical conditional entropy H(D|A) for feature idx.
        condEntropy = 0.0
        for value in values:
            subset = splitData(dataSet, idx, value)
            weight = len(subset) / total  # |Di| / |D|
            condEntropy += weight * entropy(subset)
        gain = baseEntropy - condEntropy  # information gain
        if gain > bestGain:
            bestGain, bestFeature = gain, idx
    return bestFeature
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: list of rows; the last element of each row is the class label.
    labels: feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    # All samples share one class: single-node tree (leaf).
    if len(set(classList)) == 1:
        return classList[0]
    # Feature set exhausted: rows hold only the class column, so majority vote.
    # BUG FIX: the original tested len(dataSet[0]) == 0, which can never be
    # true because the class column is always present; the correct test is == 1.
    if len(dataSet[0]) == 1:
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # BUG FIX: build the reduced label list instead of `del labels[bestFeat]`,
    # which mutated the caller's list as a side effect.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = {example[bestFeat] for example in dataSet}
    for value in featValues:
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree
# Script entry: build the ID3 tree from the sample data and print the nested dict.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))
# Result: {'有自己的房子': {'是': '是', '否': {'有工作': {'是': '是', '否': '否'}}}}
C4.5算法
from math import log
def loadDataSet():
    """Return the loan-application sample set (example 5.3) and feature names.

    Each row is [age, has_job, has_house, credit, class]; the last element
    is the class label ('是'/'否' = approve/deny).
    """
    rows = (
        ('青年', '否', '否', '一般', '否'),
        ('青年', '否', '否', '好', '否'),
        ('青年', '是', '否', '好', '是'),
        ('青年', '是', '是', '一般', '是'),
        ('青年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '好', '否'),
        ('中年', '是', '是', '好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '好', '是'),
        ('老年', '是', '否', '好', '是'),
        ('老年', '是', '否', '非常好', '是'),
        ('老年', '否', '否', '一般', '否'),
    )
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return [list(r) for r in rows], featureNames
def classmax(classList):
    """Return the majority class label; on a tie, the first-seen label wins."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    return max(tally, key=tally.get)
def entropy(dataSet):
    """Empirical entropy H(D) of the class column (last element of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # H(D) = -sum(p_k * log2(p_k)) over the class frequencies.
    return -sum((ck / total) * log(ck / total, 2) for ck in counts.values())
def splitData(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed."""
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
def chooseBestFeatureSplit(dataSet):
    """Return the feature index with the largest information gain ratio (C4.5).

    Returns -1 if no usable feature yields a positive gain ratio.
    """
    HD = entropy(dataSet)
    total = len(dataSet)
    bestGain = 0.0
    bestFeature = -1
    for i in range(len(dataSet[0]) - 1):
        values = {row[i] for row in dataSet}
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        splitInfo = 0.0    # H_A(D), the split information of feature i
        for value in values:
            subData = splitData(dataSet, i, value)
            pi = len(subData) / total  # |Di| / |D|
            condEntropy += pi * entropy(subData)
            splitInfo -= pi * log(pi, 2)
        # BUG FIX: a feature with a single value has splitInfo == 0 and the
        # original raised ZeroDivisionError; such a feature cannot split D.
        if splitInfo == 0:
            continue
        gainRatio = (HD - condEntropy) / splitInfo  # information gain ratio
        if gainRatio > bestGain:
            bestGain = gainRatio
            bestFeature = i
    return bestFeature
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: list of rows; the last element of each row is the class label.
    labels: feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    # All samples share one class: single-node tree (leaf).
    if len(set(classList)) == 1:
        return classList[0]
    # Feature set exhausted: rows hold only the class column, so majority vote.
    # BUG FIX: the original tested len(dataSet[0]) == 0, which can never be
    # true because the class column is always present; the correct test is == 1.
    if len(dataSet[0]) == 1:
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # BUG FIX: build the reduced label list instead of `del labels[bestFeat]`,
    # which mutated the caller's list as a side effect.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = {example[bestFeat] for example in dataSet}
    for value in featValues:
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree
# Script entry: build the C4.5 tree from the sample data and print the nested dict.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))
# Result: {'有自己的房子': {'是': '是', '否': {'有工作': {'是': '是', '否': '否'}}}}