熵(信息的期望值): $H(D) = -\sum_{i=1}^{n} p(D_i)\,\log_2 p(D_i)$
条件熵: $H(D|A) = \sum_{j=1}^{m} p(A_j)\,H(D \mid A = A_j)$,其中 $H(D \mid A = A_j)$ 是特征 $A$ 取值为 $A_j$ 的子数据集的熵
信息增益: $H(D) - H(D|A)$
信息增益率: $\dfrac{H(D) - H(D|A)}{H(D)}$
path = '/Users/haozhiqiang/Python 3 机器学习/机器学习/machinelearninginaction/Ch03'
# Inspect the data directory: show only its immediate sub-directories and files.
import os
# os.walk yields (root, dirs, files) triples top-down; only the first triple
# (the one describing `path` itself) is wanted.  If the walk yields nothing,
# nothing is printed.
first_entry = next(os.walk(path), None)
if first_entry is not None:
    root, dirs, files = first_entry
    print('dirs', dirs)
    print('files', files)
----------
dirs []
files ['lenses.txt', 'classifierStorage.txt', 'treePlotter.pyc', 'treePlotter.py', 'trees.pyc', 'trees.py']
有以下模块:
1、加载数据 —— 加载数据并将数据划分为 dataSet 和 labelSet
2、熵计算 —— 在后面的迭代中需要多次调用熵计算函数(熵和条件熵计算方式可迭代)
3、数据集划分 —— 计算条件熵,需要对数据集分块划分后计算;
4、选最优特征 —— 依据最优特征(最小熵值、最大增益[率])建树
5、建树 —— 递归建树,通过最优特征确立各节点
6、停止条件 —— 得到唯一分类结果 或 使用完所有特征仍未得到结果
7、画图(略咯)
import numpy as np
# Full path of the lenses data file inside the data directory.
filename = '/'.join((path, 'lenses.txt'))
def load(filename):
    """Load a tab-separated data file as a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters.  Blank lines are skipped so that a trailing empty line does
    not turn into a bogus ``['']`` row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of rows; each row is a list of the string fields of one line.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(filename) as data_file:
        stripped_lines = (line.strip() for line in data_file)
        return [line.split('\t') for line in stripped_lines if line]
def calculate_Entropy(dataSet):
    """Return the base-2 Shannon entropy of the class labels in ``dataSet``.

    The class label of a row is its last element.  The same function is
    reused on sub data sets when the conditional entropy is computed.
    """
    total = len(dataSet)
    # Tally how many rows carry each class label.
    tally = {}
    for row in dataSet:
        tally[row[-1]] = tally.get(row[-1], 0) + 1
    # Accumulate -p * log2(p) over the observed labels.
    result = 0.0
    for count in tally.values():
        p = float(count) / total
        result -= p * np.log2(p)
    return result
def split_DataSet(dataSet, index, value):
    """Select the rows whose feature ``index`` equals ``value``.

    Args:
        dataSet: List of rows (lists).
        index: Position of the feature to test.
        value: Feature value a row must have to be kept.

    Returns:
        A new list holding the matching rows with column ``index`` removed;
        the input rows themselves are left untouched.
    """
    # Keep everything before and after the tested column, dropping the column itself.
    return [row[:index] + row[index + 1:]
            for row in dataSet
            if row[index] == value]
def choose_Best_Feature(dataSet):
    """Choose the feature with the highest information-gain ratio.

    For every feature column the data set is partitioned by feature value,
    the conditional entropy H(D|A) is computed as the size-weighted sum of
    the partition entropies, and the ratio (H(D) - H(D|A)) / H(D) is
    compared against the best ratio seen so far.

    Args:
        dataSet: List of rows; the last element of each row is the label.

    Returns:
        The index of the best feature, or -1 when no split is useful: the
        data set is empty, its labels are already pure (H(D) == 0), or no
        feature yields a strictly positive gain ratio.  Callers must treat
        -1 as "do not split" rather than as a column index.
    """
    if not dataSet:
        return -1
    base_entropy = calculate_Entropy(dataSet)
    if base_entropy == 0:
        # Pure labels: the ratio would divide by zero and nothing can be gained.
        return -1

    n_rows = len(dataSet)
    n_features = len(dataSet[0]) - 1  # the last column is the label
    best_info_gain_ratio = 0.0
    best_feature = -1
    for feature in range(n_features):
        unique_vals = {row[feature] for row in dataSet}
        # Conditional entropy of the labels given this feature.
        new_entropy = 0.0
        for val in unique_vals:
            subDataSet = split_DataSet(dataSet, feature, val)
            new_entropy += len(subDataSet) / n_rows * calculate_Entropy(subDataSet)
        info_gain_ratio = (base_entropy - new_entropy) / base_entropy
        if info_gain_ratio > best_info_gain_ratio:
            best_info_gain_ratio = info_gain_ratio
            best_feature = feature
    return best_feature
def _majority_label(labels):
    """Return the label that occurs most often in ``labels``."""
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)


def creat_Tree(dataSet, feature_name):
    """Recursively build a decision tree.

    Args:
        dataSet: List of rows; the last element of each row is the label.
        feature_name: Names of the feature columns, in column order.  The
            list is not modified.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    labelSet = [row[-1] for row in dataSet]
    # Stop condition 1: every remaining row has the same label.
    if labelSet.count(labelSet[0]) == len(labelSet):
        return labelSet[0]
    # Stop condition 2: all features are used up -> majority vote.
    if len(dataSet[0]) == 1:
        return _majority_label(labelSet)

    best_feature = choose_Best_Feature(dataSet)
    if best_feature < 0:
        # No feature separates the labels any further.  Using -1 as an index
        # would split on the label column, so fall back to a majority vote.
        return _majority_label(labelSet)

    best_feature_name = feature_name[best_feature]
    # Build the reduced name list as a new object so the caller's list
    # stays intact.
    remaining_names = feature_name[:best_feature] + feature_name[best_feature + 1:]

    myTree = {best_feature_name: {}}
    for val in {row[best_feature] for row in dataSet}:
        myTree[best_feature_name][val] = creat_Tree(
            split_DataSet(dataSet, best_feature, val), remaining_names)
    return myTree
if __name__ == '__main__':
    import pprint
    # Build the lenses decision tree and pretty-print it.
    dataSet = load(filename)
    feature_name = ['age', 'prescript', 'astigmatic', 'tearRate']
    # creat_Tree may delete entries from the list it is given, so hand it a
    # copy: feature_name then stays complete for later use (for example as
    # the label list for classification).
    tree = creat_Tree(dataSet, feature_name[:])
    pprint.pprint(tree)
# 输出
{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'pre': 'soft',
'presbyopic': {'prescript': {'hyper': 'soft',
'myope': 'no '
'lenses'}},
'young': 'soft'}},
'yes': {'prescript': {'hyper': {'age': {'pre': 'no '
'lenses',
'presbyopic': 'no '
'lenses',
'young': 'hard'}},
'myope': 'hard'}}}},
'reduced': 'no lenses'}}
# Persist the decision tree with the pickle module.
# pickle produces bytes, hence the binary file mode.
def storeTree(inputTree, filename):
    """Serialize ``inputTree`` with pickle into ``filename``, replacing its content."""
    import pickle
    with open(filename, 'wb+') as fw:
        fw.write(pickle.dumps(inputTree))
def grabTree(filename):
    """Load and return the object pickled in ``filename``.

    The file is opened read-only ('rb'), so a tree stored in a write-protected
    file can still be loaded.

    NOTE: unpickling can execute arbitrary code; only load files you created
    yourself or otherwise trust.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
if __name__ == '__main__':
    # Round-trip the tree through the storage file: write it, then read it back.
    storage_file = '/Users/haozhiqiang/Python 3 机器学习/机器学习/machinelearninginaction/Ch03/classifierStorage.txt'
    storeTree(tree, storage_file)
    a = grabTree(storage_file)
----------
# 输出 tree
... ...
画图!可视化!
import matplotlib.pyplot as plt
# Classify a sample with a decision tree.
# Arguments: the decision tree, the feature names, the sample to classify.
def classify(input_tree, feature_labels, test_vec):
    """Return the class label the decision tree assigns to ``test_vec``.

    Args:
        input_tree: Nested dict ``{feature_name: {value: subtree_or_label}}``.
        feature_labels: Feature names in the column order of ``test_vec``.
        test_vec: Feature values of the sample to classify.

    Returns:
        The label stored in the leaf reached by following ``test_vec``.

    Raises:
        ValueError: If a node's feature name is missing from
            ``feature_labels`` (raised by ``list.index``), or if the sample's
            value for a node's feature has no branch in the tree.
    """
    # The single key of a tree node is the name of the feature it tests.
    # next(iter(...)) works on Python 3, where dict.keys() is not indexable.
    first_str = next(iter(input_tree))
    second_dict = input_tree[first_str]
    # Column of that feature inside the sample vector.
    feature_index = feature_labels.index(first_str)
    feature_value = test_vec[feature_index]
    if feature_value not in second_dict:
        raise ValueError(
            'no branch for %s=%r in decision tree' % (first_str, feature_value))
    branch = second_dict[feature_value]
    if isinstance(branch, dict):
        # Inner node: keep descending.
        return classify(branch, feature_labels, test_vec)
    # Leaf: the stored value is the class label.
    return branch
# Storing the decision tree.
# Building a decision tree is time-consuming, so a finished tree is saved for later reuse.
# The object is serialized with pickle.
def storeTree(input_tree, filename):
    """Pickle ``input_tree`` into ``filename``, overwriting any existing file.

    pickle writes bytes, so the file must be opened in binary mode; with
    text mode ``pickle.dump`` raises TypeError on Python 3.  The ``with``
    block closes the file even if pickling fails.
    """
    import pickle
    with open(filename, "wb") as fw:
        pickle.dump(input_tree, fw)
# Read a decision tree back from a file.
def grabTree(filename):
    """Load and return the object pickled in ``filename``.

    The file is opened in binary mode, which pickle requires on Python 3,
    and is closed as soon as loading finishes.

    NOTE: unpickling can execute arbitrary code; only load files you created
    yourself or otherwise trust.
    """
    import pickle
    with open(filename, "rb") as fr:
        return pickle.load(fr)
# Drawing styles, stored as plain dicts that are handed to matplotlib.
# boxstyle selects the shape of the text box ('sawtooth' gives a jagged
# outline); fc is matplotlib's shorthand for the box face colour
# ('0.8' is a grey level), not a border width.
decision_node = {'boxstyle': 'sawtooth', 'fc': '0.8'}
leaf_node = {'boxstyle': 'round4', 'fc': '0.8'}
arrow_args = {'arrowstyle': '<-'}
# node_txt: text of the annotation; center_pt: where the text box is centred;
# parent_pt: the annotated point the arrow connects to; node_type: box style dict.
def plotNode(node_txt, center_pt, parent_pt, node_type):
    """Draw one tree node as an annotation on the shared axes ``createPlot.ax1``.

    Both points are given in axes-fraction coordinates.
    """
    annotation_options = dict(
        xy=parent_pt,
        xycoords='axes fraction',
        xytext=center_pt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=node_type,
        arrowprops=arrow_args,
    )
    createPlot.ax1.annotate(node_txt, **annotation_options)
# Create the canvas and draw the complete tree on it.
def createPlot(in_tree):
    """Render ``in_tree`` in matplotlib figure 1 and show it.

    The axes object and the layout bookkeeping are stored as attributes on
    the ``createPlot`` and ``plotTree`` functions so the drawing helpers can
    share them.
    """
    # Figure number 1 with a white background, cleared of earlier drawings.
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # One subplot filling the figure: no frame rectangle and no axis ticks.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    # Tree width (number of leaves) and depth set the layout scale.
    leaf_total = float(getNumLeafs(in_tree))
    depth_total = float(getTreeDepth(in_tree))
    plotTree.totalW = leaf_total
    plotTree.totalD = depth_total
    # xOff / yOff track the position of the most recently drawn node and serve
    # as the reference for placing the next one.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0

    plotTree(in_tree, (0.5, 1.0), '')
    plt.show()
# To lay out the tree, the number of leaves and the depth must be known first:
# they determine the extent needed along the x axis and the y axis.
# The next two functions compute them.
def getNumLeafs(my_tree):
    """Count the leaves of ``my_tree``.

    A child counts as a subtree when its type is named ``dict``; any other
    child is one leaf.
    """
    # The first key of a node names the feature; its value maps feature
    # values to children.
    children = my_tree[next(iter(my_tree))].values()
    return sum([getNumLeafs(child) if type(child).__name__ == 'dict' else 1
                for child in children])
def getTreeDepth(my_tree):
    """Return the depth of ``my_tree``, counted in decision levels.

    A node whose children are all leaves has depth 1; a node without any
    children has depth 0.  A child counts as a subtree when its type is
    named ``dict``.
    """
    children = my_tree[next(iter(my_tree))].values()
    # A leaf child contributes 1; a subtree contributes 1 plus its own depth.
    branch_depths = [
        1 + getTreeDepth(child) if type(child).__name__ == 'dict' else 1
        for child in children
    ]
    return max(branch_depths, default=0)
# Label the connection between a parent node and a child node.
# In the decision tree this text is the parent feature's value for that branch.
def plotMidText(center_pt, parent_pt, txt_string):
    """Write ``txt_string`` halfway between ``center_pt`` and ``parent_pt``."""
    child_x, child_y = center_pt[0], center_pt[1]
    x_mid = (parent_pt[0] - child_x) / 2.0 + child_x
    y_mid = (parent_pt[1] - child_y) / 2.0 + child_y
    createPlot.ax1.text(x_mid, y_mid, txt_string)
def plotTree(my_tree, parent_pt, node_txt):
    """Draw ``my_tree`` recursively, depth first, below ``parent_pt``.

    Layout state lives in function attributes that must be set before the
    first call: ``plotTree.totalW`` (total leaf count), ``plotTree.totalD``
    (total depth) and the running cursor ``plotTree.xOff`` /
    ``plotTree.yOff``.

    Args:
        my_tree: Nested dict ``{feature_name: {value: subtree_or_label}}``.
        parent_pt: (x, y) of the parent node in axes-fraction coordinates.
        node_txt: Text written on the edge from the parent to this node.
    """
    num_leafs = getNumLeafs(my_tree)
    first_str = next(iter(my_tree))
    # Centre this node above its own leaves.  Example for the very first
    # call: with 6 leaves in the whole tree the root's x is
    # -0.5/6 + (1 + 6)/2.0/6 = 0.5; for the root this is always 0.5.
    center_pt = (plotTree.xOff + (1.0 + float(num_leafs)) / 2.0 / plotTree.totalW,
                 plotTree.yOff)
    plotMidText(center_pt, parent_pt, node_txt)
    plotNode(first_str, center_pt, parent_pt, decision_node)
    second_dict = my_tree[first_str]
    # Step one level down before drawing the children (depth-first drawing).
    plotTree.yOff -= 1.0 / plotTree.totalD
    for key, child in second_dict.items():
        # Same leaf test as getNumLeafs, so layout and counting agree.
        if type(child).__name__ == 'dict':
            plotTree(child, center_pt, str(key))
        else:
            # Each leaf takes the next horizontal slot.
            plotTree.xOff += 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), center_pt, leaf_node)
            plotMidText((plotTree.xOff, plotTree.yOff), center_pt, str(key))
    # Restore the vertical cursor for the caller's remaining siblings.
    plotTree.yOff += 1.0 / plotTree.totalD
if __name__ == '__main__':
    import pprint
    # Rebuild the lenses decision tree and draw it.
    dataSet = load(filename)
    feature_name = ['age', 'prescript', 'astigmatic', 'tearRate']
    # creat_Tree may delete entries from the list it is given, so hand it a
    # copy and keep feature_name complete.
    tree = creat_Tree(dataSet, feature_name[:])
    createPlot(tree)