from math import log
import operator
def calc_entropy(labels):
    """
    Compute the Shannon entropy of a collection of class labels.

    :param labels: list of class labels for a dataset
    :return: entropy in bits; 0.0 for an empty or single-class collection
    """
    label_num = len(labels)
    if label_num == 0:
        # Guard: avoid ZeroDivisionError on an empty label set.
        return 0.0
    label_show_up_times_dict = {}
    for label in labels:
        # Count how many times each label appears.
        label_show_up_times_dict[label] = label_show_up_times_dict.get(label, 0) + 1
    entropy = 0.0
    for count in label_show_up_times_dict.values():
        prob = float(count) / label_num
        # Shannon entropy is -sum(p * log2(p)). Bug fix: the original used
        # `entropy += prob * log(prob, 2)`, yielding a negative "entropy".
        entropy -= prob * log(prob, 2)
    return entropy
def split_dataset(dataset, labels, index, value):
    """
    Extract the rows whose feature at position ``index`` equals ``value``.

    The matched feature column is removed from every returned row, and the
    labels of the matching rows are returned alongside.

    :param dataset: list of feature vectors (lists)
    :param labels: class labels aligned row-for-row with ``dataset``
    :param index: feature position to test
    :param value: feature value to match
    :return: tuple ``(sub_dataset, sub_labels)``
    """
    sub_dataset = []
    sub_labels = []
    for fc_index, fc in enumerate(dataset):
        if fc[index] == value:
            # Drop the feature column used for this split before keeping
            # the row.
            tmp = fc[:index]
            tmp.extend(fc[index + 1:])
            sub_dataset.append(tmp)
            # Bug fix: the original appended labels[index] (the label at the
            # *feature* position) instead of the label of the matching row.
            sub_labels.append(labels[fc_index])
    return sub_dataset, sub_labels
def select_best_attribute(dataset, labels):
    """
    Select the feature index with the highest information gain (ID3).

    :param dataset: list of feature vectors (lists)
    :param labels: class labels aligned row-for-row with ``dataset``
    :return: index of the best feature to split on
    """
    # Number of features per row.
    feature_num = len(dataset[0])
    # Entropy of the whole set before any split.
    base_entropy = calc_entropy(labels)
    # Best (largest) information gain seen so far.
    max_info_gain = -1
    best_feature = -1
    for i in range(feature_num):
        # All values appearing at feature position i.
        feature_value_list = [example[i] for example in dataset]
        # Distinct values of this feature.
        unique_vals = set(feature_value_list)
        # Expected (weighted) entropy after splitting on feature i.
        new_entropy = 0.0
        for value in unique_vals:
            # Bug fix: split_dataset takes (dataset, labels, index, value);
            # the original omitted the ``labels`` argument.
            sub_dataset, sub_labels = split_dataset(dataset, labels, i, value)
            # Fraction of rows that fall into this branch.
            prob = float(len(sub_dataset)) / len(dataset)
            new_entropy += prob * calc_entropy(sub_labels)
        # Information gain of splitting on feature i.
        info_gain = base_entropy - new_entropy
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = i
    return best_feature
def majority_count(labels):
    """
    Return the label that appears most often.

    :param labels: non-empty list of class labels
    :return: the most frequent label
    """
    label_count = {}
    for vote in labels:
        # Tally votes per label.
        label_count[vote] = label_count.get(vote, 0) + 1
    # Bug fixes: dict.iteritems() does not exist in Python 3 (use .items()),
    # the keyword for sorted() is ``reverse`` (not ``reversed``), and callers
    # expect a single majority label rather than the whole sorted list.
    sorted_class_count = sorted(label_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
def decision_tree(dataset, feature_names, labels):
    """
    Build a decision tree (represented as nested dicts) using ID3.

    :param dataset: list of feature vectors (lists)
    :param feature_names: feature names aligned with the columns of
        ``dataset``; not mutated — a reduced copy is passed to each
        recursive call
    :param labels: class labels aligned row-for-row with ``dataset``
    :return: a label (leaf) or a nested dict ``{feature_name: {value: subtree}}``
    """
    if labels.count(labels[0]) == len(labels):
        # All samples share one class: this branch is pure, stop splitting.
        return labels[0]
    if len(dataset[0]) == 1:
        # Only one feature left: fall back to the majority class.
        # Bug fix: the original wrote len[dataset[0]] (TypeError).
        return majority_count(labels)
    # Pick the feature with the highest information gain for the root.
    best_feature_index = select_best_attribute(dataset, labels)
    best_feature_name = feature_names[best_feature_index]
    tree = {best_feature_name: {}}
    # Bug fix: build a reduced copy instead of del(feature_names[...]),
    # which mutated the caller's list.
    sub_feature_names = feature_names[:best_feature_index] + feature_names[best_feature_index + 1:]
    attr_values = [example[best_feature_index] for example in dataset]
    for value in set(attr_values):
        # Bug fixes: pass ``labels`` to split_dataset (it takes 4 args), and
        # forward the reduced feature-name list to the recursive call (the
        # original omitted the feature_names argument entirely).
        sub_dataset, sub_labels = split_dataset(dataset, labels, best_feature_index, value)
        tree[best_feature_name][value] = decision_tree(sub_dataset, sub_feature_names[:], sub_labels)
    return tree