Original code:
import math
import operator
def createDataSet():
    dataSet = [[0, 0, 0, 0, "no"],
               [0, 0, 0, 1, "no"],
               [0, 1, 0, 1, "yes"],
               [0, 1, 1, 0, "yes"],
               [0, 0, 0, 0, "no"],
               [1, 0, 0, 0, "no"],
               [1, 0, 0, 1, "no"],
               [1, 1, 1, 1, "yes"],
               [1, 0, 1, 2, "yes"],
               [1, 0, 1, 2, "yes"],
               [2, 0, 1, 2, "yes"],
               [2, 0, 1, 1, "yes"],
               [2, 1, 0, 1, "yes"],
               [2, 1, 0, 2, "no"],
               [2, 0, 0, 0, "no"]]
    # Feature names: age, has job, owns house, credit rating
    labels = ["年龄", "有工作", "有自己的房子", "信贷情况"]
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the empirical (Shannon) entropy of the data set."""
    # Number of rows in the data set
    numEntries = len(dataSet)
    # Collect all class labels (the last column)
    labels = [featVec[-1] for featVec in dataSet]
    # Deduplicate to get the distinct label values
    keys = set(labels)
    shannonEnt = 0.0
    for key in keys:
        # Relative frequency of each label
        prob = float(labels.count(key)) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
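# Worked example: this 15-row data set has 8 "yes" and 7 "no" class labels, so
# calcShannonEnt(dataSet) evaluates to -(8/15)*log2(8/15) - (7/15)*log2(7/15) ≈ 0.997.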
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index `axis` equals `value`,
    with that feature column removed."""
    retDataSet = []                               # the list of rows to return
    for featVec in dataSet:                       # iterate over the data set
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]       # drop the axis-th feature
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)     # keep the matching row
    return retDataSet                             # return the split-off subset
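# Example: splitDataSet(dataSet, 0, 0) keeps the five rows whose 年龄 value is 0
# and drops that column, returning [[0, 0, 0, "no"], [0, 0, 1, "no"], ...].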
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1             # the last column is the class label, not a feature
    baseEntropy = calcShannonEnt(dataSet)         # Shannon entropy of the whole data set
    bestInfoGain = 0.0                            # best information gain seen so far
    bestFeature = -1                              # index of the best feature
    for i in range(numFeatures):
        # All values of the i-th feature
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)                # distinct values of this feature
        newEntropy = 0.0                          # empirical conditional entropy
        for value in uniqueVals:                  # accumulate the conditional entropy
            subDataSet = splitDataSet(dataSet, i, value)    # subset after splitting on this value
            prob = len(subDataSet) / float(len(dataSet))    # probability of the subset
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy       # information gain of feature i
        # print(f"info gain of feature {i}: {infoGain}")
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
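# For this data set the third feature (index 2, 有自己的房子) has the largest
# information gain (≈ 0.54), so chooseBestFeatureToSplit(dataSet) returns 2.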
def majorityCnt(classList):
    """Return the most frequent element in classList."""
    classCount = {}
    keys = set(classList)
    for key in keys:
        classCount[key] = classList.count(key)
    # Sort by count in descending order
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
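# Example: majorityCnt(["yes", "no", "yes"]) returns "yes".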
def createTree(dataSet, labels, lab_sel):
    """Build the decision tree recursively."""
    # Class labels of all rows (grant the loan: yes or no)
    classList = [example[-1] for example in dataSet]
    # Stop splitting if every row has the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # If all features are used up, return the majority class
    if len(dataSet[0]) == 1 or len(labels) == 0:
        return majorityCnt(classList)
    # Index of the best feature to split on
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # Name of the best feature
    bestFeatLabel = labels[bestFeat]
    lab_sel.append(labels[bestFeat])
    # Grow the tree from the best feature's name
    myTree = {bestFeatLabel: {}}
    # Remove the used feature name
    del labels[bestFeat]
    # All values taken by the best feature in the training set
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, lab_sel)
    return myTree
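# With this data set the recursion should produce a tree of the form
# {'有自己的房子': {0: {'有工作': {0: 'no', 1: {'信贷情况': {1: 'yes', 2: 'no'}}}}, 1: 'yes'}}
# (dict key order may differ).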
def classify(inputTree, featLabels, testVec):
    """Walk the tree and return the predicted class for testVec."""
    firstStr = next(iter(inputTree))              # feature name at the root of this subtree
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)        # position of that feature in testVec
    classLabel = None                             # stays None if the value is not in the tree
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
if __name__ == '__main__':
    dataSet, labels = createDataSet()
    featLabels = labels[:]    # keep the original feature order; createTree mutates labels
    lab_sel = []
    myTree = createTree(dataSet, labels, lab_sel)
    print(myTree)
    print(lab_sel)
    # Test: testVec is indexed by the original feature order (年龄, 有工作, 有自己的房子, 信贷情况)
    testVec = [0, 1, 1, 2]
    result = classify(myTree, featLabels, testVec)
    print(result)
Rewritten code (NumPy version):
import numpy as np
import operator
def log(base, x):
    """Logarithm of x in the given base, via the change-of-base identity."""
    return np.log(x) / np.log(base)
def create_data_set():
    data_set = np.array([[0, 0, 0, 0, "no"],
                         [0, 0, 0, 1, "no"],
                         [0, 1, 0, 1, "yes"],
                         [0, 1, 1, 0, "yes"],
                         [0, 0, 0, 0, "no"],
                         [1, 0, 0, 0, "no"],
                         [1, 0, 0, 1, "no"],
                         [1, 1, 1, 1, "yes"],
                         [1, 0, 1, 2, "yes"],
                         [1, 0, 1, 2, "yes"],
                         [2, 0, 1, 2, "yes"],
                         [2, 0, 1, 1, "yes"],
                         [2, 1, 0, 1, "yes"],
                         [2, 1, 0, 2, "no"],
                         [2, 0, 0, 0, "no"]])
    # Feature names: age, has job, owns house, credit rating
    labels = ["年龄", "有工作", "有自己的房子", "信贷情况"]
    return data_set, labels
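# Note: because the array mixes integers with the strings "yes"/"no", NumPy
# stores every element as a string (a fixed-width <U... dtype), so feature
# values and tree keys in this version are "0"/"1"/"2" rather than integers.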
def calc_shannon_ent(data_set):
    """Compute the empirical (Shannon) entropy of the data set."""
    num_entries = len(data_set)
    labels = data_set[:, -1]                      # class labels are the last column
    # np.unique returns the distinct labels and the count of each
    keys, counts = np.unique(labels, return_counts=True)
    shannon_ent = 0.0
    for count in counts:
        prob = count / num_entries
        shannon_ent -= prob * log(2, prob)
    return shannon_ent
def split_data_set(data_set, axis, value):
    """Return the rows whose feature at index `axis` equals `value`,
    with that feature column removed."""
    mask = data_set[:, axis] == value             # boolean mask over the rows
    ret_data_set = np.delete(data_set[mask], axis, 1)   # drop the axis-th column
    return ret_data_set
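# Example: split_data_set(data_set, 0, "0") returns a 5x4 string array; the
# value must be given as a string because of the array's string dtype.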
def choose_best_feature_to_split(data_set):
    num_features = len(data_set[0]) - 1           # the last column is the class label, not a feature
    base_entropy = calc_shannon_ent(data_set)     # Shannon entropy of the whole data set
    best_info_gain = 0.0                          # best information gain seen so far
    best_feature = -1                             # index of the best feature
    for i in range(num_features):
        # All values of the i-th feature
        feat_list = data_set[:, i]
        unique_vals = np.unique(feat_list)        # distinct values of this feature
        new_entropy = 0.0                         # empirical conditional entropy
        for value in unique_vals:                 # accumulate the conditional entropy
            sub_data_set = split_data_set(data_set, i, value)
            prob = len(sub_data_set) / float(len(data_set))
            new_entropy += prob * calc_shannon_ent(sub_data_set)
        info_gain = base_entropy - new_entropy
        # print(f"info gain of feature {i}: {info_gain}")
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature
def majority_cnt(class_list):
    """Return the most frequent element in class_list."""
    class_count = {}
    keys = set(class_list)
    for key in keys:
        class_count[key] = class_list.count(key)
    # Sort by count in descending order
    sorted_class_count = sorted(class_count.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    return sorted_class_count[0][0]
def create_tree(data_set, labels, lab_sel):
    """Build the decision tree recursively."""
    # Class labels of all rows (grant the loan: yes or no)
    class_list = [example[-1] for example in data_set]
    # Stop splitting if every row has the same class
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # If all features are used up, return the majority class
    if len(data_set[0]) == 1 or len(labels) == 0:
        return majority_cnt(class_list)
    # Index of the best feature to split on
    best_feat = choose_best_feature_to_split(data_set)
    # Name of the best feature
    best_feat_label = labels[best_feat]
    lab_sel.append(labels[best_feat])
    # Grow the tree from the best feature's name
    my_tree = {best_feat_label: {}}
    # Remove the used feature name
    del labels[best_feat]
    # All values taken by the best feature in the training set
    feat_values = [example[best_feat] for example in data_set]
    unique_vals = set(feat_values)
    for value in unique_vals:
        sub_labels = labels[:]
        my_tree[best_feat_label][value] = create_tree(split_data_set(data_set, best_feat, value), sub_labels, lab_sel)
    return my_tree
def classify(input_tree, feat_labels, test_vec):
    """Walk the tree and return the predicted class for test_vec."""
    # The tree keys are strings (see the dtype note above), so convert the
    # test vector to strings before comparing.
    tv = [str(i) for i in test_vec]
    first_str = next(iter(input_tree))            # feature name at the root of this subtree
    second_dict = input_tree[first_str]
    feat_index = feat_labels.index(first_str)     # position of that feature in test_vec
    class_label = None                            # stays None if the value is not in the tree
    for key in second_dict.keys():
        if tv[feat_index] == key:
            if isinstance(second_dict[key], dict):
                class_label = classify(second_dict[key], feat_labels, test_vec)
            else:
                class_label = second_dict[key]
    return class_label
if __name__ == '__main__':
    data_set, labels = create_data_set()
    feat_labels = labels[:]    # keep the original feature order; create_tree mutates labels
    lab_sel = []
    my_tree = create_tree(data_set, labels, lab_sel)
    print(my_tree)
    print(lab_sel)
    # Test: test_vec is indexed by the original feature order (年龄, 有工作, 有自己的房子, 信贷情况)
    test_vec = [0, 1, 1, 2]
    result = classify(my_tree, feat_labels, test_vec)
    print(result)
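Run as-is, both versions should print the same tree structure (the NumPy version with string keys), the selected-feature list ['有自己的房子', '有工作', '信贷情况'], and the classification "yes" for the test vector.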