# ID3 decision tree (此处为ID3决策树)
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
# Toy data set: two binary feature columns followed by the class label column
# ('fish'), which is kept as the last column.
data = DataFrame(
    {
        'no surfacing': [1, 1, 1, 0, 0],
        'flippers': [1, 1, 0, 1, 1],
        'fish': ['yes', 'yes', 'no', 'no', 'no'],
    }
)
# Encode the label as an integer: 1 where the raw value is 'yes', otherwise 0.
data['fish'] = data['fish'].eq('yes').astype(int)
def get_entropy(dataset):
    '''
    Compute the base-2 Shannon entropy of the label column.

    The label column is taken to be the last column of ``dataset``.

    :param dataset: DataFrame whose last column holds the class labels.
    :return: sum over the distinct labels of ``-p * log2(p)``, where ``p`` is
        the label's count divided by the number of rows.
    '''
    row_count = len(dataset)
    labels = dataset.iloc[:, -1]
    # value_counts() yields one count per distinct label value.
    probabilities = labels.value_counts() / row_count
    weighted_logs = probabilities * np.log2(probabilities)
    return sum(-weighted_logs)
def best_split(dataset):
    '''
    Choose the feature column whose split gives the largest information gain.

    Every column except the last one (the label column) is a candidate. The
    gain of a column is the label entropy of the whole data set minus the
    row-weighted entropy of the subsets obtained by grouping on that column.

    :param dataset: DataFrame with the feature columns first and the label
        column last.
    :return: positional index of the winning feature column, or -1 when
        there is no feature column to examine.
    '''
    base_entropy = get_entropy(dataset)
    total_rows = dataset.shape[0]
    feature_count = dataset.shape[1] - 1

    winner = -1
    winner_gain = -1
    for position in range(feature_count):
        feature = dataset.iloc[:, position]
        # Weighted entropy of the label after partitioning on this feature;
        # each distinct value contributes (subset share) * (subset entropy).
        subsets = (
            dataset[feature == level]
            for level in feature.value_counts().index
        )
        conditional_entropy = sum(
            subset.shape[0] / total_rows * get_entropy(subset)
            for subset in subsets
        )
        gain = base_entropy - conditional_entropy
        # Strict comparison keeps the earliest column when gains tie.
        if gain > winner_gain:
            winner, winner_gain = position, gain
    return winner
def split_by_column(dataset, column, value):
    '''
    Keep the rows where one column equals ``value`` and drop that column.

    :param dataset: DataFrame to filter; it is not modified.
    :param column: positional index of the column to test and then remove.
    :param value: value the column must equal for a row to be kept.
    :return: new DataFrame holding the matching rows without that column.
    '''
    column_name = dataset.columns[column]
    matching_rows = dataset[dataset[column_name] == value]
    return matching_rows.drop(columns=column_name)
def create_tree(dataset):
    '''
    Recursively build an ID3 decision tree as nested dictionaries.

    The last column of ``dataset`` is the class label; all other columns are
    features. At each level the feature chosen by ``best_split`` becomes the
    node, and one branch is created per distinct value of that feature.

    :param dataset: non-empty DataFrame with feature columns first and the
        label column last.
    :return: a class label when the node is a leaf, otherwise a dict of the
        form ``{feature_name: {feature_value: subtree_or_label}}``.
    :raises ValueError: if ``dataset`` has no rows.
    '''
    if dataset.shape[0] == 0:
        raise ValueError('cannot build a decision tree from an empty dataset')

    # value_counts() sorts by descending frequency, so position 0 is the
    # most common class.
    class_counts = dataset.iloc[:, -1].value_counts()

    # Leaf: no feature column is left, or every row has the same label.
    # .iloc[0] is used because the index of class_counts holds the label
    # values themselves; with integer labels, class_counts[0] would look up
    # the label 0 and raise KeyError when that label is absent.
    if dataset.shape[1] == 1 or class_counts.iloc[0] == dataset.shape[0]:
        return class_counts.index[0]

    best_column = best_split(dataset)  # position of the feature to split on
    best_feature = dataset.columns[best_column]

    # One branch per distinct value of the chosen feature.
    branches = {}
    for value in set(dataset.iloc[:, best_column]):
        subset = split_by_column(dataset, best_column, value)
        branches[value] = create_tree(subset)
    return {best_feature: branches}
def save_tree(tree):
    '''
    Write ``tree`` to the file ``./my_tree.npy`` with ``np.save``.

    :param tree: the object to save.
    :return: None
    '''
    output_path = './my_tree.npy'
    np.save(output_path, tree)
def classify(tree, all_columns, test_data):
    '''
    Predict the class label of one sample by walking a trained decision tree.

    :param tree: nested dict of the form
        ``{feature_name: {feature_value: subtree_or_label}}``, or a bare
        label when the whole tree is a single leaf.
    :param all_columns: list of column names; the position of a feature name
        in this list is the position of its value in ``test_data``.
    :param test_data: the sample's feature values, indexable by position
        (list, tuple or pandas Series).
    :return: the predicted label, or ``None`` when the sample's value for a
        feature has no matching branch in the tree.
    '''
    # A tree that is not a dict is already a leaf label.
    if not isinstance(tree, dict):
        return tree

    current_node = next(iter(tree))
    branches = tree[current_node]
    feature_index = all_columns.index(current_node)

    # feature_index is a position. A pandas Series indexed with [] treats an
    # integer as a label, so use .iloc when the sample provides it.
    if hasattr(test_data, 'iloc'):
        feature_value = test_data.iloc[feature_index]
    else:
        feature_value = test_data[feature_index]

    for key, child in branches.items():
        if feature_value == key:
            # Descend while the branch is another node; otherwise it is a leaf.
            if isinstance(child, dict):
                return classify(child, all_columns, test_data)
            return child

    # The value was not seen for this feature during training.
    return None
def score(train_data, test_data):
    '''
    Train a decision tree on ``train_data`` and report accuracy on ``test_data``.

    Both frames must have the feature columns first and the label column
    last, with the columns in the same order.

    :param train_data: DataFrame used to build the tree.
    :param test_data: DataFrame whose rows are classified and compared with
        their own label column.
    :return: fraction of test rows whose predicted label equals the true
        label; the same value is also printed.
    '''
    tree = create_tree(train_data)
    all_columns = list(train_data.columns)

    # Hand each row to classify() as a plain tuple so that positional
    # indexing is unambiguous.
    features = test_data.iloc[:, :-1]
    predictions = [
        classify(tree, all_columns, row)
        for row in features.itertuples(index=False, name=None)
    ]

    # Reuse the test index: comparing two Series with == requires identical
    # labels, and test_data may be a slice whose index does not start at 0.
    predicted = Series(predictions, index=test_data.index)
    accuracy = (test_data.iloc[:, -1] == predicted).mean()
    print(accuracy)
    return accuracy
if __name__ == '__main__':
    # Train on the full toy data set and evaluate on its first three rows.
    train_data = data
    test_data = train_data.head(3)
    score(train_data=train_data, test_data=test_data)