【机器学习】手动实现分类决策树 (不用sk-learn)
不使用 scikit-learn 库,一步一步手动实现分类决策树,并附带可视化~
- 树结构:二叉树结构+决策节点+叶子节点
- 度量:entropy或gini
- 构造树
- 读入数据:车辆轮廓分类(数据量:846,特征维度:18)
数据来源:CSDN - 一些用于聚类和分类问题的数据集(分类数据35)
- 训练
- 测试
下面就开始啦
树结构
二叉树结构
import uuid
from collections import Counter
from math import log2
from random import sample

from graphviz import Digraph
class BinaryTree:
    """Binary tree node that can draw the tree rooted at itself with Graphviz.

    Attributes:
        key: Payload of the node. ``print_tree`` treats an object whose type
            is named ``DecisionNode`` (read via ``feature_name`` and
            ``feature_value``) as an internal node and anything else as a
            leaf (read via ``num`` and ``value``).
        leftChild: Left subtree (a ``BinaryTree``) or ``None``.
        rightChild: Right subtree (a ``BinaryTree``) or ``None``.
        dot: Graph object that ``print_tree`` draws into and renders.
    """

    def __init__(self, rootObj):
        self.key = rootObj
        self.leftChild = None
        self.rightChild = None
        self.dot = Digraph(comment='Binary Tree')

    def insertLeft(self, newNode):
        """Make ``newNode`` the left child.

        If a left child already exists it is pushed down and becomes the left
        child of ``newNode``.
        """
        if self.leftChild is not None:
            newNode.leftChild = self.leftChild
        self.leftChild = newNode

    def insertRight(self, newNode):
        """Make ``newNode`` the right child.

        If a right child already exists it is pushed down and becomes the
        right child of ``newNode``.
        """
        if self.rightChild is not None:
            newNode.rightChild = self.rightChild
        self.rightChild = newNode

    def print_tree(self, save_path='./Binary_Tree.gv', label=True):
        """Draw the tree rooted at this node and render it to ``save_path``.

        Args:
            save_path: Path handed to ``self.dot.render``.
            label: When true, edges to left children are labelled ``'Y'`` and
                edges to right children ``'N'``; otherwise edges are unlabelled.

        Nodes are added to ``self.dot``, so drawing the same tree twice on the
        same instance adds every node twice.
        """
        # Fill colours; one is drawn at random per parent, so siblings match.
        colors = ['skyblue', 'tomato', 'orange', 'purple', 'green', 'yellow', 'pink', 'red']

        def describe(key, is_root=False):
            """Return the text shown inside the drawn node for ``key``."""
            # Compared by type name so this class does not depend on the
            # DecisionNode class being defined before it.
            if type(key).__name__ == 'DecisionNode':
                if is_root:
                    return str(key.feature_name + "<=" + str(key.feature_value) + "?")
                return str(key.feature_name + " <= " + str(key.feature_value) + " ? ")
            return str("samples = " + str(key.num) + "\n value = " + str(key.value))

        def draw_children(node, node_tag):
            """Draw both subtrees of ``node``, whose own graph id is ``node_tag``."""
            color = sample(colors, 1)[0]
            for child, answer in ((node.leftChild, 'Y'), (node.rightChild, 'N')):
                if child is None:
                    continue
                child_tag = str(uuid.uuid1())  # unique graph id for this node
                self.dot.node(child_tag, describe(child.key), style='filled', color=color)
                self.dot.edge(node_tag, child_tag, label=answer if label else '')
                draw_children(child, child_tag)

        # The root is drawn whatever its payload is, so a tree consisting of a
        # single leaf still produces a one-node picture.
        root_tag = str(uuid.uuid1())
        self.dot.node(root_tag, describe(self.key, is_root=True),
                      style='filled', color=sample(colors, 1)[0])
        draw_children(self, root_tag)
        self.dot.render(save_path)
决策节点
class DecisionNode:
    """Internal tree node payload describing one threshold test on a feature."""

    def __init__(self, feature_id, feature_value, feature_name):
        # Column index of the tested feature, the boundary value it is
        # compared against, and the feature's display name.
        self.feature_id, self.feature_value, self.feature_name = (
            feature_id,
            feature_value,
            feature_name,
        )
叶子节点
class Leaf:
    """Terminal tree node payload: a class label and a row count."""

    def __init__(self, value, num):
        # value: class name of the leaf; num: amount of data the leaf contains.
        self.value, self.num = value, num
        # NOTE(review): this is a set literal holding both items, not a
        # {label: count} mapping - confirm which one consumers expect.
        self.predict = {value, num}
分类度量方法
信息熵entropy
def entropy(data_set):
    """Return the Shannon entropy, in bits, of the class labels in ``data_set``.

    Args:
        data_set: Sequence of rows; the last element of each row is its label.

    Returns:
        The entropy as a float. An empty ``data_set`` yields ``0.0``.
    """
    total = len(data_set)
    # Number of rows per class label.
    label_counts = Counter(row[-1] for row in data_set)
    # Named ``result`` so the local does not shadow this function.
    result = 0.0
    for occurrences in label_counts.values():
        prob = occurrences / total
        result -= prob * log2(prob)
    return result
# Select the feature with the highest gain base on entropy
def choose_best_feature_entropy(data_set):
    """Pick the threshold split with the largest entropy-based gain.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a ``<=`` threshold.

    Args:
        data_set: Non-empty sequence of rows whose last element is the label.

    Returns:
        ``[feature_index, threshold]`` of the best split, or ``[-1, -1]`` when
        no candidate has a gain greater than zero.
    """
    row_total = float(len(data_set))
    n_features = len(data_set[0]) - 1
    # Entropy of the unsplit data, used as the baseline for the gain.
    parent_entropy = entropy(data_set)

    winner = [-1, -1]  # [id, value]
    top_gain = 0.0
    for column in range(n_features):
        candidates = set([row[column] for row in data_set])
        for threshold in candidates:
            below, above = split_data_set(data_set, column, threshold)
            weight = len(below) / row_total
            gain = parent_entropy - weight * entropy(below) - (1 - weight) * entropy(above)
            # Strict comparison: the first candidate reaching a gain wins ties.
            if gain > top_gain:
                top_gain, winner = gain, [column, threshold]
    return winner
gini指数
def gini(data_set):
    """Return the Gini impurity of the class labels in ``data_set``.

    Args:
        data_set: Sequence of rows; the last element of each row is its label.

    Returns:
        ``1`` minus the sum of squared class proportions. An empty
        ``data_set`` yields ``1.0``.
    """
    total = len(data_set)
    # Tally how many rows carry each label.
    tally = {}
    for record in data_set:
        tally[record[-1]] = tally.get(record[-1], 0) + 1

    impurity = 1.0
    for occurrences in tally.values():
        share = float(occurrences) / total
        impurity -= share * share
    return impurity
# Select the feature with the highest gain base on gini
def choose_best_feature_gini(data_set):
    """Pick the threshold split with the largest reduction in Gini impurity.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a ``<=`` threshold.

    Args:
        data_set: Non-empty sequence of rows whose last element is the label.

    Returns:
        ``[feature_index, threshold]`` of the best split, or ``[-1, -1]`` when
        no candidate has a gain greater than zero.
    """
    row_total = float(len(data_set))
    n_features = len(data_set[0]) - 1
    # Impurity of the unsplit data, used as the baseline for the gain.
    parent_gini = gini(data_set)

    chosen = [-1, -1]
    highest = 0.0
    for idx in range(n_features):
        distinct_values = set([row[idx] for row in data_set])
        for cut in distinct_values:
            lower, upper = split_data_set(data_set, idx, cut)
            fraction = len(lower) / row_total
            gain = parent_gini - fraction * gini(lower) - (1 - fraction) * gini(upper)
            # Strict comparison: the first candidate reaching a gain wins ties.
            if gain > highest:
                highest, chosen = gain, [idx, cut]
    return chosen
构造树
# Split criterion switch read by create_division_tree: True uses the
# entropy-based chooser, False uses the Gini-based one.
ENTROPY = False
def split_data_set(data_set, axis, value):
    """Partition rows by comparing one column against a threshold.

    Args:
        data_set: Iterable of indexable rows.
        axis: Index of the column to test.
        value: Threshold; rows with ``row[axis] <= value`` go to the left part.

    Returns:
        A ``(left, right)`` pair of NumPy arrays built from the two row lists.
    """
    lower_rows, upper_rows = [], []
    for row in data_set:
        # Route the row to whichever side of the threshold it falls on.
        bucket = lower_rows if row[axis] <= value else upper_rows
        bucket.append(row)
    return np.array(lower_rows), np.array(upper_rows)
def count_value(rows):
    """Return the label of the last row and how many rows carry that label.

    Labels are read from the last element of each row. The result describes
    the label of the final row, which is the only label when every row belongs
    to the same class.

    Args:
        rows: Iterable of rows (the whole data set).

    Returns:
        A ``(label, count)`` pair.

    Raises:
        KeyError: If ``rows`` is empty.
    """
    tally = {}
    last_label = ""
    for record in rows:
        last_label = record[-1]
        # Start a label at zero the first time it is seen, then add one.
        tally[last_label] = tally.get(last_label, 0) + 1
    return last_label, tally[last_label]
# Create decision tree
def create_division_tree(data_set, feature_names, parent, left=False, right=False):
    """Recursively build a decision tree and return the subtree's root node.

    Args:
        data_set: Non-empty sequence of rows whose last element is the label.
        feature_names: Display names indexed by feature column.
        parent: ``BinaryTree`` node to attach the new subtree to; it is only
            used when ``left`` or ``right`` is true, so the root call may
            pass ``None``.
        left: Attach the new subtree as ``parent``'s left child.
        right: Attach the new subtree as ``parent``'s right child (only
            consulted when ``left`` is false).

    Returns:
        The ``BinaryTree`` node created for ``data_set``.

    Raises:
        ValueError: If ``data_set`` is empty.
    """
    class_list = [example[-1] for example in data_set]
    if not class_list:
        raise ValueError("cannot build a decision tree from an empty data set")

    def attach(node):
        """Hook ``node`` under ``parent`` on the requested side and return it."""
        if left:
            parent.insertLeft(node)
        elif right:
            parent.insertRight(node)
        return node

    majority_label, majority_count = Counter(class_list).most_common(1)[0]

    # Return a leaf node if all data are in the same class.
    if majority_count == len(class_list):
        return attach(BinaryTree(Leaf(majority_label, len(class_list))))

    if ENTROPY:
        best_feature = choose_best_feature_entropy(data_set)
    else:
        best_feature = choose_best_feature_gini(data_set)

    # The choosers answer [-1, -1] when no threshold has a positive gain
    # (e.g. rows with identical features but different labels). Using that
    # as a real split would test the label column, so stop with a leaf
    # carrying the most common label instead.
    if best_feature[0] == -1:
        return attach(BinaryTree(Leaf(majority_label, len(class_list))))

    print("best_feature:", feature_names[best_feature[0]], best_feature[1])
    leftData, rightData = split_data_set(data_set, best_feature[0], best_feature[1])

    # Create the current decision node and hook it under its parent.
    tempNode = attach(
        BinaryTree(DecisionNode(best_feature[0], best_feature[1], feature_names[best_feature[0]])))

    # Grow both branches below the new decision node.
    create_division_tree(leftData, feature_names, tempNode, left=True)
    create_division_tree(rightData, feature_names, tempNode, right=True)
    return tempNode
读入数据
import numpy as np
import os
from random import randrange
def safe_float(number):
    """Convert ``number`` to ``float`` when possible, else return it unchanged.

    Args:
        number: Any value, typically a text field read from a data file.

    Returns:
        ``float(number)`` if the conversion succeeds, otherwise ``number``
        itself (for example a class-name string).
    """
    try:
        return float(number)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed, so KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return number
# data: location of the raw files plus the metadata describing their columns
data_root = "./data_classify/"
data_files = []
dataSet = []
classNum = 4
feature_names = ['COMPACTNESS', 'CIRCULARITY', 'DISTANCE CIRCULARITY', 'RADIUS RATIO', 'PR.AXIS ASPECT RATIO',
                 'MAX.LENGTH ASPECT RATIO', 'SCATTER RATIO', 'ELONGATEDNESS', 'PR.AXIS RECTANGULARITY',
                 'MAX.LENGTH RECTANGULARITY', 'SCALED VARIANCE ALONG MAJOR AXIS', 'SCALED VARIANCE ALONG MINOR AXIS',
                 'SCALED RADIUS OF GYRATION', 'SKEWNESS ABOUT MAJOR AXIS', 'SKEWNESS ABOUT MINOR AXIS',
                 'KURTOSIS ABOUT MINOR AXIS', 'KURTOSIS ABOUT MAJOR AXIS', 'HOLLOWS RATIO']
class_name = {'van': 0, 'saab': 1, 'bus': 2, 'opel': 3}

# Collect every file below data_root. Each name is joined with the directory
# os.walk is currently visiting so files in sub-directories resolve correctly.
for root, dirs, files in os.walk(data_root):
    for file in files:
        data_files.append(os.path.join(root, file))
print(data_files)

# Parse whitespace-separated rows; numeric fields become floats and any other
# field (the class name) stays a string.
for file in data_files:
    with open(file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line would otherwise add an empty row.
                continue
            dataSet.append([safe_float(field) for field in fields])
print(dataSet)

# Convert the list to an array. dtype=object keeps the floats as floats:
# without it NumPy turns a mixed float/str table into an all-string array,
# and every "<=" threshold test then compares text lexicographically.
dataSet = np.array(dataSet, dtype=object)
print('dataSet: ', dataSet.shape)
def getTrainTest(dataSet, n_folds):
    """Randomly split ``dataSet`` into a training part and a test part.

    The data is treated as ``n_folds`` equal blocks: one block's worth of rows
    (plus any remainder) becomes the test set and the other ``n_folds - 1``
    blocks become the training set.

    Args:
        dataSet: Sequence of rows. It is copied, not modified.
        n_folds: Number of blocks; must be at least 1.

    Returns:
        A ``(train, test)`` pair of lists of rows.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")
    train_size = int(len(dataSet) / n_folds) * (n_folds - 1)
    remaining = list(dataSet)
    train = []
    # Move randomly chosen rows into the training set until it is full;
    # whatever is left over is the test set.
    while len(train) < train_size:
        train.append(remaining.pop(randrange(len(remaining))))
    return train, remaining
def getDataSet():
    """Return the module-level data set together with its metadata.

    Returns:
        The tuple ``(dataSet, feature_names, class_name, classNum)``.
    """
    bundle = (dataSet, feature_names, class_name, classNum)
    return bundle
# data: fetch the loaded table, then hold out one fifth of it for testing
dataSet, feature_names, class_name, classNum = getDataSet()
# Both parts come back as lists of rows; turn each into an array.
trainData, testData = (np.array(part) for part in getTrainTest(dataSet, 5))
for caption, part in (('trainData: ', trainData), ('testData: ', testData)):
    print(caption, part.shape)
训练
# train: build the decision tree from the training rows; parent is None
# because this call creates the root node
my_tree = create_division_tree(trainData, feature_names, None)
测试
# test: classify every held-out row and tally the outcome.
# confusion[i, j] counts rows whose real class index is i and whose
# predicted class index is j.
confusion = np.zeros((classNum, classNum))
true_num = 0
for data in testData:
    pre, real = test(my_tree, data)
    true_num += 1 if pre == real else 0
    confusion[class_name[real], class_name[pre]] += 1
print(confusion)

# Fraction of correctly classified rows, also kept as text with three decimals.
acc = true_num / len(testData)
ACC = "%.03f" % acc
print("accuracy=" + ACC)
保存可视化模型
# save results: the confusion-matrix figure first, then the drawn tree; both
# file names carry the criterion in use and the measured accuracy
save_path = (
    str('./figures/my_classifier_entropy_confusion_' + ACC + '.png')
    if ENTROPY
    else str('./figures/my_classifier_gini_confusion_' + ACC + '.png')
)
plt_confusion_matrix(confusion, class_name.keys(), save_path)

save_path = (
    str('./figures/my_classifier_entropy_tree_' + ACC + '.gv')
    if ENTROPY
    else str('./figures/my_classifier_gini_tree_' + ACC + '.gv')
)
my_tree.print_tree(save_path=save_path)
完整代码:https://github.com/shxy522/DecisionTree
如有不正确的地方,希望大家不吝赐教~