【机器学习】手动实现分类决策树（不用sk-learn）

最新推荐文章于 2023-06-30 15:18:36 发布

Sherryshinyy

最新推荐文章于 2023-06-30 15:18:36 发布

阅读量1k

点赞数 1

分类专栏：机器学习　文章标签：机器学习、决策树、python、可视化

本文链接：https://blog.csdn.net/qq_39001444/article/details/112301348

版权

机器学习专栏收录该内容

2 篇文章 0 订阅

订阅专栏

【机器学习】手动实现分类决策树（不用sk-learn）

不使用 scikit-learn（sklearn）库，一步一步手动实现分类决策树，并附带可视化。

树结构：二叉树结构+决策节点+叶子节点
度量：entropy或gini
构造树
读入数据：车辆轮廓分类（数据量：846，特征维度：18）
数据来源：CSDN - 一些用于聚类和分类问题的数据集分类数据35
训练
测试

下面就开始啦

树结构

二叉树结构

from graphviz import Digraph
import uuid
from random import sample


class BinaryTree:
    """Binary tree node that can draw its subtree with graphviz.

    ``key`` holds the payload of the node. ``print_tree`` treats a payload
    whose class is named ``DecisionNode`` as an internal node (it reads
    ``feature_name`` and ``feature_value``) and any other child payload as a
    leaf (it reads ``num`` and ``value``).
    """

    def __init__(self, rootObj):
        self.key = rootObj
        self.leftChild = None
        self.rightChild = None
        # Graph that print_tree() adds nodes/edges to; it is created once, so
        # calling print_tree() repeatedly keeps adding to the same graph.
        self.dot = Digraph(comment='Binary Tree')

    def insertLeft(self, newNode):
        """Attach ``newNode`` as the left child.

        An already present left child is pushed down and becomes the left
        child of ``newNode``.
        """
        if self.leftChild is not None:
            newNode.leftChild = self.leftChild
        self.leftChild = newNode

    def insertRight(self, newNode):
        """Attach ``newNode`` as the right child.

        An already present right child is pushed down and becomes the right
        child of ``newNode``.
        """
        if self.rightChild is not None:
            newNode.rightChild = self.rightChild
        self.rightChild = newNode

    # Visualization; by default the output goes to Binary_Tree.gv
    def print_tree(self, save_path='./Binary_Tree.gv', label=True):
        """Draw the tree rooted at this node and render it to ``save_path``.

        Args:
            save_path: Path handed to ``self.dot.render``.
            label: When true, edges to left children are labelled ``'Y'`` and
                edges to right children ``'N'``; otherwise edges are unlabelled.

        A root whose payload is a leaf (it has ``num`` and ``value``) is now
        drawn as a single node; previously such a tree produced an empty graph.
        A root with any other unknown payload still yields an empty graph.
        """
        # Fill colors for the nodes
        colors = ['skyblue', 'tomato', 'orange', 'purple', 'green', 'yellow', 'pink', 'red']

        def is_decision(key):
            # Compared by class name, so this class needs no reference to
            # the DecisionNode class itself.
            return type(key).__name__ == 'DecisionNode'

        def leaf_text(key):
            return str("samples = " + str(key.num) + "\n value = " + str(key.value))

        def child_text(key):
            if is_decision(key):
                return str(key.feature_name + " <= " + str(key.feature_value) + " ? ")
            return leaf_text(key)

        def print_node(node, node_tag):
            """Draw the children of ``node`` (already drawn as ``node_tag``)."""
            # One color per parent: both children of a node share it.
            color = sample(colors, 1)[0]
            for child, mark in ((node.leftChild, 'Y'), (node.rightChild, 'N')):
                if child is None:
                    continue
                child_tag = str(uuid.uuid1())  # unique graph id for the child
                self.dot.node(child_tag, child_text(child.key), style='filled', color=color)
                self.dot.edge(node_tag, child_tag, label=mark if label else '')
                print_node(child, child_tag)

        if is_decision(self.key):
            root_text = str(self.key.feature_name + "<=" + str(self.key.feature_value) + "?")
        elif hasattr(self.key, 'num') and hasattr(self.key, 'value'):
            root_text = leaf_text(self.key)
        else:
            root_text = None  # unknown payload: leave the graph empty

        if root_text is not None:
            root_tag = str(uuid.uuid1())  # root
            self.dot.node(root_tag, root_text, style='filled', color=sample(colors, 1)[0])
            print_node(self, root_tag)

        self.dot.render(save_path)

决策节点

class DecisionNode:
    """Payload of an internal tree node describing the feature it decides on."""

    def __init__(self, feature_id, feature_value, feature_name):
        # Id of the decision feature, its boundary value, and its name.
        self.feature_id, self.feature_value, self.feature_name = (
            feature_id,
            feature_value,
            feature_name,
        )

叶子节点

class Leaf:
    """Payload of a terminal tree node."""

    def __init__(self, value, num):
        self.value = value  # class name of the leaf
        self.num = num  # number of samples contained in the leaf
        # Map the class name to its sample count. This used to be the set
        # literal {value, num}, which loses which element is the label and
        # collapses to a single element when value == num.
        self.predict = {self.value: self.num}

分类度量方法

信息熵entropy

def entropy(data_set):
    """Return the Shannon entropy, in bits, of the labels in ``data_set``.

    The label of a row is its last element. An empty ``data_set`` has no
    labels and yields 0.0.
    """
    # Imported locally: the file never imported the ``log`` the original
    # body called, so the function raised NameError when used.
    from math import log2

    count = len(data_set)
    label_counts = {}

    # Count the number of rows of each class in the data set
    for row in data_set:
        label = row[-1]
        label_counts[label] = label_counts.get(label, 0) + 1

    # Accumulate under a separate name so the function is not shadowed.
    result = 0.0
    for occurrences in label_counts.values():
        prob = occurrences / count
        result -= prob * log2(prob)
    return result


# Select the feature with the highest gain base on entropy
def choose_best_feature_entropy(data_set):
    """Find the split with the largest entropy-based gain.

    Every distinct value of every feature column (all columns except the
    last) is tried as a boundary for ``split_data_set``.

    Returns:
        ``[feature_id, value]`` of the best split, or ``[-1, -1]`` when no
        candidate has a gain strictly greater than zero.
    """
    n_features = len(data_set[0]) - 1
    # Entropy of the data set before splitting
    parent_entropy = entropy(data_set)
    winner = [-1, -1]  # [id, value]
    winner_gain = 0.0

    for column in range(n_features):
        candidates = set([row[column] for row in data_set])
        for threshold in candidates:
            below, above = split_data_set(data_set, column, threshold)
            weight = len(below) / float(len(data_set))
            gain = parent_entropy - weight * entropy(below) - (1 - weight) * entropy(above)
            # Keep the candidate only if it strictly improves on the best so far
            if gain > winner_gain:
                winner_gain, winner = gain, [column, threshold]
    return winner

gini指数

def gini(data_set):
    """Return the Gini impurity of the labels in ``data_set``.

    The label of a row is its last element. With no rows there is nothing
    to subtract, so the result is 1.0.
    """
    total = len(data_set)

    # Tally how many rows carry each class label
    tally = {}
    for row in data_set:
        tally[row[-1]] = tally.get(row[-1], 0) + 1

    impurity = 1.0
    for occurrences in tally.values():
        share = float(occurrences) / total
        impurity -= share * share
    return impurity


# Select the feature with the highest gain base on gini
def choose_best_feature_gini(data_set):
    """Find the split with the largest reduction in Gini impurity.

    Every distinct value of every feature column (all columns except the
    last) is tried as a boundary for ``split_data_set``.

    Returns:
        ``[feature_id, value]`` of the best split, or ``[-1, -1]`` when no
        candidate has a gain strictly greater than zero.
    """
    n_features = len(data_set[0]) - 1
    parent_gini = gini(data_set)
    top_gain, top_split = 0.0, [-1, -1]

    for column in range(n_features):
        distinct_values = set([row[column] for row in data_set])
        for boundary in distinct_values:
            low_part, high_part = split_data_set(data_set, column, boundary)
            low_share = len(low_part) / float(len(data_set))
            gain = parent_gini - low_share * gini(low_part) - (1 - low_share) * gini(high_part)
            if gain > top_gain:
                top_gain = gain
                top_split = [column, boundary]
    return top_split

构造树

ENTROPY = False  # True: choose splits by entropy gain; False: choose splits by gini gain


def split_data_set(data_set, axis, value):
    """Split the data set on one column.

    Rows whose ``axis`` entry is ``<= value`` go to the first result, all
    other rows to the second. Both parts are returned as numpy arrays.
    """
    at_or_below, above = [], []
    for row in data_set:
        bucket = at_or_below if row[axis] <= value else above
        bucket.append(row)
    return np.array(at_or_below), np.array(above)


def count_value(rows):
    """Return the most frequent label in ``rows`` and its number of rows.

    The label of a row is its last element. When several labels are equally
    frequent, the one encountered first wins. For rows that all share one
    label this is that label and the row count, as before; previously mixed
    input returned whatever label the last row had, and empty input raised
    KeyError.

    Returns:
        ``(label, count)``; ``("", 0)`` for empty input.
    """
    count = {}
    for row in rows:
        # labels are in the last column
        label = row[-1]
        count[label] = count.get(label, 0) + 1

    if not count:
        return "", 0

    # max() over a dict walks keys in insertion order, so ties resolve to
    # the label seen first.
    label = max(count, key=count.get)
    return label, count[label]


# Create decision tree
# Create decision tree
def create_division_tree(data_set, feature_names, parent, left=False, right=False):
    """Recursively build the decision tree for ``data_set``.

    Args:
        data_set: Rows whose last element is the class label.
        feature_names: Display names indexed by feature id.
        parent: ``BinaryTree`` node to attach the new node to; only used
            when ``left`` or ``right`` is true.
        left: Attach the new node as the left child of ``parent``.
        right: Attach the new node as the right child of ``parent``.

    Returns:
        The ``BinaryTree`` node created for ``data_set``.
    """

    def attach(node):
        # Hook the node under its parent on the requested side, if any.
        if left:
            parent.insertLeft(node)
        elif right:
            parent.insertRight(node)
        return node

    class_list = [example[-1] for example in data_set]

    # Return a leaf node if all rows are in the same class
    if class_list.count(class_list[0]) == len(class_list):
        label, num = count_value(data_set)
        return attach(BinaryTree(Leaf(label, num)))

    if ENTROPY:
        best_feature = choose_best_feature_entropy(data_set)
    else:
        best_feature = choose_best_feature_gini(data_set)

    if best_feature[0] == -1:
        # No split has a positive gain (e.g. identical feature values with
        # different labels). Splitting on the [-1, -1] placeholder would
        # address the label column, so stop here with a majority-class leaf.
        votes = {}
        for label in class_list:
            votes[label] = votes.get(label, 0) + 1
        majority = max(votes, key=votes.get)
        return attach(BinaryTree(Leaf(majority, len(class_list))))

    print("best_feature:", feature_names[best_feature[0]], best_feature[1])
    leftData, rightData = split_data_set(data_set, best_feature[0], best_feature[1])

    # Create the current decision node and attach it before recursing
    tempNode = attach(
        BinaryTree(DecisionNode(best_feature[0], best_feature[1], feature_names[best_feature[0]])))
    create_division_tree(leftData, feature_names, tempNode, left=True)
    create_division_tree(rightData, feature_names, tempNode, right=True)
    return tempNode

读入数据

import numpy as np
import os
from random import randrange


def safe_float(number):
    """Convert ``number`` to ``float``; return it unchanged if it cannot be converted.

    Only the conversion failures ``float`` raises for unsuitable input
    (``ValueError`` for non-numeric text, ``TypeError`` for unsupported
    types) are handled, so unrelated exceptions such as
    ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        return float(number)
    except (TypeError, ValueError):
        return number


# data
data_root = "./data_classify/"
data_files = []
dataSet = []
classNum = 4
feature_names = ['COMPACTNESS', 'CIRCULARITY', 'DISTANCE CIRCULARITY', 'RADIUS RATIO', 'PR.AXIS ASPECT RATIO',
                 'MAX.LENGTH ASPECT RATIO', 'SCATTER RATIO', 'ELONGATEDNESS', 'PR.AXIS RECTANGULARITY',
                 'MAX.LENGTH RECTANGULARITY', 'SCALED VARIANCE ALONG MAJOR AXIS', 'SCALED VARIANCE ALONG MINOR AXIS',
                 'SCALED RADIUS OF GYRATION', 'SKEWNESS ABOUT MAJOR AXIS', 'SKEWNESS ABOUT MINOR AXIS',
                 'KURTOSIS ABOUT MINOR AXIS', 'KURTOSIS ABOUT MAJOR AXIS', 'HOLLOWS RATIO']
class_name = {'van': 0, 'saab': 1, 'bus': 2, 'opel': 3}
# Collect every file below data_root. Join with the directory currently being
# walked (root); joining with data_root produced wrong paths for files that
# live in sub-directories.
for root, dirs, files in os.walk(data_root):
    for file in files:
        data_files.append(os.path.join(root, file))
print(data_files)
for file in data_files:
    with open(file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines so no empty row ends up in the data set
                continue
            # Numeric fields become floats; anything else stays a string
            dataSet.append([safe_float(field) for field in fields])
print(dataSet)
# Convert the list to an array. dtype=object keeps the floats as floats:
# without it numpy coerces rows that mix floats and strings to an all-string
# array, and later "<=" comparisons would be lexicographic.
dataSet = np.array(dataSet, dtype=object)
print('dataSet: ', dataSet.shape)


# 将数据集随机分成n块，其中一块是测试集，其他n-1是训练集
def getTrainTest(dataSet, n_folds):
    """Randomly split ``dataSet`` into a training and a test part.

    The data is treated as ``n_folds`` blocks of ``int(len(dataSet) / n_folds)``
    rows; ``n_folds - 1`` blocks' worth of randomly drawn rows form the
    training set and all remaining rows form the test set.

    Returns:
        ``(train, test)`` as two lists.
    """
    quota = int(len(dataSet) / n_folds) * (n_folds - 1)
    pool = list(dataSet)
    picked = []
    # Rows are only drawn when at least one training block is requested.
    if n_folds > 1:
        for _ in range(quota):
            # pop() removes the chosen row from the pool and returns it,
            # so no row is drawn twice.
            picked.append(pool.pop(randrange(len(pool))))
    return picked, pool


def getDataSet():
    """Return the module-level data set, feature names, class mapping and class count."""
    bundle = (dataSet, feature_names, class_name, classNum)
    return bundle


# data: fetch the loaded data set and split it 5-fold into train/test arrays
dataSet, feature_names, class_name, classNum = getDataSet()
trainData, testData = map(np.array, getTrainTest(dataSet, 5))
print('trainData: ', trainData.shape)
print('testData: ', testData.shape)

训练

#  train
# Build the tree from the training split. parent is None because neither
# left nor right is set, so the node created by this call is not attached.
my_tree = create_division_tree(trainData, feature_names, None)

测试

def test(tree, data):
    """Classify one sample with the tree and return ``(predicted, actual)``.

    This file called ``test`` without defining it. The walk mirrors
    ``split_data_set``: rows whose feature value is ``<=`` the boundary
    follow the left child, all others the right child. The actual label is
    the last element of ``data``.
    """
    node = tree
    # Same class-name check as BinaryTree.print_tree uses for decision nodes.
    while type(node.key).__name__ == 'DecisionNode':
        decision = node.key
        if data[decision.feature_id] <= decision.feature_value:
            node = node.leftChild
        else:
            node = node.rightChild
    return node.key.value, data[-1]


# Rows index the real class, columns the predicted class
confusion = np.zeros((classNum, classNum))
#  test
true_num = 0
for data in testData:
    pre, real = test(my_tree, data)
    if pre == real:
        true_num += 1
    confusion[class_name[real]][class_name[pre]] += 1
print(confusion)
# Guard against an empty test set instead of dividing by zero
acc = true_num / len(testData) if len(testData) else 0.0
ACC = str("%.03f" % acc)
print("accuracy=" + ACC)

保存可视化模型

# save results
# The file names carry the impurity measure in use and the accuracy string.
# NOTE(review): plt_confusion_matrix is neither defined nor imported in the
# visible part of this file - confirm where it comes from.
save_path = str(('./figures/my_classifier_entropy_confusion_' if ENTROPY
                 else './figures/my_classifier_gini_confusion_') + ACC + '.png')
plt_confusion_matrix(confusion, class_name.keys(), save_path)

save_path = str(('./figures/my_classifier_entropy_tree_' if ENTROPY
                 else './figures/my_classifier_gini_tree_') + ACC + '.gv')
my_tree.print_tree(save_path=save_path)