You can study the theory on your own; the code here implements the decision-tree logic from scratch, without using sklearn.
import numpy as np
import pandas as pd
from collections import Counter
class DecisionTree:
    class Node:
        def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
            self.feature_index = feature_index  # index of the feature used for the split
            self.threshold = threshold          # split threshold
            self.value = value                  # predicted value for a leaf node
            self.left = left                    # left subtree
            self.right = right                  # right subtree

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum number of samples required to split
        self.root = None                            # root node
    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        # stopping conditions: max depth reached, too few samples, or a pure node
        if depth == self.max_depth or n_samples < self.min_samples_split or n_classes == 1:
            value = self._most_common_label(y)
            return self.Node(value=value)
        best_feature_index, best_threshold = self._find_best_split(X, y)
        # stop if no valid split was found
        if best_feature_index is None or best_threshold is None:
            value = self._most_common_label(y)
            return self.Node(value=value)
        left_indices = X[:, best_feature_index] < best_threshold
        right_indices = ~left_indices
        # recursively build the subtrees
        left_branch = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_branch = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return self.Node(feature_index=best_feature_index, threshold=best_threshold,
                         left=left_branch, right=right_branch)
    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape
        best_info_gain = -1
        best_feature_index = None
        best_threshold = None
        entropy_parent = self._entropy(y)  # entropy of the parent node
        for feature_index in range(n_features):
            unique_values = np.unique(X[:, feature_index])
            for threshold in unique_values:
                left_indices = X[:, feature_index] < threshold
                right_indices = ~left_indices
                # skip splits that leave one side empty
                if not left_indices.any() or not right_indices.any():
                    continue
                # information gain of this candidate split
                info_gain = self._information_gain(entropy_parent, y[left_indices], y[right_indices])
                if info_gain > best_info_gain:  # keep the split with the largest information gain
                    best_info_gain = info_gain
                    best_feature_index = feature_index
                    best_threshold = threshold
        return best_feature_index, best_threshold
    def _entropy(self, y):
        # entropy of a label array: -sum(p * log2(p)) over the class probabilities
        counter = Counter(y)
        probabilities = [count / len(y) for count in counter.values()]
        entropy = -sum(p * np.log2(p) for p in probabilities)
        return entropy

    def _information_gain(self, entropy_parent, y_left, y_right):
        # information gain = parent entropy minus the weighted average of the child entropies
        n_total = len(y_left) + len(y_right)
        p_left, p_right = len(y_left) / n_total, len(y_right) / n_total
        info_gain = entropy_parent - (p_left * self._entropy(y_left) + p_right * self._entropy(y_right))
        return info_gain

    def _most_common_label(self, y):
        # majority class, used as the leaf prediction
        counter = Counter(y)
        most_common = counter.most_common(1)
        return most_common[0][0]
    def predict(self, X):
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        # walk down the tree until a leaf is reached
        if node.value is not None:
            return node.value
        if x[node.feature_index] < node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)
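Before wiring up the data, the entropy and information-gain helpers can be sanity-checked directly on small label arrays. The snippet below is a hedged illustration, not part of the original script; the values are made up just to show the expected behaviour.

# Hypothetical sanity check of the helpers (illustrative values only).
demo = DecisionTree()
print(demo._entropy(np.array([0, 0, 1, 1])))   # 1.0 bit for a 50/50 label split
print(demo._entropy(np.array([0, 0, 0, 1])))   # about 0.811 bits for a 3:1 split
# parent entropy of 1.0 (50/50), perfectly separated children -> gain of 1.0
print(demo._information_gain(1.0, np.array([0, 0]), np.array([1, 1])))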
data = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/train.xlsx")
x_train = np.array(data.iloc[:, 1:5])  # feature columns
y_train = np.array(data.iloc[:, 6])    # label column
tree = DecisionTree(max_depth=4, min_samples_split=1)
tree.fit(x_train, y_train)
data1 = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/test.xlsx", header=None)  # features of the new samples
x_test = np.array(data1.iloc[:, 1:5])
y_test = np.array(data1.iloc[:, 6])
predictions = tree.predict(x_test)
print("Predictions:", predictions)
c = 0
for i in range(len(y_test)):
    if y_test[i] == predictions[i]:
        c = c + 1
print("Accuracy:")
print(c / len(y_test))
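If the Excel files above are not at hand, the class can still be exercised end to end on a tiny synthetic dataset. The data below is invented purely for illustration (it is not the train.xlsx/test.xlsx data used above), but it shows the expected workflow and output.

# Hypothetical end-to-end check on made-up data.
x_demo = np.array([[2.0, 1.0],
                   [1.5, 2.0],
                   [3.0, 0.5],
                   [6.0, 1.0],
                   [7.5, 2.5],
                   [8.0, 0.0]])
y_demo = np.array([0, 0, 0, 1, 1, 1])  # class 0 for small first feature, class 1 for large
demo_tree = DecisionTree(max_depth=2)
demo_tree.fit(x_demo, y_demo)
# should predict class 0 for the first sample and class 1 for the second
print(demo_tree.predict(np.array([[2.5, 1.0], [7.0, 1.5]])))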