id3决策树 鸢尾花 python_机器学习之分类回归树(python实现CART)

机器学习之分类回归树(python实现CART)

之前有文章介绍过决策树(ID3)。简单回顾一下:ID3每次选取最佳特征来分割数据,这个最佳特征的判断原则是通过信息增益来实现的。按照某种特征切分数据后,该特征在以后切分数据集时就不再使用,因此存在切分过于迅速的问题。ID3算法还不能处理连续性特征。 下面简单介绍一下其他算法:

0be49bb8741a957ecf539fc9050b8718.png

CART 分类回归树

CART是Classification And Regerssion Trees的缩写,既能处理分类任务也能做回归任务。

678254b7f619b7038785e29d1cd402ec.png

CART树的典型代表时二叉树,根据不同的条件将分类。

419f3b73550722f3541da4e9cadda7fe.png

CART树构建算法 与ID3决策树的构建方法类似,直接给出CART树的构建过程。首先与ID3类似采用字典树的数据结构,包含以下4中元素:

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
id3决策树 鸢尾花 Python代码实现: ```python import numpy as np import pandas as pd from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split class Node: def __init__(self, feature=None, target=None, left=None, right=None): self.feature = feature # 划分数据集的特征 self.target = target # 叶子节点的类别 self.left = left # 左子节点 self.right = right # 右子节点 class ID3DecisionTree: def __init__(self): self.tree = None # 决策树 # 计算信息熵 def _entropy(self, y): labels = np.unique(y) probs = [np.sum(y == label) / len(y) for label in labels] return -np.sum([p * np.log2(p) for p in probs]) # 计算条件熵 def _conditional_entropy(self, X, y, feature): feature_values = np.unique(X[:, feature]) probs = [np.sum(X[:, feature] == value) / len(X) for value in feature_values] entropies = [self._entropy(y[X[:, feature] == value]) for value in feature_values] return np.sum([p * e for p, e in zip(probs, entropies)]) # 选择最优特征 def _select_feature(self, X, y): n_features = X.shape[1] entropies = [self._conditional_entropy(X, y, feature) for feature in range(n_features)] return np.argmin(entropies) # 构建决策树 def _build_tree(self, X, y): if len(np.unique(y)) == 1: # 叶子节点,返回类别 return Node(target=y[0]) if X.shape[1] == 0: # 叶子节点,返回出现次数最多的类别 target = np.argmax(np.bincount(y)) return Node(target=target) feature = self._select_feature(X, y) # 选择最优特征 feature_values = np.unique(X[:, feature]) left_indices = [i for i in range(len(X)) if X[i][feature] == feature_values[0]] right_indices = [i for i in range(len(X)) if X[i][feature] == feature_values[1]] left = self._build_tree(X[left_indices], y[left_indices]) # 递归构建左子 right = self._build_tree(X[right_indices], y[right_indices]) # 递归构建右子 return Node(feature=feature, left=left, right=right) # 训练决策树 def fit(self, X, y): self.tree = self._build_tree(X, y) # 预测单个样本 def _predict_sample(self, x): node = self.tree while node.target is None: if x[node.feature] == np.unique(X[:, node.feature])[0]: node = node.left else: node = node.right return node.target # 预测多个样本 def predict(self, X): return np.array([self._predict_sample(x) for x in X]) # 加载鸢尾花数据集 iris = load_iris() X = iris.data y = iris.target # 划分数据集 train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1) # 训练模型 model = ID3DecisionTree() model.fit(train_X, train_y) # 预测测试集 pred_y = model.predict(test_X) # 计算准确率 accuracy = np.sum(pred_y == test_y) / len(test_y) print('Accuracy:', accuracy) ``` C4.5决策树 Python代码实现: ```python import numpy as np import pandas as pd from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split class Node: def __init__(self, feature=None, threshold=None, target=None, left=None, right=None): self.feature = feature # 划分数据集的特征 self.threshold = threshold # 划分数据集的阈值 self.target = target # 叶子节点的类别 self.left = left # 左子节点 self.right = right # 右子节点 class C45DecisionTree: def __init__(self, min_samples_split=2, min_gain_ratio=1e-4): self.min_samples_split = min_samples_split # 最小划分样本数 self.min_gain_ratio = min_gain_ratio # 最小增益比 self.tree = None # 决策树 # 计算信息熵 def _entropy(self, y): labels = np.unique(y) probs = [np.sum(y == label) / len(y) for label in labels] return -np.sum([p * np.log2(p) for p in probs]) # 计算条件熵 def _conditional_entropy(self, X, y, feature, threshold): left_indices = X[:, feature] <= threshold right_indices = X[:, feature] > threshold left_probs = np.sum(left_indices) / len(X) right_probs = np.sum(right_indices) / len(X) entropies = [self._entropy(y[left_indices]), self._entropy(y[right_indices])] return np.sum([p * e for p, e in zip([left_probs, right_probs], entropies)]) # 计算信息增益 def _information_gain(self, X, y, feature, threshold): entropy = self._entropy(y) conditional_entropy = self._conditional_entropy(X, y, feature, threshold) return entropy - conditional_entropy # 计算信息增益比 def _gain_ratio(self, X, y, feature, threshold): entropy = self._entropy(y) conditional_entropy = self._conditional_entropy(X, y, feature, threshold) split_info = -np.sum([p * np.log2(p) for p in [np.sum(X[:, feature] <= threshold) / len(X), np.sum(X[:, feature] > threshold) / len(X)]]) return (entropy - conditional_entropy) / split_info if split_info != 0 else 0 # 选择最优特征和划分阈值 def _select_feature_and_threshold(self, X, y): n_features = X.shape[1] max_gain_ratio = -1 best_feature, best_threshold = None, None for feature in range(n_features): thresholds = np.unique(X[:, feature]) for threshold in thresholds: if len(y[X[:, feature] <= threshold]) >= self.min_samples_split and len(y[X[:, feature] > threshold]) >= self.min_samples_split: gain_ratio = self._gain_ratio(X, y, feature, threshold) if gain_ratio > max_gain_ratio: max_gain_ratio = gain_ratio best_feature = feature best_threshold = threshold return best_feature, best_threshold # 构建决策树 def _build_tree(self, X, y): if len(np.unique(y)) == 1: # 叶子节点,返回类别 return Node(target=y[0]) if X.shape[1] == 0: # 叶子节点,返回出现次数最多的类别 target = np.argmax(np.bincount(y)) return Node(target=target) feature, threshold = self._select_feature_and_threshold(X, y) # 选择最优特征和划分阈值 if feature is None or threshold is None: # 叶子节点,返回出现次数最多的类别 target = np.argmax(np.bincount(y)) return Node(target=target) left_indices = X[:, feature] <= threshold right_indices = X[:, feature] > threshold left = self._build_tree(X[left_indices], y[left_indices]) # 递归构建左子 right = self._build_tree(X[right_indices], y[right_indices]) # 递归构建右子 return Node(feature=feature, threshold=threshold, left=left, right=right) # 训练决策树 def fit(self, X, y): self.tree = self._build_tree(X, y) # 预测单个样本 def _predict_sample(self, x): node = self.tree while node.target is None: if x[node.feature] <= node.threshold: node = node.left else: node = node.right return node.target # 预测多个样本 def predict(self, X): return np.array([self._predict_sample(x) for x in X]) # 加载鸢尾花数据集 iris = load_iris() X = iris.data y = iris.target # 划分数据集 train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1) # 训练模型 model = C45DecisionTree(min_samples_split=5) model.fit(train_X, train_y) # 预测测试集 pred_y = model.predict(test_X) # 计算准确率 accuracy = np.sum(pred_y == test_y) / len(test_y) print('Accuracy:', accuracy) ```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值