决策树之C4.5算法

1. 采用信息增益率

因为 ID3 在计算的时候,倾向于选择取值多的属性。为了避免这个问题,C4.5 采用信息增益率的方式来选择属性。信息增益率 = 信息增益 / 属性熵,具体的计算公式这里省略。当属性有很多值的时候,相当于被划分成了许多份,虽然信息增益变大了,但是对于 C4.5 来说,属性熵也会变大,所以整体的信息增益率并不大。

2. 采用悲观剪枝

ID3 构造决策树的时候,容易产生过拟合的情况。在 C4.5 中,会在决策树构造之后采用悲观剪枝(PEP),这样可以提升决策树的泛化能力。

悲观剪枝是后剪枝技术中的一种,通过递归估算每个内部节点的分类错误率,比较剪枝前后这个节点的分类错误率来决定是否对其进行剪枝。这种剪枝方法不再需要一个单独的测试数据集。

3. 离散化处理连续属性

C4.5 可以处理连续属性的情况,对连续的属性进行离散化的处理。比如打篮球存在的“湿度”属性,不按照“高、中”划分,而是按照湿度值进行计算,那么湿度取什么值都有可能。该怎么选择这个阈值呢,C4.5 选择具有最高信息增益的划分所对应的阈值。

4. 处理缺失值

针对数据集不完整的情况,C4.5 也可以进行处理。假如我们得到的是如下的数据,你会发现这个数据中存在两点问题。第一个问题是,数据集中存在数值缺失的情况,如何进行属性选择?第二个问题是,假设已经做了属性划分,但是样本在这个属性上有缺失值,该如何对样本进行划分?

我们不考虑缺失的数值,可以得到温度 D={2-,3+,4+,5-,6+,7-}。温度 = 高:D1={2-,3+,4+} ;温度 = 中:D2={6+,7-};温度 = 低:D3={5-} 。这里 + 号代表打篮球,- 号代表不打篮球。比如 ID=2 时,决策是不打篮球,我们可以记录为 2-。

将属性选择为温度时,对应的信息增益为:

Gain(D′, 温度)=Ent(D′)-0.792=1.0-0.792=0.208,

属性熵 = -3/6 × log₂(3/6) - 1/6 × log₂(1/6) - 2/6 × log₂(2/6) ≈ 1.459,

信息增益率 Gain_ratio(D′, 温度)=0.208/1.459=0.1426。

D′的样本个数为 6,而 D 的样本个数为 7,所以 D′所占权重比例为 6/7,因此:Gain_ratio(D, 温度)=6/7×0.1426=0.122。这样即使在温度属性的数值有缺失的情况下,我们依然可以计算信息增益,并对属性进行选择。

 

 ID3 算法的优点是方法简单,缺点是对噪声敏感。训练数据如果有少量错误,可能会产生决策树分类错误。

C4.5 在 ID3 的基础上,用信息增益率代替了信息增益,解决了噪声敏感的问题,并且可以对构造树进行剪枝、处理连续数值以及数值缺失等情况,但是由于 C4.5 需要对数据集进行多次扫描,算法效率相对较低。

在决策树的构造中,一个决策树包括根节点、子节点、叶子节点。在属性选择的标准上,度量方法包括了信息增益和信息增益率。ID3 是基础的决策树算法,C4.5 在它的基础上进行了改进,也是目前决策树中应用广泛的算法。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
决策树C4.5算法的C语言代码: ```c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #define MAX_FEATURES 100 #define MAX_SAMPLES 1000 typedef struct { int num_samples; int num_features; int feature_types[MAX_FEATURES]; double samples[MAX_SAMPLES][MAX_FEATURES]; int labels[MAX_SAMPLES]; } dataset; typedef struct node { int feature_index; double threshold; struct node *left; struct node *right; int label; } node; double entropy(int *labels, int num_samples) { double e = 0.0; int num_classes = 0; int *class_counts = (int *)calloc(num_samples, sizeof(int)); for (int i = 0; i < num_samples; i++) { int found = 0; for (int j = 0; j < num_classes; j++) { if (labels[i] == j) { class_counts[j]++; found = 1; break; } } if (!found) { class_counts[num_classes++]++; } } for (int i = 0; i < num_classes; i++) { double p = (double)class_counts[i] / num_samples; e -= p * log2(p); } free(class_counts); return e; } double information_gain(dataset *data, int *indices, int num_indices, int feature_index, double threshold) { int num_samples = data->num_samples; int *left_labels = (int *)calloc(num_samples, sizeof(int)); int *right_labels = (int *)calloc(num_samples, sizeof(int)); int left_count = 0; int right_count = 0; for (int i = 0; i < num_indices; i++) { int index = indices[i]; if (data->samples[index][feature_index] < threshold) { left_labels[left_count++] = data->labels[index]; } else { right_labels[right_count++] = data->labels[index]; } } double e = entropy(data->labels, num_samples); double left_e = entropy(left_labels, left_count); double right_e = entropy(right_labels, right_count); double gain = e - ((double)left_count / num_indices * left_e + (double)right_count / num_indices * right_e); free(left_labels); free(right_labels); return gain; } int is_pure(int *labels, int num_samples) { int label = labels[0]; for (int i = 1; i < num_samples; i++) { if (labels[i] != label) { return 0; } } return 1; } int majority_vote(int *labels, int num_samples) { int 
num_classes = 0; int *class_counts = (int *)calloc(num_samples, sizeof(int)); for (int i = 0; i < num_samples; i++) { int found = 0; for (int j = 0; j < num_classes; j++) { if (labels[i] == j) { class_counts[j]++; found = 1; break; } } if (!found) { class_counts[num_classes++]++; } } int max_count = 0; int max_index = 0; for (int i = 0; i < num_classes; i++) { if (class_counts[i] > max_count) { max_count = class_counts[i]; max_index = i; } } free(class_counts); return max_index; } node *build_tree(dataset *data, int *indices, int num_indices) { if (num_indices == 0) { return NULL; } int num_samples = data->num_samples; int num_features = data->num_features; int *labels = (int *)calloc(num_indices, sizeof(int)); for (int i = 0; i < num_indices; i++) { labels[i] = data->labels[indices[i]]; } if (is_pure(labels, num_indices)) { node *leaf = (node *)malloc(sizeof(node)); leaf->feature_index = -1; leaf->threshold = 0.0; leaf->left = NULL; leaf->right = NULL; leaf->label = labels[0]; free(labels); return leaf; } if (num_features == 0) { node *leaf = (node *)malloc(sizeof(node)); leaf->feature_index = -1; leaf->threshold = 0.0; leaf->left = NULL; leaf->right = NULL; leaf->label = majority_vote(labels, num_indices); free(labels); return leaf; } int best_feature_index = -1; double best_threshold = 0.0; double best_gain = 0.0; for (int i = 0; i < num_features; i++) { if (data->feature_types[i] == 0) { double min_value = INFINITY; double max_value = -INFINITY; for (int j = 0; j < num_indices; j++) { int index = indices[j]; double value = data->samples[index][i]; if (value < min_value) { min_value = value; } if (value > max_value) { max_value = value; } } double step = (max_value - min_value) / 100.0; for (double threshold = min_value; threshold <= max_value; threshold += step) { double gain = information_gain(data, indices, num_indices, i, threshold); if (gain > best_gain) { best_feature_index = i; best_threshold = threshold; best_gain = gain; } } } else { double gain = 
information_gain(data, indices, num_indices, i, 0.0); if (gain > best_gain) { best_feature_index = i; best_threshold = 0.0; best_gain = gain; } } } if (best_gain == 0.0) { node *leaf = (node *)malloc(sizeof(node)); leaf->feature_index = -1; leaf->threshold = 0.0; leaf->left = NULL; leaf->right = NULL; leaf->label = majority_vote(labels, num_indices); free(labels); return leaf; } int *left_indices = (int *)calloc(num_indices, sizeof(int)); int *right_indices = (int *)calloc(num_indices, sizeof(int)); int left_count = 0; int right_count = 0; for (int i = 0; i < num_indices; i++) { int index = indices[i]; if (data->samples[index][best_feature_index] < best_threshold) { left_indices[left_count++] = index; } else { right_indices[right_count++] = index; } } node *n = (node *)malloc(sizeof(node)); n->feature_index = best_feature_index; n->threshold = best_threshold; n->left = build_tree(data, left_indices, left_count); n->right = build_tree(data, right_indices, right_count); n->label = -1; free(labels); free(left_indices); free(right_indices); return n; } int predict(node *n, double *sample) { if (n->feature_index == -1) { return n->label; } if (n->feature_index >= 0) { if (sample[n->feature_index] < n->threshold) { return predict(n->left, sample); } else { return predict(n->right, sample); } } return -1; } int main() { dataset data; data.num_samples = 14; data.num_features = 4; data.feature_types[0] = 0; data.feature_types[1] = 0; data.feature_types[2] = 0; data.feature_types[3] = 0; memcpy(data.samples[0], (double[]){5.1, 3.5, 1.4, 0.2}, sizeof(double) * 4); memcpy(data.samples[1], (double[]){4.9, 3.0, 1.4, 0.2}, sizeof(double) * 4); memcpy(data.samples[2], (double[]){4.7, 3.2, 1.3, 0.2}, sizeof(double) * 4); memcpy(data.samples[3], (double[]){4.6, 3.1, 1.5, 0.2}, sizeof(double) * 4); memcpy(data.samples[4], (double[]){5.0, 3.6, 1.4, 0.2}, sizeof(double) * 4); memcpy(data.samples[5], (double[]){5.4, 3.9, 1.7, 0.4}, sizeof(double) * 4); memcpy(data.samples[6], 
(double[]){4.6, 3.4, 1.4, 0.3}, sizeof(double) * 4); memcpy(data.samples[7], (double[]){5.0, 3.4, 1.5, 0.2}, sizeof(double) * 4); memcpy(data.samples[8], (double[]){4.4, 2.9, 1.4, 0.2}, sizeof(double) * 4); memcpy(data.samples[9], (double[]){4.9, 3.1, 1.5, 0.1}, sizeof(double) * 4); memcpy(data.samples[10], (double[]){5.4, 3.7, 1.5, 0.2}, sizeof(double) * 4); memcpy(data.samples[11], (double[]){4.8, 3.4, 1.6, 0.2}, sizeof(double) * 4); memcpy(data.samples[12], (double[]){4.8, 3.0, 1.4, 0.1}, sizeof(double) * 4); memcpy(data.samples[13], (double[]){4.3, 3.0, 1.1, 0.1}, sizeof(double) * 4); memcpy(data.labels, (int[]){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, sizeof(int) * 14); int indices[14]; for (int i = 0; i < 14; i++) { indices[i] = i; } node *root = build_tree(&data, indices, 14); double sample[4] = {5.1, 3.5, 1.4, 0.2}; int label = predict(root, sample); printf("Label: %d\n", label); return 0; } ```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值