1 Overview
The classical decision tree algorithms are ID3, C4.5, and CART. The three differ mainly in their feature selection criterion: ID3 selects features by information gain, C4.5 by gain ratio, and CART by the Gini index. The basic workflow of a decision tree algorithm is shown in the figure below.
2 ID3
2.1 Theory
Consider a discrete attribute $a$ with $V$ possible values $\left\{a^{1}, a^{2}, a^{3}, \ldots, a^{V}\right\}$, and let $D^{v}$ denote the subset of samples in $D$ whose value on attribute $a$ is $a^{v}$. The information gain obtained by splitting dataset $D$ on attribute $a$ is:

$$\operatorname{Gain}(D, a)=\operatorname{Ent}(D)-\sum_{v=1}^{V} \frac{\left|D^{v}\right|}{|D|} \operatorname{Ent}\left(D^{v}\right)$$

where $\operatorname{Ent}(D)=-\sum_{k=1}^{|\mathcal{Y}|} p_{k} \log _{2} p_{k}$ is the information entropy of $D$ and $p_{k}$ is the proportion of class-$k$ samples in $D$; the smaller the entropy, the higher the purity. ID3 splits on the feature with the largest information gain.
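As a quick numeric check (a made-up example): for a set $D$ with 9 positive and 5 negative samples, $\operatorname{Ent}(D)=-\frac{9}{14} \log _{2} \frac{9}{14}-\frac{5}{14} \log _{2} \frac{5}{14} \approx 0.940$; a pure set has entropy $0$, and a 50/50 split has entropy $1$, the maximum for two classes.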
2.2 Code
import numpy as np
import pandas as pd
from math import log

# Information entropy of a list of labels
def entropy(ele):
    probs = [ele.count(i) / len(ele) for i in set(ele)]
    return -sum([prob * log(prob, 2) for prob in probs])

# Partition a DataFrame into {value: sub-DataFrame} by one column
def split_dataframe(data, col):
    unique_values = data[col].unique()
    return {key: data[data[col] == key] for key in unique_values}

# Feature selection: pick the column with the largest information gain
def choose_best_col(data, label):
    entropy_D = entropy(data[label].tolist())
    cols = [col for col in data.columns if col != label]
    max_value, best_col = -999, None
    max_splited = None
    for col in cols:
        splited_set = split_dataframe(data, col)
        entropy_DA = 0  # conditional entropy after splitting on col
        for subset_col, subset in splited_set.items():
            entropy_Di = entropy(subset[label].tolist())
            entropy_DA += len(subset) / len(data) * entropy_Di
        info_gain = entropy_D - entropy_DA
        if info_gain > max_value:
            max_value, best_col = info_gain, col
            max_splited = splited_set
    return max_value, best_col, max_splited

# ID3 tree
class ID3Tree:
    class Node:
        def __init__(self, name):
            self.name = name
            self.connections = {}

        def connect(self, label, node):
            self.connections[label] = node

    def __init__(self, data, label):
        self.columns = data.columns
        self.data = data
        self.label = label
        self.root = self.Node("Root")

    def print_tree(self, node, tabs):
        # str() guards against non-string labels and split values
        print(tabs + str(node.name))
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + str(connection) + ")")
            self.print_tree(child_node, tabs + "\t\t")

    def construct_tree(self):
        self.construct(self.root, "", self.data, self.columns)

    def construct(self, parent_node, parent_connection_label, input_data, columns):
        # Pure node: every remaining sample has the same label, so stop
        if len(input_data[self.label].unique()) == 1:
            node = self.Node(input_data[self.label].iloc[0])
            parent_node.connect(parent_connection_label, node)
            return
        max_value, best_col, max_splited = choose_best_col(input_data[columns], self.label)
        # No feature left to split on: make a majority-class leaf
        if not best_col:
            node = self.Node(input_data[self.label].mode()[0])
            parent_node.connect(parent_connection_label, node)
            return
        node = self.Node(best_col)
        parent_node.connect(parent_connection_label, node)
        new_columns = [col for col in columns if col != best_col]
        # Recurse into every branch of the chosen feature
        for splited_value, splited_data in max_splited.items():
            self.construct(node, splited_value, splited_data, new_columns)

if __name__ == '__main__':
    df = pd.read_csv('../data.csv', dtype={'windy': 'str'})
    id3 = ID3Tree(df, 'play')
    id3.construct_tree()
    id3.print_tree(id3.root, '')
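data.csv is not reproduced here. A hypothetical toy DataFrame (column names invented for illustration, loosely modeled on the classic weather/play data) is enough to exercise the class without the file:

toy = pd.DataFrame({
    'outlook': ['sunny', 'sunny', 'overcast', 'rainy', 'rainy'],
    'windy':   ['false', 'true', 'false', 'false', 'true'],
    'play':    ['no', 'no', 'yes', 'yes', 'no'],
})
id3_toy = ID3Tree(toy, 'play')
id3_toy.construct_tree()
id3_toy.print_tree(id3_toy.root, '')

On this toy data, outlook has the largest information gain, so it becomes the root; the rainy branch is then resolved by windy.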
3 C4.5
3.1 Theory
A known problem of ID3 is that information gain is biased toward attributes with many possible values, so C4.5 selects features by the gain ratio instead, computed as:
$$\text{Gain-ratio}(D, a)=\frac{\operatorname{Gain}(D, a)}{\operatorname{IV}(a)}$$
where $\operatorname{IV}(a)=-\sum_{v=1}^{V} \frac{\left|D^{v}\right|}{|D|} \log _{2} \frac{\left|D^{v}\right|}{|D|}$ is the intrinsic value of attribute $a$, which grows with the number of values $V$. Since the gain ratio in turn prefers attributes with few values, C4.5 uses a heuristic: it first shortlists attributes whose information gain is above average, then picks the one with the highest gain ratio.
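A minimal sketch of gain-ratio scoring (not part of the referenced repo; it reuses the entropy and split_dataframe helpers and the math.log import from section 2.2, and the name gain_ratio is introduced here for illustration):

# Gain ratio of one feature (minimal sketch)
def gain_ratio(data, col, label):
    ent_D = entropy(data[label].tolist())
    splited_set = split_dataframe(data, col)
    cond_ent, iv = 0.0, 0.0
    for _, subset in splited_set.items():
        w = len(subset) / len(data)
        cond_ent += w * entropy(subset[label].tolist())
        iv -= w * log(w, 2)  # intrinsic value IV(a)
    info_gain = ent_D - cond_ent
    # Guard: IV is 0 when the feature takes a single value
    return info_gain / iv if iv > 0 else 0.0

Swapping this score into choose_best_col in place of info_gain would turn the ID3 code above into a basic C4.5-style selector.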
4 CART
4.1 Theory
The CART algorithm consists of three parts: feature selection, tree generation, and pruning. CART comes in two flavors, regression trees and classification trees: regression trees select splits by the minimum squared error criterion, while classification trees use the Gini index. Pruning serves as a form of regularization for decision trees.
- Regression tree: splits are chosen by the minimum squared error criterion (see the sketch at the end of this section).
- Classification tree: splits are chosen by the Gini index.
The Gini index is computed as:

$$\operatorname{Gini}(D)=1-\sum_{k=1}^{|\mathcal{Y}|} p_{k}^{2}, \qquad \text{Gini-index}(D, a)=\sum_{v=1}^{V} \frac{\left|D^{v}\right|}{|D|} \operatorname{Gini}\left(D^{v}\right)$$
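A quick numeric check (the same made-up 9-positive/5-negative example as in section 2.1): $\operatorname{Gini}(D)=1-\left(\frac{9}{14}\right)^{2}-\left(\frac{5}{14}\right)^{2} \approx 0.459$; like entropy, smaller values mean higher purity.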
- Pruning
  Pruning actively removes branches to reduce the risk of overfitting, and comes in two forms:
  - Pre-pruning: during tree generation, a candidate split is abandoned if it does not improve accuracy on a validation set.
  - Post-pruning: a complete tree is grown first and then pruned from the bottom up.
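The minimum squared error criterion mentioned for regression trees can be illustrated with a small sketch (a toy illustration under stated assumptions, not the referenced repo's code; best_squared_error_split is a name introduced here). For one continuous feature, every distinct value is tried as a threshold and the one minimizing the total squared error of the two sides is kept:

# Least-squares split search for one continuous feature (regression tree)
def best_squared_error_split(x, y):
    def sse(vals):
        # Sum of squared errors around the mean prediction
        m = sum(vals) / len(vals)
        return sum((v - m) ** 2 for v in vals)
    best_t, best_loss = None, float('inf')
    for t in sorted(set(x))[:-1]:  # last value would leave the right side empty
        left = [yi for xi, yi in zip(x, y) if xi <= t]
        right = [yi for xi, yi in zip(x, y) if xi > t]
        loss = sse(left) + sse(right)
        if loss < best_loss:
            best_t, best_loss = t, loss
    return best_t, best_loss

For example, best_squared_error_split([1, 2, 3, 4], [1.0, 1.2, 2.9, 3.1]) returns threshold 2, since splitting there separates the two flat regions of y.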
4.2 Code
import numpy as np
import pandas as pd

# Gini impurity of a list of labels: equivalent to 1 - sum(p_k^2)
def gini(nums):
    probs = [nums.count(i) / len(nums) for i in set(nums)]
    return sum([p * (1 - p) for p in probs])

# Partition a DataFrame into {value: sub-DataFrame} by one column
def split_dataframe(data, col):
    unique_values = data[col].unique()
    return {key: data[data[col] == key] for key in unique_values}

# Feature selection: pick the column with the smallest weighted Gini index
def choose_best_col(df, label):
    cols = [col for col in df.columns if col != label]
    min_value, best_col = 999, None
    min_splited = None
    for col in cols:
        splited_set = split_dataframe(df, col)
        gini_DA = 0  # weighted Gini index of the split on col
        for subset_col, subset in splited_set.items():
            gini_Di = gini(subset[label].tolist())
            gini_DA += len(subset) / len(df) * gini_Di
        if gini_DA < min_value:
            min_value, best_col = gini_DA, col
            min_splited = splited_set
    return min_value, best_col, min_splited

# CART classification tree. Note: textbook CART grows binary trees;
# this simplified version splits multiway on every value of a feature,
# like the ID3 code above, only with the Gini criterion.
class CartTree:
    class Node:
        def __init__(self, name):
            self.name = name
            self.connections = {}

        def connect(self, label, node):
            self.connections[label] = node

    def __init__(self, data, label):
        self.columns = data.columns
        self.data = data
        self.label = label
        self.root = self.Node("Root")

    def print_tree(self, node, tabs):
        # str() guards against non-string labels and split values
        print(tabs + str(node.name))
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + str(connection) + ")")
            self.print_tree(child_node, tabs + "\t\t")

    def construct_tree(self):
        self.construct(self.root, "", self.data, self.columns)

    def construct(self, parent_node, parent_connection_label, input_data, columns):
        # Pure node: every remaining sample has the same label, so stop
        if len(input_data[self.label].unique()) == 1:
            node = self.Node(input_data[self.label].iloc[0])
            parent_node.connect(parent_connection_label, node)
            return
        min_value, best_col, min_splited = choose_best_col(input_data[columns], self.label)
        # No feature left to split on: make a majority-class leaf
        if not best_col:
            node = self.Node(input_data[self.label].mode()[0])
            parent_node.connect(parent_connection_label, node)
            return
        node = self.Node(best_col)
        parent_node.connect(parent_connection_label, node)
        new_columns = [col for col in columns if col != best_col]
        # Recurse into every branch of the chosen feature
        for splited_value, splited_data in min_splited.items():
            self.construct(node, splited_value, splited_data, new_columns)

if __name__ == '__main__':
    df = pd.read_csv('../data.csv', dtype={'windy': 'str'})
    tree1 = CartTree(df, 'play')
    tree1.construct_tree()
    tree1.print_tree(tree1.root, "")
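As with the ID3 sketch, the hypothetical toy DataFrame from section 2.2 works here in place of data.csv:

tree2 = CartTree(toy, 'play')
tree2.construct_tree()
tree2.print_tree(tree2.root, '')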
References
Theory: Zhou Zhihua (周志华), Machine Learning; Li Hang (李航), Statistical Learning Methods
Code: https://github.com/luwill/machine-learning-code-writing