Algorithm Flow
Input:
Training set $D=\{(x_1, y_1), (x_2, y_2), ..., (x_m, y_m)\}$;
Attribute set $A=\{a_1, a_2, ..., a_d\}$;
$D_v$ denotes the subset of samples in $D$ whose value on attribute $a_*$ is $a_*^v$
Procedure: generate_tree(D, A)
Generate node;
if all samples in D belong to the same class C then
    mark node as a class-C leaf node; return
end if
if A = ∅ OR all samples in D take the same values on A then
    mark node as a leaf node labeled with the class most common in D; return
end if
Select the optimal splitting attribute $a_*$ from A;
for each value $a_*^v$ of $a_*$ do
    generate a branch for node;
    if $D_v$ is empty then
        mark the branch node as a leaf labeled with the class most common in D; return
    else
        use generate_tree($D_v$, $A-\{a_*\}$) as the branch node
    end if
end for
Output: a decision tree rooted at node
Implementation
%config ZMQInteractiveShell.ast_node_interactivity='all'
import pandas as pd
import numpy as np
df = pd.read_csv('example_data.csv')
df
| | humility | outlook | play | temp | windy |
|---|---|---|---|---|---|
| 0 | high | sunny | no | hot | False |
| 1 | high | sunny | no | hot | True |
| 2 | high | overcast | yes | hot | False |
| 3 | high | rainy | yes | mild | False |
| 4 | normal | rainy | yes | cool | False |
| 5 | normal | rainy | no | cool | True |
| 6 | normal | overcast | yes | cool | True |
| 7 | high | sunny | no | mild | False |
| 8 | normal | sunny | yes | cool | False |
| 9 | normal | rainy | yes | mild | False |
| 10 | normal | sunny | yes | mild | True |
| 11 | high | overcast | yes | mild | True |
| 12 | normal | overcast | yes | hot | False |
| 13 | high | rainy | no | mild | True |
Information Entropy
For a dataset $D=\{(x_1, y_1), (x_2, y_2), ..., (x_m, y_m)\}$ with $|y|$ classes in total, the information entropy $Ent(D)$ is:

$$Ent(D)=-\sum^{|y|}_{k=1}p_k \log_2 p_k$$

where $p_k$ is the proportion of samples belonging to the $k$-th class.

The smaller $Ent(D)$ is, the higher the purity of $D$. Its minimum is 0, which means $D$ contains only a single class ($|y|=1$, $p_1=1$).
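Before wiring entropy into a class, here is a minimal standalone sketch of the formula (the helper name entropy is ours, assuming df is the DataFrame loaded above):
import math
def entropy(labels):
    # Ent(D) = -sum(p_k * log2(p_k)) over the classes appearing in labels
    probs = [labels.count(c)/len(labels) for c in set(labels)]
    return -sum(p*math.log(p, 2) for p in probs)
entropy(df['play'].tolist())  # 9 yes vs 5 no -> 0.9402859586706309, matching the entropy printed below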
import math
class Strategy:
def __init__(self, name, attribute, label):
"""
        attribute: the attribute this strategy splits on
        label: the label (target) column
"""
self.name = name
self.attribute = attribute
self.label = label
def __str__(self):
return f'Strategy[{self.name}] on Attribute[{self.attribute}],Label is {self.label}'
def __repr__(self):
return self.__str__()
def split_dataframe(self, dataset):
"""
        Split the dataset on the current attribute
"""
unique_values = dataset[self.attribute].unique()
result_dict = {}
for item in unique_values:
result_dict[item] = dataset[:][dataset[self.attribute] == item]
return result_dict
def is_better_than(self, other):
"""
        Each strategy decides its own rule for which one is better
"""
pass
def calculate(self, dataset):
"""
        Compute this strategy's split statistics
        dataset: pd.DataFrame
"""
pass
def json(self):
pass
s = Strategy('Test', 'temp', 'play')
s
Strategy[Test] on Attribute[temp],Label is play
dfs = s.split_dataframe(df)
print(type(dfs['cool']))
dfs
<class 'pandas.core.frame.DataFrame'>
{'cool': humility outlook play temp windy
4 normal rainy yes cool False
5 normal rainy no cool True
6 normal overcast yes cool True
8 normal sunny yes cool False,
'hot': humility outlook play temp windy
0 high sunny no hot False
1 high sunny no hot True
2 high overcast yes hot False
12 normal overcast yes hot False,
'mild': humility outlook play temp windy
3 high rainy yes mild False
7 high sunny no mild False
9 normal rainy yes mild False
10 normal sunny yes mild True
11 high overcast yes mild True
13 high rainy no mild True}
Information Gain
A discrete attribute $a$ has possible values $\{a^1,a^2,a^3,...,a^V\}$, and $D^v$ denotes the subset of samples in $D$ whose value on $a$ is $a^v$. The information gain obtained by splitting dataset $D$ on attribute $a$, $Gain(D, a)$, is:

$$Gain(D, a)=Ent(D)-\sum^V_{v=1}\frac{|D^v|}{|D|}Ent(D^v)$$

where $Ent(D)$ is the entropy before the split, $|D|$ is the number of samples in the dataset, and $\sum^V_{v=1}\frac{|D^v|}{|D|}Ent(D^v)$ is the entropy after splitting on attribute $a$. The factor $\frac{|D^v|}{|D|}$ is the weight of the $v$-th branch: subsets with more samples matter more.
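As a quick cross-check of the formula (separate from the classes below), information gain can be computed directly with a pandas groupby, reusing the entropy sketch above:
def gain(dataset, attribute, label):
    # Gain(D, a) = Ent(D) - sum_v |D^v|/|D| * Ent(D^v)
    base = entropy(dataset[label].tolist())
    weighted = sum(len(sub)/len(dataset)*entropy(sub[label].tolist())
                   for _, sub in dataset.groupby(attribute))
    return base - weighted
gain(df, 'windy', 'play')  # ~0.0481, matching the Id3 output below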
from functools import total_ordering
@total_ordering
class Id3(Strategy):
def __init__(self, attribute, label, dataset):
super().__init__('ID3', attribute, label)
self.information_gain = 0
self.entropy = 0
self.calculate(dataset)
def __str__(self):
return f"""
{super().__str__()}
entropy[{self.entropy}]
information_gain[{self.information_gain}]
"""
    def _calculate_entropy(self, dataset, update=False):
        # dataset is a plain list of values; Ent = -sum(p_k * log2(p_k))
        probs = [dataset.count(item)/len(dataset) for item in set(dataset)]
        entropy = -sum([prob*math.log(prob, 2) for prob in probs])
        if update: self.entropy = entropy
        return entropy
    def _calculate_information_gain(self, dataset):
        # weighted entropy of the subsets after splitting on self.attribute
        splits = self.split_dataframe(dataset)
        ent = sum([len(split)/len(dataset)*self._calculate_entropy(split[self.label].tolist()) for k,split in splits.items()])
        self.information_gain = self.entropy - ent
        return self.information_gain
    def calculate(self, dataset):
        self._calculate_entropy(dataset[self.label].tolist(), update=True)
        return self._calculate_information_gain(dataset)
def __eq__(self, other):
return self.information_gain == other.information_gain
def __lt__(self, other):
return self.information_gain < other.information_gain
def is_better_than(self, other):
return self>other
def json(self):
return {
'name':self.name,
'attribute': self.attribute,
'label': self.label,
'entropy': self.entropy,
'information_gain': self.information_gain,
}
Id3('temp', 'play', df)
Id3('windy', 'play', df)
Id3('outlook', 'play', df)
Id3('humility', 'play', df)
Strategy[ID3] on Attribute[temp],Label is play
entropy[0.9402859586706309]
information_gain[0.029222565658954647]
Strategy[ID3] on Attribute[windy],Label is play
entropy[0.9402859586706309]
information_gain[0.04812703040826927]
Strategy[ID3] on Attribute[outlook],Label is play
entropy[0.9402859586706309]
information_gain[0.2467498197744391]
Strategy[ID3] on Attribute[humility],Label is play
entropy[0.9402859586706309]
information_gain[0.15183550136234136]
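Since Id3 is decorated with total_ordering and compares on information_gain, picking the best split is simply max:
max(Id3(attr, 'play', df) for attr in ['temp', 'windy', 'outlook', 'humility']).attribute  # 'outlook'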
Information Gain Ratio
The gain ratio obtained by splitting dataset $D$ on attribute $a$, $Gain\_ratio(D, a)$, is:

$$Gain\_ratio(D, a)=\frac{Gain(D,a)}{IV(a)}$$

where

$$IV(a)=-\sum^V_{v=1}\frac{|D^v|}{|D|}\log_2\frac{|D^v|}{|D|}$$

The more possible values attribute $a$ has (i.e., the larger $V$ is), the larger $IV(a)$ usually is.

The gain ratio fixes the overfitting caused by attributes like a student ID, which maximize information gain. Suppose the dataset had a student-ID attribute. By the definition of information gain, each ID uniquely identifies one person, so each subset contains exactly one class, i.e. $Ent(D^v)=0$; the sum of all these $Ent$ terms is still 0, so the second term of the information gain vanishes and the gain is maximal. Yet we know that splitting on student ID is plainly overfitting.

The $IV(a)$ term is added precisely to neutralize the effect of such attributes: it measures the purity of the attribute's own values; compare it with the entropy formula and you will find the two computations are in fact identical.

Heuristic: first pick, from the candidate splitting attributes, those whose information gain is above average, then choose the one with the highest gain ratio among them.
from functools import total_ordering
@total_ordering
class C45(Id3):
def __init__(self, attribute, label, dataset):
super().__init__(attribute, label, dataset)
self.name='C4.5'
self.iv = 0
self.information_gain_rate = 0
self.calculate(dataset)
def __str__(self):
return f"""
{super().__str__()}
iv[{self.iv}]
information_gain_rate[{self.information_gain_rate}]
"""
    def _calculate_iv(self, dataset):
        # IV(a) is computed exactly like entropy, just over the attribute's values
        self.iv = self._calculate_entropy(dataset)
        return self.iv
def calculate(self, dataset):
super().calculate(dataset)
self._calculate_iv(dataset[self.attribute].tolist())
self.information_gain_rate = self.information_gain/self.iv
return self.information_gain_rate
def __eq__(self, other):
return self.information_gain_rate == other.information_gain_rate
def __lt__(self, other):
return self.information_gain_rate < other.information_gain_rate
def is_better_than(self, other):
return self>other
def json(self):
return {
'name':self.name,
'attribute': self.attribute,
'label': self.label,
'entropy': self.entropy,
'iv':self.iv,
'information_gain': self.information_gain,
'information_gain_rate': self.information_gain_rate
}
C45('temp', 'play', df)
C45('windy', 'play', df)
C45('outlook', 'play', df)
C45('humility', 'play', df)
Strategy[C4.5] on Attribute[temp],Label is play
entropy[0.9402859586706309]
information_gain[0.029222565658954647]
iv[1.5566567074628228]
information_gain_rate[0.01877264622241867]
Strategy[C4.5] on Attribute[windy],Label is play
entropy[0.9402859586706309]
information_gain[0.04812703040826927]
iv[0.9852281360342516]
information_gain_rate[0.048848615511520595]
Strategy[C4.5] on Attribute[outlook],Label is play
entropy[0.9402859586706309]
information_gain[0.2467498197744391]
iv[1.5774062828523452]
information_gain_rate[0.15642756242117517]
Strategy[C4.5] on Attribute[humility],Label is play
entropy[0.9402859586706309]
information_gain[0.15183550136234136]
iv[1.0]
information_gain_rate[0.15183550136234136]
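Note that the heuristic described above (above-average gain first, then highest gain ratio) is not wired into C45 or the DecisionTree below, which compare on gain ratio alone; a hypothetical choose_c45 helper sketches how it could sit on top of the class:
def choose_c45(attributes, label, dataset):
    candidates = [C45(attr, label, dataset) for attr in attributes]
    avg_gain = sum(c.information_gain for c in candidates)/len(candidates)
    # keep only the candidates whose information gain is above average
    shortlist = [c for c in candidates if c.information_gain >= avg_gain] or candidates
    # among those, pick the highest gain ratio
    return max(shortlist, key=lambda c: c.information_gain_rate)
choose_c45(['temp', 'windy', 'outlook', 'humility'], 'play', df).attribute  # 'outlook'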
Gini Index
$$Gini(D) = 1-\sum^{|y|}_{k=1}p_k^2$$

The smaller $Gini(D)$ is, the higher the purity of dataset $D$. It is the probability that two samples drawn at random from $D$ carry different class labels.

The Gini index of attribute $a$, $Gini\_index(D, a)$, is:

$$Gini\_index(D,a)=\sum^V_{v=1}\frac{|D^v|}{|D|}Gini(D^v)$$

Among the candidate attributes, choose the one that minimizes the Gini index after the split.
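For a standalone cross-check of the two formulas (hypothetical helpers, independent of the Cart class below):
def gini(labels):
    # Gini(D) = 1 - sum(p_k^2)
    return 1 - sum((labels.count(c)/len(labels))**2 for c in set(labels))
def gini_index(dataset, attribute, label):
    # Gini_index(D, a) = sum_v |D^v|/|D| * Gini(D^v)
    return sum(len(sub)/len(dataset)*gini(sub[label].tolist())
               for _, sub in dataset.groupby(attribute))
gini_index(df, 'outlook', 'play')  # ~0.3429, the smallest of the four attributes below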
from functools import total_ordering
@total_ordering
class Cart(Strategy):
def __init__(self, attribute, label, dataset):
super().__init__('CART', attribute, label)
self.gini_index = 0
self.calculate(dataset)
def __str__(self):
return f"""
{super().__str__()}
gini_index[{self.gini_index}]
"""
def _gini(self, dataset):
probs = [(dataset.count(item)/len(dataset))**2 for item in set(dataset)]
return 1 - sum(probs)
def calculate(self, dataset):
splits = self.split_dataframe(dataset)
gini_items = [len(split)/len(dataset)*self._gini(split[self.label].tolist()) for k,split in splits.items()]
self.gini_index = sum(gini_items)
return self.gini_index
def __eq__(self, other):
return self.gini_index == other.gini_index
def __lt__(self, other):
return self.gini_index < other.gini_index
def is_better_than(self, other):
return self<other
def json(self):
return {
'name':self.name,
'attribute': self.attribute,
'label': self.label,
'gini_index': self.gini_index
}
c1 = Cart('temp', 'play', df)
c1
c2 = Cart('windy', 'play', df)
c2
Cart('outlook', 'play', df)
Cart('humility', 'play', df)
Strategy[CART] on Attribute[temp],Label is play
gini_index[0.44047619047619047]
Strategy[CART] on Attribute[windy],Label is play
gini_index[0.42857142857142855]
Strategy[CART] on Attribute[outlook],Label is play
gini_index[0.34285714285714286]
Strategy[CART] on Attribute[humility],Label is play
gini_index[0.3673469387755103]
c1<c2
False
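For CART smaller is better, which is why is_better_than uses self<other; accordingly the best attribute falls out of min:
min(Cart(attr, 'play', df) for attr in ['temp', 'windy', 'outlook', 'humility']).attribute  # 'outlook'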
Defining the Node class
class Node:
def __init__(self, clazz, strategy, label, samples):
        self.strategy = strategy # the statistics of the split at this node
        self.clazz = clazz # for leaf nodes: the class label
        self.label = label
        self.samples = samples # the samples that reached this node
        self.chidren_nodes = {} # for internal nodes: the child branch nodes
def connect(self, attribute_value, node):
self.chidren_nodes[attribute_value] = node
def json(self):
return {
'class': self.clazz,
'strategy': self.strategy.json() if self.strategy else None,
'sample_size': self.samples.shape[0],
'samples': self.samples[self.label].tolist(),
'chidren_nodes': {
str(attribute_value):node.json() for attribute_value,node in self.chidren_nodes.items()
}
}
Defining the Model base class
class Model:
def __init__(self, name, dataset):
self.name = name
self.dataset = dataset
def fit(self):
pass
def predict(self):
pass
def visuaization(self):
pass
def json(self):
pass
Defining the concrete DecisionTree implementation
from collections import Counter
class DecisionTree(Model):
def __init__(self, dataset, label, strategy_str='cart'):
super().__init__('DecisionTree', dataset)
self.root = None
self.label = label
self.attributes = set(dataset.columns)-set([self.label])
if strategy_str=='id3': self.strategy = Id3
elif strategy_str=='c4.5': self.strategy = C45
elif strategy_str=='cart': self.strategy = Cart
def _split_dataframe(self, attribute, dataset):
"""
        Split the dataset on the given attribute
"""
unique_values = dataset[attribute].unique()
result_dict = {}
for item in unique_values:
result_dict[item] = dataset[:][dataset[attribute] == item]
return result_dict
def _choose_best_col(self, dataset, attributes):
"""
        Choose the best splitting attribute according to the configured strategy
"""
best_strategy = None
for attr in attributes:
curr_strategy = self.strategy(attr, self.label, dataset)
if not best_strategy or curr_strategy.is_better_than(best_strategy):
best_strategy=curr_strategy
return best_strategy
def _generate_tree(self, dataset, attributes):
labels = Counter(dataset[self.label].tolist())
        # only one class left in dataset, or attributes is empty, or every remaining attribute takes a single value
if len(labels)==1 or len(attributes)==0:
return Node(labels.most_common(1)[0], None, self.label, dataset)
if sum([len(set(dataset[attr].tolist())) for attr in attributes])==len(attributes):
return Node(labels.most_common(1)[0], None, self.label, dataset)
        # choose the best splitting attribute
best_strategy = self._choose_best_col(dataset, attributes)
        # use the best attribute for the current node: no class label yet
curr_node = Node(None, best_strategy, self.label, dataset)
        # split the dataset on the best attribute
splits = self._split_dataframe(best_strategy.attribute, dataset)
        # the attributes remaining for the subtrees
new_attributes = attributes-set([best_strategy.attribute])
        # recursively create a node for each data subset
for key,split in splits.items():
curr_node.connect(key, self._generate_tree(split, new_attributes))
return curr_node
def fit(self):
self.root = self._generate_tree(self.dataset, self.attributes)
def predict(self, dataset):
pass
def visuaization(self):
pass
def json(self):
return {
'name': self.name,
'root': self.root.json(),
'label': self.label,
'dataset_size': self.dataset.shape[0],
'dataset': self.dataset[self.label].tolist(),
'strategy': self.strategy.__name__
}
dt = DecisionTree(df, 'play', strategy_str='id3')
dt.fit()
dt.root.chidren_nodes['overcast'].samples
| | humility | outlook | play | temp | windy |
|---|---|---|---|---|---|
| 2 | high | overcast | yes | hot | False |
| 6 | normal | overcast | yes | cool | True |
| 11 | high | overcast | yes | mild | True |
| 12 | normal | overcast | yes | hot | False |
Displaying the result as formatted JSON
import json
format_res = json.dumps(dt.json(), sort_keys=True, indent=4, separators=(',', ':'))
print(format_res)
{
"dataset":[
"no",
"no",
"yes",
"yes",
"yes",
"no",
"yes",
"no",
"yes",
"yes",
"yes",
"yes",
"yes",
"no"
],
"dataset_size":14,
"label":"play",
"name":"DecisionTree",
"root":{
"chidren_nodes":{
"overcast":{
"chidren_nodes":{},
"class":[
"yes",
4
],
"sample_size":4,
"samples":[
"yes",
"yes",
"yes",
"yes"
],
"strategy":null
},
"rainy":{
"chidren_nodes":{
"False":{
"chidren_nodes":{},
"class":[
"yes",
3
],
"sample_size":3,
"samples":[
"yes",
"yes",
"yes"
],
"strategy":null
},
"True":{
"chidren_nodes":{},
"class":[
"no",
2
],
"sample_size":2,
"samples":[
"no",
"no"
],
"strategy":null
}
},
"class":null,
"sample_size":5,
"samples":[
"yes",
"yes",
"no",
"yes",
"no"
],
"strategy":{
"attribute":"windy",
"entropy":0.9709505944546686,
"information_gain":0.9709505944546686,
"label":"play",
"name":"ID3"
}
},
"sunny":{
"chidren_nodes":{
"high":{
"chidren_nodes":{},
"class":[
"no",
3
],
"sample_size":3,
"samples":[
"no",
"no",
"no"
],
"strategy":null
},
"normal":{
"chidren_nodes":{},
"class":[
"yes",
2
],
"sample_size":2,
"samples":[
"yes",
"yes"
],
"strategy":null
}
},
"class":null,
"sample_size":5,
"samples":[
"no",
"no",
"no",
"yes",
"yes"
],
"strategy":{
"attribute":"humility",
"entropy":0.9709505944546686,
"information_gain":0.9709505944546686,
"label":"play",
"name":"ID3"
}
}
},
"class":null,
"sample_size":14,
"samples":[
"no",
"no",
"yes",
"yes",
"yes",
"no",
"yes",
"no",
"yes",
"yes",
"yes",
"yes",
"yes",
"no"
],
"strategy":{
"attribute":"outlook",
"entropy":0.9402859586706309,
"information_gain":0.2467498197744391,
"label":"play",
"name":"ID3"
}
},
"strategy":"Id3"
}
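predict is still a stub in the DecisionTree above; a minimal traversal sketch over the Node structure defined earlier (the predict_row helper is hypothetical, and falls back to the node's majority class when a branch value was never seen during training):
from collections import Counter
def predict_row(node, row):
    while node.chidren_nodes:  # internal node: follow the branch for the split attribute
        child = node.chidren_nodes.get(row[node.strategy.attribute])
        if child is None:  # unseen attribute value: stop here and vote
            break
        node = child
    if node.clazz:  # leaf: clazz is stored as a (label, count) tuple
        return node.clazz[0]
    return Counter(node.samples[node.label].tolist()).most_common(1)[0][0]
predict_row(dt.root, df.iloc[0])  # 'no'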
Summary
- Some redundant operations were introduced for the sake of clear definitions, e.g. the entropy computation
- Pruning optimizations are not covered yet
- The goal is to understand how a decision tree is constructed
- If any expert reading this spots a problem, corrections are very welcome (*^▽^*)
- I hope I can keep this series going