Classic decision tree algorithms include ID3, C4.5, and CART (the base learner used in GBDT). ID3 chooses split features by information gain, C4.5 by the information gain ratio, and CART by the Gini index.

Information gain measures how much knowing the value of a feature X reduces the uncertainty about the class Y. Suppose the entropy of dataset D is H(D) and the conditional entropy of D given feature A is H(D|A); then the information gain of feature A on dataset D is

g(D, A) = H(D) - H(D|A)

The larger the information gain, the more the feature reduces the uncertainty of the dataset, i.e. the stronger its ability to separate the classes.

Example: the numbers below come from the classic 14-sample play-tennis (weather) dataset, in which 9 samples play and 5 do not, and the outlook feature takes the values sunny (5 samples, 2 play / 3 don't), overcast (4 samples, all play) and rainy (5 samples, 3 play / 2 don't).
import numpy as np

# Entropy of "play" when the outlook is sunny (2 yes / 3 no out of 5)
S_entropy = -(0.6 * np.log2(0.6) + 0.4 * np.log2(0.4))
print('S_entropy:', S_entropy)
# Entropy of "play" when the outlook is overcast (4 yes out of 4)
O_entropy = -(1 * np.log2(1))
print('O_entropy:', O_entropy)
# Entropy of "play" when the outlook is rainy (3 yes / 2 no out of 5)
R_entropy = -(0.6 * np.log2(0.6) + 0.4 * np.log2(0.4))
print('R_entropy:', R_entropy)
# Entropy of "play" over the whole dataset (9 yes / 5 no out of 14)
play_entropy = -(5/14 * np.log2(5/14) + 9/14 * np.log2(9/14))
print('play_entropy:', play_entropy)
# Conditional entropy of "play" given the outlook feature
Play_outlook_Conditional_entropy = 5/14 * S_entropy + 4/14 * O_entropy + 5/14 * R_entropy
print('Play_outlook_Conditional_entropy:', Play_outlook_Conditional_entropy)
# Information gain of the outlook feature for "play"
Gain_play_outlook = play_entropy - Play_outlook_Conditional_entropy
print('Gain_play_outlook:', Gain_play_outlook)
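C4.5 differs from ID3 only in the splitting criterion: the information gain is divided by the split (intrinsic) information of the feature itself. A minimal sketch of that computation for the same outlook feature, reusing the values computed above (the variable names here are my own, not from any library):
# Split information of the outlook feature: 5 sunny, 4 overcast and 5 rainy samples out of 14
split_info_outlook = -(5/14 * np.log2(5/14) + 4/14 * np.log2(4/14) + 5/14 * np.log2(5/14))
# C4.5 gain ratio = information gain / split information
gain_ratio_play_outlook = Gain_play_outlook / split_info_outlook
print('gain_ratio_play_outlook:', gain_ratio_play_outlook)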
Building a decision tree with the ID3 algorithm:
# coding:utf-8
import numpy as np
import pandas as pd
from math import log


# Compute the entropy of a list of labels
def entropy(ele):
    probs = [ele.count(i) / len(ele) for i in set(ele)]
    entropy = -sum([prob * log(prob, 2) for prob in probs])
    return entropy


# Split a dataframe into sub-dataframes keyed by the unique values of a column
def split_dataframe(data, col):
    unique_values = data[col].unique()
    result_dict = {elem: pd.DataFrame for elem in unique_values}
    for key in result_dict.keys():
        result_dict[key] = data[:][data[col] == key]
    return result_dict


# Choose the column with the largest information gain
def choose_best_col(data, label):
    entropy_D = entropy(data[label].tolist())
    cols = [col for col in data.columns if col not in [label]]
    max_value, best_col = -999, None
    max_splited = None
    for col in cols:
        splited_set = split_dataframe(data, col)
        entropy_DA = 0
        for subset_col, subset in splited_set.items():
            entropy_Di = entropy(subset[label].tolist())
            entropy_DA += len(subset) / len(data) * entropy_Di
        info_gain = entropy_D - entropy_DA
        if info_gain > max_value:
            max_value, best_col = info_gain, col
            max_splited = splited_set
    return max_value, best_col, max_splited


class ID3Tree:
    # tree node: a name plus labelled connections to child nodes
    class Node:
        def __init__(self, name):
            self.name = name
            self.connections = {}

        def connect(self, label, node):
            self.connections[label] = node

    def __init__(self, data, label):
        self.columns = data.columns
        self.data = data
        self.label = label
        self.root = self.Node("Root")

    # print the tree recursively
    def print_tree(self, node, tabs):
        print(tabs + node.name)
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + connection + ")")
            self.print_tree(child_node, tabs + "\t\t")

    def construct_tree(self):
        self.construct(self.root, "", self.data, self.columns)

    def construct(self, parent_node, parent_connection_label, input_data, columns):
        max_value, best_col, max_splited = choose_best_col(input_data[columns], self.label)
        # no column left to split on: create a leaf node with the remaining label
        if not best_col:
            node = self.Node(input_data[self.label].iloc[0])
            parent_node.connect(parent_connection_label, node)
            return
        node = self.Node(best_col)
        parent_node.connect(parent_connection_label, node)
        new_columns = [col for col in columns if col != best_col]
        # recursively construct the subtree for each split value
        for splited_value, splited_data in max_splited.items():
            self.construct(node, splited_value, splited_data, new_columns)


def test_best_gain_info():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    max_value, best_col, max_splited = choose_best_col(df, 'play')
    print('max_value, best_col, max_splited:\n', max_value, best_col, max_splited)


def test_tree_construct():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    id3 = ID3Tree(df, 'play')
    id3.construct_tree()
    id3.print_tree(id3.root, '')


if __name__ == '__main__':
    # test_best_gain_info()
    test_tree_construct()
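The scripts above and below read './example_data.csv'. Judging from the entropy numbers used earlier, this is the classic 14-row play-tennis weather dataset; if the file is not available, a dataframe with the assumed columns (outlook, temp, humidity, windy, play) can be built in memory and passed to the tree classes directly. The rows below are a reconstruction of the standard dataset, not the actual file contents:
# Hypothetical reconstruction of example_data.csv (the classic play-tennis dataset)
weather_df = pd.DataFrame({
    'outlook':  ['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'rainy', 'overcast',
                 'sunny', 'sunny', 'rainy', 'sunny', 'overcast', 'overcast', 'rainy'],
    'temp':     ['hot', 'hot', 'hot', 'mild', 'cool', 'cool', 'cool',
                 'mild', 'cool', 'mild', 'mild', 'mild', 'hot', 'mild'],
    'humidity': ['high', 'high', 'high', 'high', 'normal', 'normal', 'normal',
                 'high', 'normal', 'normal', 'normal', 'high', 'normal', 'high'],
    'windy':    ['false', 'true', 'false', 'false', 'false', 'true', 'true',
                 'false', 'false', 'false', 'true', 'true', 'false', 'true'],
    'play':     ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
                 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no'],
})
id3 = ID3Tree(weather_df, 'play')
id3.construct_tree()
id3.print_tree(id3.root, '')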
Building a classification tree with CART:

The Gini index is defined for a probability distribution. Suppose a classification problem has K classes and the probability of a sample belonging to class k is p_k; then the Gini index of that distribution is:

Gini(p) = sum_{k=1}^{K} p_k * (1 - p_k) = 1 - sum_{k=1}^{K} p_k^2

The corresponding conditional Gini index, i.e. the Gini index of dataset D given feature A, is the size-weighted average over the subsets D_i produced by splitting D on the values of A:

Gini(D, A) = sum_i (|D_i| / |D|) * Gini(D_i)
import numpy as np
import pandas as pd


# Compute the Gini index of a list of labels
def gini(nums):
    probs = [nums.count(i) / len(nums) for i in set(nums)]
    gini = sum([p * (1 - p) for p in probs])
    return gini


def split_dataframe(data, col):
    '''
    function: split a pandas dataframe into sub-dataframes based on a column.
    input: dataframe, column name.
    output: a dict of split dataframes keyed by the column's unique values.
    '''
    # unique values of the column
    unique_values = data[col].unique()
    # empty dict of dataframes
    result_dict = {elem: pd.DataFrame for elem in unique_values}
    # split the dataframe based on the column value
    for key in result_dict.keys():
        result_dict[key] = data[:][data[col] == key]
    return result_dict


def choose_best_col(df, label):
    '''
    function: choose the best column based on the conditional Gini index.
    input: dataframe, label.
    output: minimum conditional Gini index, best column,
            split dataframe dict based on the best column.
    '''
    # Gini index of the label itself
    gini_D = gini(df[label].tolist())
    # all columns except the label
    cols = [col for col in df.columns if col not in [label]]
    # initialize the minimum conditional Gini index, best column and best split dict
    min_value, best_col = 999, None
    min_splited = None
    # split the data on each candidate column
    for col in cols:
        splited_set = split_dataframe(df, col)
        gini_DA = 0
        for subset_col, subset in splited_set.items():
            # Gini index of the subset's labels
            gini_Di = gini(subset[label].tolist())
            # size-weighted Gini index of the current feature
            gini_DA += len(subset) / len(df) * gini_Di
        if gini_DA < min_value:
            min_value, best_col = gini_DA, col
            min_splited = splited_set
    return min_value, best_col, min_splited


def test_gini():
    lst = ['a', 'b', 'c', 'd', 'b', 'c', 'a', 'b', 'c', 'd', 'a']
    res = gini(lst)
    print('=res:', res)


def test_csv_gini():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    res = gini(df['play'].tolist())
    print('=res:', res)


def test_split_dataframe():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    res = split_dataframe(df, 'temp')
    print('=res:', res.keys())
    print("=====res['mild']:\n", res['mild'])


def test_choose_best_col():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    min_value, best_col, min_splited = choose_best_col(df, 'play')
    print('==min_value:', min_value)
    print('==best_col:', best_col)
    print('==min_splited:', min_splited)


class CartTree:
    # define a Node class: a name plus labelled connections to child nodes
    class Node:
        def __init__(self, name):
            self.name = name
            self.connections = {}

        def connect(self, label, node):
            self.connections[label] = node

    def __init__(self, data, label):
        self.columns = data.columns
        self.data = data
        self.label = label
        self.root = self.Node("Root")

    # print tree method
    def print_tree(self, node, tabs):
        print(tabs + node.name)
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + connection + ")")
            self.print_tree(child_node, tabs + "\t\t")

    def construct_tree(self):
        self.construct(self.root, "", self.data, self.columns)

    # construct tree
    def construct(self, parent_node, parent_connection_label, input_data, columns):
        min_value, best_col, min_splited = choose_best_col(input_data[columns], self.label)
        # no column left to split on: create a leaf node with the remaining label
        if not best_col:
            node = self.Node(input_data[self.label].iloc[0])
            parent_node.connect(parent_connection_label, node)
            return
        node = self.Node(best_col)
        parent_node.connect(parent_connection_label, node)
        new_columns = [col for col in columns if col != best_col]
        # recursively construct the subtree for each split value
        for splited_value, splited_data in min_splited.items():
            self.construct(node, splited_value, splited_data, new_columns)


def test_construct_tree():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    tree1 = CartTree(df, 'play')
    tree1.construct_tree()
    tree1.print_tree(tree1.root, "")


if __name__ == '__main__':
    # test_gini()
    # test_csv_gini()
    # test_split_dataframe()
    # test_choose_best_col()
    test_construct_tree()
Install graphviz to visualize the decision tree:
apt-get install graphviz
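The code below also imports pydotplus and scikit-learn; assuming they are not already installed, they can be added with pip:
pip install pydotplus scikit-learn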
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
import numpy as np
import matplotlib.pyplot as plt

X = np.array([[2, 2],
              [2, 1],
              [2, 3],
              [1, 2],
              [1, 1],
              [3, 3]])
y = np.array([0, 1, 1, 1, 0, 1])

plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18
plt.figure(figsize=(8, 8))
# Plot each point as its label
for x1, x2, label in zip(X[:, 0], X[:, 1], y):
    plt.text(x1, x2, str(label), fontsize=40, color='g',
             ha='center', va='center')
plt.grid(None)
plt.xlim((0, 3.5))
plt.ylim((0, 3.5))
plt.xlabel('x1', size=20)
plt.ylabel('x2', size=20)
plt.title('Data', size=24)
# plt.show()

dec_tree = DecisionTreeClassifier()
print(dec_tree)
dec_tree.fit(X, y)
print(dec_tree.score(X, y))

# Export the tree as dot and render it to a PNG
dot_data = tree.export_graphviz(dec_tree, out_file=None,
                                feature_names=['x1', 'x2'],
                                class_names=['0', '1'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
with open('1.png', 'wb') as f:
    f.write(graph.create_png())
Every node except the leaf (terminal) nodes has 5 parts:

- A question about the data based on the value of one feature. Each question has a True or False answer, and data points move through the tree according to the answer.
- gini: the Gini impurity of the node. The weighted average Gini impurity must decrease as we move down the tree.
- samples: the number of observations in the node.
- value: the number of samples in each class. For example, the top node has 2 samples in class 0 and 4 samples in class 1.
- class: the majority class of the points in the node (ties default to 0). For leaf nodes, this is the prediction for all samples in that node.
The Gini impurity of a node is:

Gini = 1 - sum_{i=1}^{J} p_i^2

where p_i is the fraction of the node's samples belonging to class i and J is the number of classes.

For the root node, which holds 2 samples of class 0 and 4 samples of class 1:

Gini(root) = 1 - ((2/6)^2 + (4/6)^2) ≈ 0.444

In the second level of this tree, the leftmost node has a Gini impurity of 0.5, which may look like the impurity increased. However, what must decrease at each level is the weighted average Gini impurity, where each node is weighted by the fraction of its parent's samples it contains. With the left node holding 4 samples (Gini 0.5) and the right node holding 2 pure samples (Gini 0.0), the weighted Gini impurity of the second level is

(4/6) * 0.5 + (2/6) * 0.0 ≈ 0.333

which is indeed lower than the root's 0.444.

In the final level, every node reaches a Gini impurity of 0.0, meaning each node contains samples of a single class only. This is what we expect: we did not limit the tree depth, so the tree can add as many levels as it needs to classify every data point. Although the model classifies all training points correctly, that does not make it a good model, because it has most likely overfit the training data.
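As a quick sanity check, the root impurity can be recomputed from the label array y defined above (a minimal sketch; the helper name is my own):
import numpy as np

def gini_impurity(labels):
    # fraction of samples in each class
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

print(gini_impurity(np.array([0, 1, 1, 1, 0, 1])))  # ~0.444, matching the root node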
2. Decision tree example 2
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np


def plot_feature_importances(clf, feature_names):
    """
    Visualize the feature importances of a fitted classifier.
    """
    c_features = len(feature_names)
    plt.barh(range(c_features), clf.feature_importances_)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature name')
    plt.yticks(np.arange(c_features), feature_names)
    plt.show()


iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

max_depth_values = [2, 3, 4]
for max_depth_val in max_depth_values:
    dt_model = DecisionTreeClassifier(max_depth=max_depth_val)
    dt_model.fit(X_train, y_train)
    print('max_depth=', max_depth_val)
    print('Accuracy on the training set: {:.3f}'.format(dt_model.score(X_train, y_train)))
    print('Accuracy on the test set: {:.3f}'.format(dt_model.score(X_test, y_test)))
    print()

dt_model = DecisionTreeClassifier(max_depth=4)
dt_model.fit(X_train, y_train)
print(iris.feature_names)
print(dt_model.feature_importances_)
plot_feature_importances(dt_model, iris.feature_names)
Random forests

A random forest is a model made up of many decision trees. It is not just a forest but a random forest, and the randomness comes from two ideas:
1. Random sampling of the training data points
2. Splitting nodes on random subsets of the features

Random sampling

One key to a random forest is that each tree is trained on a random sample of the data points. These samples are drawn with replacement (known as bootstrapping), which means some samples are used multiple times when training a single tree (this can be disabled if desired). The idea is that by training each tree on different samples, each individual tree may have high variance with respect to its particular subset of the training data, but the forest as a whole has low variance. This procedure of training each individual learner on a different subset of the data and then averaging the predictions is called bagging, short for bootstrap aggregating.
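A minimal sketch of bootstrap sampling with numpy (the sample size and variable names are illustrative only):
import numpy as np

rng = np.random.default_rng(0)
n_samples = 10
# Draw a bootstrap sample: n_samples indices drawn with replacement,
# so some rows appear several times and others not at all
bootstrap_idx = rng.choice(n_samples, size=n_samples, replace=True)
print(bootstrap_idx)
print(np.unique(bootstrap_idx).size)  # usually fewer unique indices than n_samples (~63% on average)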
Random subsets of features

The other core idea of a random forest is that each tree considers only a subset of all features when splitting each node. This is usually set to sqrt(n_features), meaning that at each node the tree considers splitting on a number of features equal to the square root of the total number of features. A random forest can also be trained considering all features at every node (in the Scikit-Learn implementation these options are configurable). A random forest combines hundreds or thousands of decision trees, trains each one on a slightly different set of observations (data points sampled with replacement), and splits the nodes of each tree on a limited number of features. The final prediction of the random forest is the average of the predictions of the individual trees.
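A sketch of how these two options appear in scikit-learn's RandomForestClassifier (the hyperparameter values are illustrative, not tuned):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

# bootstrap=True resamples the data points with replacement for every tree;
# max_features='sqrt' considers only sqrt(n_features) candidate features at each split
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt',
                            bootstrap=True, random_state=0)
rf.fit(X_train, y_train)
print('Test accuracy: {:.3f}'.format(rf.score(X_test, y_test)))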