根据决策树CART的原理用Python3写出,代码如下:
from random import randrange
def split_numerical(attribute_index, thresh, datasets):
    """Partition rows on one numeric attribute against a threshold.

    Rows whose value at ``attribute_index`` is strictly below ``thresh``
    go to the left group; all other rows go to the right group.

    Returns a ``(left, right)`` tuple of row lists.
    """
    below = [row for row in datasets if row[attribute_index] < thresh]
    at_or_above = [row for row in datasets if row[attribute_index] >= thresh]
    return below, at_or_above
def cal_gini(split_datas, labels):
    """Weighted Gini index of a split.

    ``split_datas`` holds the groups produced by a split (left and right
    partitions of attribute A); ``labels`` is the set of class labels.
    The class label is assumed to be the last element of each row.

    Returns sum over groups of (|group| / N) * Gini(group).
    """
    # Total number of samples across all groups.
    N = sum(len(d) for d in split_datas)
    gini_S_A = 0.0
    for d in split_datas:
        N_t = len(d)
        # Skip empty groups to avoid division by zero.
        if N_t == 0:
            continue
        # Gini(group) = 1 - sum_k p_k^2, where p_k is the fraction of
        # samples in the group belonging to class k.
        group_labels = [r[-1] for r in d]
        sum_p_tk = 0.0
        for label in labels:
            p_tk = group_labels.count(label) / N_t
            sum_p_tk += p_tk * p_tk
        gini_S = 1 - sum_p_tk
        # BUG FIX: accumulate with '+=' — the original used '=' so only
        # the *last* group's weighted Gini was returned, not the
        # weighted sum over both sides of the split.
        gini_S_A += N_t / N * gini_S
    return gini_S_A
def split_all_attributes(datasets):
    """Find the attribute/threshold pair with the lowest Gini index.

    Every value of every attribute is tried as a candidate threshold;
    the best split found is returned as a decision-node dict with keys
    ``best_attribute_index``, ``best_thresh`` and ``best_split_datas``.
    """
    labels = list(set(row[-1] for row in datasets))
    best_index, best_thresh, best_gini = 999, 999, 999
    best_groups = None
    n_attributes = len(datasets[0]) - 1  # last column is the class label
    for idx in range(n_attributes):
        for row in datasets:
            candidate = row[idx]
            groups = split_numerical(idx, candidate, datasets)
            score = cal_gini(groups, labels)
            if score < best_gini:
                best_index = idx
                best_thresh = candidate
                best_gini = score
                best_groups = groups
    return {'best_attribute_index': best_index,
            'best_thresh': best_thresh,
            'best_split_datas': best_groups}
def to_leaf_node(split_data):
    """Collapse a partition into a leaf: the majority class label."""
    counts = {}
    for row in split_data:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    return max(counts, key=counts.get)
def build_tree(train_data, max_depth, min_size_leaf):
    """Grow a CART tree from ``train_data`` and return its root node.

    ``max_depth`` and ``min_size_leaf`` are the stopping criteria
    applied during recursive splitting.
    """
    root_node = split_all_attributes(train_data)
    recursive_split(root_node, max_depth, min_size_leaf, 1)
    return root_node
def recursive_split(node, max_depth, min_size_leaf, depth):
    """Recursively turn a decision node's partitions into subtrees.

    Stopping rules: an empty side (no effective split), reaching
    ``max_depth``, or a partition no larger than ``min_size_leaf``.
    The node dict is modified in place: 'best_split_datas' is removed
    and 'left'/'right' children (sub-dicts or leaf labels) are added.
    """
    left, right = node['best_split_datas']
    del node['best_split_datas']  # partitions are consumed here
    # No effective split: everything went one way -> single shared leaf.
    if not left or not right:
        node['left'] = node['right'] = to_leaf_node(left + right)
        return
    # BUG FIX: stop recursing once max_depth is reached. The original
    # fell through without 'return', immediately overwriting these
    # leaves below, so max_depth was never actually honored.
    if depth >= max_depth:
        node['left'], node['right'] = to_leaf_node(left), to_leaf_node(right)
        return
    # Left child: leaf if small enough, otherwise keep splitting.
    if len(left) <= min_size_leaf:
        node['left'] = to_leaf_node(left)
    else:
        node['left'] = split_all_attributes(left)
        recursive_split(node['left'], max_depth, min_size_leaf, depth + 1)
    # Right child: same rule.
    if len(right) <= min_size_leaf:
        node['right'] = to_leaf_node(right)
    else:
        node['right'] = split_all_attributes(right)
        recursive_split(node['right'], max_depth, min_size_leaf, depth + 1)
def print_tree(node, depth=0):
    """Pretty-print the tree, one node per line, indented by depth.

    Internal nodes render as ``[Xi < t]`` (1-based attribute index),
    leaves as ``[label]``.
    """
    indent = depth * ' '
    if not isinstance(node, dict):
        print('%s[%s]' % (indent, node))
        return
    print('%s[X%d < %.3f]' % (indent, node['best_attribute_index'] + 1,
                              node['best_thresh']))
    print_tree(node['left'], depth + 1)
    print_tree(node['right'], depth + 1)
def predict(node, test_row):
    """Route ``test_row`` down the tree and return the predicted label.

    At each internal node, go left when the row's value on the node's
    attribute is strictly below the node's threshold, else go right.
    """
    side = 'left' if test_row[node['best_attribute_index']] < node['best_thresh'] else 'right'
    child = node[side]
    if isinstance(child, dict):
        return predict(child, test_row)
    return child
def accuracy(actual, predicted):
    """Percentage of positions where ``actual`` and ``predicted`` agree.

    Assumes both sequences have the same length.
    """
    hits = sum(a == p for a, p in zip(actual, predicted))
    return hits / len(actual) * 100
def cross_validation_split(datasets, k_folds):
    """Randomly partition ``datasets`` into ``k_folds`` equal folds.

    Rows are drawn without replacement via ``randrange``; when
    ``len(datasets)`` is not divisible by ``k_folds`` the leftover rows
    are simply dropped. The input list itself is not modified.
    """
    pool = list(datasets)
    fold_size = len(datasets) // k_folds
    folds = []
    for _ in range(k_folds):
        fold = [pool.pop(randrange(len(pool))) for _ in range(fold_size)]
        folds.append(fold)
    return folds
def decision_tree(train_data, test_data, max_depth, min_size_leaf):
    """Train a CART tree on ``train_data`` and classify ``test_data``.

    Returns one predicted label per test row.
    """
    model = build_tree(train_data, max_depth, min_size_leaf)
    predictions = []
    for row in test_data:
        predictions.append(predict(model, row))
    return predictions
def evaluate_algorithm(datasets, algrithm, k_folds, *args):
    """k-fold cross-validation of ``algrithm`` over ``datasets``.

    For each fold: train on the other k-1 folds, predict the held-out
    fold (with labels stripped), and record the accuracy. Returns the
    list of k accuracy scores. Extra ``*args`` are forwarded to the
    algorithm (e.g. max_depth, min_size_leaf).
    """
    folds = cross_validation_split(datasets, k_folds)
    scores = []
    for fold in folds:
        # BUG FIX: copy 'folds' before removing the held-out fold.
        # The original did 'train_data = folds' (an alias) and then
        # removed from it *while iterating over it*, so roughly every
        # other fold was skipped — k_folds=5 yielded only 3 scores.
        train_data = list(folds)
        train_data.remove(fold)
        train_data = sum(train_data, [])  # flatten remaining folds
        # Strip the class label so the model cannot see it at test time.
        test_data = [row[:-1] for row in fold]
        predicted = algrithm(train_data, test_data, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy(actual, predicted))
    return scores
应用iris数据,将代码放在上面代码的后面即可:
# Demo: evaluate the hand-written CART implementation on the iris dataset.
from random import seed
import numpy as np
from sklearn.datasets import load_iris
seed(1)
iris = load_iris()
data = iris.data
target = iris.target
# Append the class label as the last column and convert to plain lists,
# the row format every function above expects.
datasets = np.c_[data, target].tolist()
k_folds = 5
max_depth = 5
min_size_leaf = 10
scores = evaluate_algorithm(datasets, decision_tree, k_folds, max_depth, min_size_leaf)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / len(scores)))
# Sample output:
# Scores: [86.66666666666667, 93.33333333333333, 86.66666666666667]
# Mean Accuracy: 88.889%
# NOTE(review): only 3 scores appear despite k_folds=5 — a symptom of
# evaluate_algorithm removing entries from 'folds' while iterating it.
对比sklearn,将代码放在上面代码后面即可:
# Comparison: scikit-learn's DecisionTreeClassifier on the same data,
# plus a graphviz rendering of the fitted tree.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
X, y = load_iris(return_X_y=True)
# 70/30 train/test split (no random_state, so results vary per run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# NOTE(review): the score is computed but never printed or stored;
# wrap it in print(...) to actually see the test accuracy.
clf.score(X_test, y_test)
# Export the fitted tree to DOT format and render it with graphviz.
dot_data = export_graphviz(clf, out_file=None, feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render()