Python 机器学习:决策树算法
这是用 Python 学习机器学习系列的第二篇。
代码如下:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
# Tree node structure: an autovivifying nested mapping.
def tree():
    """Return a node for the decision tree.

    Missing keys transparently create fresh child nodes, so the tree can be
    built by plain assignment (``node['child_x']['y_hat'] = ...``) without
    pre-creating intermediate levels.
    """
    return defaultdict(tree)
# 决策树
class DecisionTree:
# 初始化
def __init__(self, criterion='gini', max_depth=10):
    """Configure an (initially empty) decision tree.

    criterion: split criterion — 'ID3', 'C4.5' (entropy-based) or 'gini'.
    max_depth: pre-pruning limit on the depth of the grown tree.
    """
    self.criterion = criterion  # split criterion (ID3 / C4.5 / gini)
    self.max_depth = max_depth  # maximum allowed tree depth (pre-pruning)
    self.tree = tree()          # fitted tree; populated by fit()
# 拟合函数
def fit(self, x, y):
    """Grow the decision tree from samples ``x`` and labels ``y``.

    Training starts the recursion at depth 1 and stores the resulting
    nested-dict tree on the instance. Returns ``self`` for chaining.
    """
    self.tree = self.CreateTree(x, y, 1)
    return self
# 预测多个样本
def predict(self, x):
return np.array([self.hat(i, self.tree) for i in x])
# 预测某个样本
def hat(self, x, tree):
index = tree['index']
x_hat = x[index]
key = self.select_close(x_hat, tree["A"])
if tree["y_hat"].__len__() != 0:
return float(tree['y_hat'])
else:
return self.hat(x, tree['child_' + str(key)])
# 选取最匹配的key值
def select_close(self, index, dic):
result = index
for key, value in dic.items():
if key > 0 and index >= key:
result=key
elif key <0 and index<(key*-1):
result=key
return result
# 递归生成树
def CreateTree(self, x, y, depth):
tree1 = tree()
data = np.hstack((x, y))
acount_A = len(x[0])
d = [] # 熵列表
l=[]#最佳值列表
# 遍历获取熵值列表
for i in range(acount_A):
x1 = x[:, i]
bestX=self.calculate_BestNum(x1,y)
x1 = self.calculate_sortInTwo(x1, bestX)
d.append(self.calculateEntropy(x1, y))
l.append(bestX)
max1 = max(d) # 最大熵值
max_index = d.index(max1) # 获得最大熵的索引值
x1 = self.calculate_sortInTwo(x[:, max_index],l[max_index])
A = self.calculate_N(x1) # 最大熵的特征值分类
# 节点符值
tree1['Entropy'] = max1
tree1['Sample'] = len(data)
tree1['index'] = max_index
tree1['A'] = A
tree1['depth'] = depth
# 判定熵值和树深是否有效(预剪枝)
if max1 > 0.001 and self.max_depth > depth:
Cr = max1*1*len(data)
tree1['Cr'] = Cr # 剪枝后的评价数
x3 = list(x[:, max_index])
for key, value in A.items():
indexs = self.calculate_indexInList(x3, key)