A decision tree classifies a sample by making one decision at a time until it reaches a leaf node. (A bit brief for now... to be fleshed out later.)
Related background:
- Information entropy
- Conditional entropy
- Information gain
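For reference, these are the standard ID3 definitions that the code below implements, where $D$ is the sample set, $A$ a feature, $p_k$ the fraction of samples with label $k$, and $D_v$ the subset of $D$ in which feature $A$ takes value $v$:

$$H(D) = -\sum_k p_k \log_2 p_k, \qquad H(D \mid A) = \sum_v \frac{|D_v|}{|D|}\, H(D_v), \qquad g(D, A) = H(D) - H(D \mid A)$$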
import numpy as np

# Compute information entropy
def calcInfoEntropy(label):
    '''
    input:
        label(ndarray): sample labels
    output:
        InfoEntropy(float): information entropy
    '''
    # Count the occurrences of each label value
    sv = {}
    for v in label:
        if v in sv:
            sv[v] += 1
        else:
            sv[v] = 1
    # H(D) = -sum_k p_k * log2(p_k)
    InfoEntropy = 0
    for v in sv:
        p = sv[v] / len(label)
        InfoEntropy -= p * np.log2(p)
    return InfoEntropy
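As a quick sanity check (toy labels invented for illustration): a 50/50 split over two classes has the maximum entropy of 1 bit, while a skewed split has less:

print(calcInfoEntropy(np.array([0, 0, 1, 1])))  # 1.0
print(calcInfoEntropy(np.array([0, 0, 0, 1])))  # ≈ 0.811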
# Compute one term of the conditional entropy
def calcHDA(feature, label, index, value):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
        index(int): index of the feature column to condition on
        value(int): the value of that feature column to examine
    output:
        HDA(float): p(A=value) * H(D | A=value), the weighted entropy of the
                    sub-dataset where feature column index equals value
    '''
    # sub_label holds the labels of the sub-dataset selected by the feature value
    mask = feature[:, index] == value
    sub_label = label[mask]
    # p(A=value): the fraction of samples that fall into this subset
    pHA = sub_label.shape[0] / len(feature)
    # Weight the subset's entropy by its probability
    HDA = pHA * calcInfoEntropy(sub_label)
    return HDA
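On a toy dataset (invented for illustration) where column 0 perfectly separates the labels and column 1 does not, each term behaves as expected:

feature = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
label = np.array([0, 0, 1, 1])
print(calcHDA(feature, label, 0, 0))  # 0.0: the subset's labels are all 0, so its entropy is 0
print(calcHDA(feature, label, 1, 0))  # 0.5: labels [0, 1] give entropy 1.0, weighted by p = 1/2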
# Compute information gain
def calcInfoGain(feature, label, index):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
        index(int): index of the feature column, i.e. which feature to use;
                    e.g. index=0 computes the information gain of the first feature
    output:
        InfoGain(float): information gain
    '''
    base_e = calcInfoEntropy(label)
    f = np.array(feature)
    # Distinct values taken by the chosen feature column
    f_set = set(f[:, index])
    # Conditional entropy H(D|A): sum of the weighted subset entropies
    sum_HDA = 0
    for value in f_set:
        sum_HDA += calcHDA(feature, label, index, value)
    # Information gain g(D, A) = H(D) - H(D|A)
    InfoGain = base_e - sum_HDA
    return InfoGain
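Continuing with the toy arrays above: column 0 determines the label completely, while column 1 is independent of it:

print(calcInfoGain(feature, label, 0))  # 1.0 = H(D) - 0
print(calcInfoGain(feature, label, 1))  # 0.0 = H(D) - H(D)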
# Find the feature with the highest information gain
def getBestFeature(feature, label):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
    output:
        best_feature(int): index of the feature with the highest information gain
    '''
    best_feature = -1
    maxn = -float('inf')
    for i in range(feature.shape[1]):
        tmp = calcInfoGain(feature, label, i)
        if tmp > maxn:
            best_feature = i
            maxn = tmp
    return best_feature
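On the same toy data, column 0 has the higher gain and is selected:

print(getBestFeature(feature, label))  # 0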
# Build the decision tree
def createTree(feature, label):
    '''
    input:
        feature(ndarray): training sample features
        label(ndarray): training sample labels
    output:
        tree(dict): decision tree model
    '''
    # If all samples share the same label, there is no need to split further
    if len(set(list(label))) == 1:
        return label[0]
    # If only one feature is left, or all samples have identical features,
    # fall back to a majority vote over the labels
    if feature.shape[1] == 1 or (feature == feature[0]).all():
        return np.argmax(np.bincount(label))
    # Pick the feature index with the highest information gain
    bf = getBestFeature(feature, label)
    tree = {bf: {}}
    # All distinct values of the best feature
    fvalue = list(set(feature[:, bf]))
    # Build the sub-dataset sub_feature, sub_label for each feature value
    for v in fvalue:
        sub_feature = feature[feature[:, bf] == v, :]
        sub_label = label[feature[:, bf] == v]
        # Recurse on the subset
        tree[bf][v] = createTree(sub_feature, sub_label)
    return tree
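On the toy data, the result is a nested dict keyed first by the chosen feature index and then by that feature's values, with labels at the leaves:

tree = createTree(feature, label)
print(tree)  # {0: {0: 0, 1: 1}} (keys/values may print as numpy scalars)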
# Classify a single sample by walking the tree
def pred(tree, feature):
    # A non-dict node is a leaf holding the predicted label
    if not isinstance(tree, dict):
        return tree
    # The only key of an internal node is the index of the feature to test
    f = list(tree.keys())[0]
    # Descend into the branch matching this sample's value of that feature
    return pred(tree[f][feature[f]], feature)
# Decision tree classification
def dt_clf(train_feature, train_label, test_feature):
    '''
    input:
        train_feature(ndarray): training sample features
        train_label(ndarray): training sample labels
        test_feature(ndarray): test sample features
    output:
        predict(ndarray): predicted labels of the test samples
    '''
    # Build the decision tree
    tree = createTree(train_feature, train_label)
    # Classify each test sample by walking the tree
    predict = []
    for f in test_feature:
        predict.append(pred(tree, f))
    return np.array(predict)
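An end-to-end run on the toy data (note that pred raises a KeyError if a test sample carries a feature value never seen on that branch during training):

train_feature = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
train_label = np.array([0, 0, 1, 1])
test_feature = np.array([[0, 1], [1, 1]])
print(dt_clf(train_feature, train_label, test_feature))  # [0 1]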