Decision trees are a fundamental method for classification and regression.
I. Basic Concepts of Decision Trees
- The nodes and directed edges of a decision tree represent, respectively:
  - An internal node represents a feature (attribute).
  - A leaf node represents a class.
  - A directed edge represents a splitting rule.
- The directed edges from the root node down to a leaf form a path.
- The paths of a decision tree are mutually exclusive and exhaustive (no overlap, no omission).
- To classify a sample, the tree tests one of the sample's features at each internal node and, according to the result, assigns the sample to the child node corresponding to the observed value of that feature.
- Advantages of decision trees: strong readability and fast classification.

A decision tree follows the divide-and-conquer idea and can be viewed as a collection of if-then rules, as the sketch below illustrates.
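A minimal sketch of that view (the feature names and thresholds below are made up purely for illustration): a small tree over two numeric features is just nested conditionals.

```python
# A tiny decision tree written out as nested if/else rules.
# The features "petal_length"/"petal_width" and the thresholds
# are hypothetical, chosen only to show the structure.
def classify(petal_length: float, petal_width: float) -> str:
    if petal_length <= 2.45:          # internal node: test one feature
        return "setosa"               # leaf node: a class label
    else:
        if petal_width <= 1.75:       # another internal node
            return "versicolor"
        else:
            return "virginica"

print(classify(1.4, 0.2))  # -> "setosa"
```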
II. Decision Tree Metrics
1. Information Entropy / Purity
Information entropy measures uncertainty: the larger the entropy, the more uncertain the information. For classification, the larger the entropy of the current class distribution, the greater its uncertainty, the worse the classification, and the less pure the set.
H(D) = Ent(D) = -\sum_{i=1}^{n} p_i \log_2 p_i
(where n is the number of classes and p_i is the probability of class i)
An entropy of 0 means the information is fully determined, i.e. the classification is complete.
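A minimal sketch of this formula (the label lists below are hypothetical):

```python
import math
from collections import Counter

def entropy(labels):
    """Shannon entropy H(D) = -sum_i p_i * log2(p_i) over class frequencies."""
    counts = Counter(labels).values()
    if len(counts) == 1:     # pure set: entropy is 0 by convention
        return 0.0
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in counts)

print(entropy([1, 1, 1, 1]))  # 0.0 -> pure set, classification complete
print(entropy([1, 1, 0, 0]))  # 1.0 -> maximal uncertainty for two classes
```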
2. Information Gain
If a discrete attribute A has V possible values and D is split on A, V branch nodes are produced, the v-th carrying weight \frac{|D^v|}{|D|}. The resulting reduction in entropy is the information gain:
Gain(D, A) = Ent(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|} Ent(D^v)
The larger the information gain, the greater the purity improvement obtained by splitting on attribute A.
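A minimal sketch of this formula for a discrete attribute, on an invented four-sample set:

```python
import math
from collections import Counter

def entropy(labels):
    counts = Counter(labels).values()
    if len(counts) == 1:
        return 0.0
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in counts)

def info_gain(values, labels):
    """Gain(D, A) = Ent(D) - sum_v |D^v|/|D| * Ent(D^v) for a discrete attribute A."""
    n = len(labels)
    gain = entropy(labels)
    for v in set(values):
        subset = [y for x, y in zip(values, labels) if x == v]
        gain -= len(subset) / n * entropy(subset)
    return gain

A = ['a', 'a', 'b', 'b']   # attribute values (hypothetical)
y = [1, 1, 0, 0]           # class labels    (hypothetical)
print(info_gain(A, y))      # 1.0: this split separates the classes perfectly
```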
3. ID3
ID3 (Iterative Dichotomiser 3) uses the information gain criterion to select features and to judge the classifier's performance, and builds the decision tree accordingly.
Algorithm idea: given labeled data, use information gain and an exhaustive scan to choose, at each node, the feature and threshold with the maximum information gain; this yields a classification decision tree.
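For a quick library-based cross-check (an approximation only: scikit-learn implements CART rather than textbook ID3), `DecisionTreeClassifier` with `criterion="entropy"` selects splits by the same entropy-based gain:

```python
# Requires scikit-learn. criterion="entropy" makes CART score splits with
# entropy impurity, approximating ID3's information-gain selection rule.
from sklearn.tree import DecisionTreeClassifier, export_text

X = [[0, 0], [0, 1], [1, 0], [1, 1]]  # toy feature matrix (hypothetical)
y = [0, 0, 1, 1]                      # label equals feature f0 here

clf = DecisionTreeClassifier(criterion="entropy").fit(X, y)
print(export_text(clf, feature_names=["f0", "f1"]))  # the tree splits on f0 only
```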
4. Gain Ratio, C4.5, and the Gini Index
The information gain criterion is biased toward attributes with many possible values. To reduce the harm this bias can cause, the C4.5 decision tree algorithm uses the "gain ratio":
Gain\_ratio(D, a) = \frac{Gain(D, a)}{IV(a)}
IV(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|} \log_2 \frac{|D^v|}{|D|}
Here IV(a) is called the intrinsic value of a: the more possible values a has, the larger IV(a) tends to be.
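A matching sketch of the intrinsic value (attribute-value lists again invented); dividing the information gain sketched earlier by IV(a) gives the gain ratio:

```python
import math
from collections import Counter

def intrinsic_value(values):
    """IV(a) = -sum_v |D^v|/|D| * log2(|D^v|/|D|): the entropy of the split itself."""
    n = len(values)
    return -sum((c / n) * math.log2(c / n) for c in Counter(values).values())

# gain_ratio = info_gain(values, labels) / intrinsic_value(values)
print(intrinsic_value(['a', 'a', 'b', 'b']))  # 1.0: two even branches
print(intrinsic_value(['a', 'b', 'c', 'd']))  # 2.0: a many-valued attribute is penalized
```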
The purity of a dataset D can also be measured by the Gini index:
Gini(D) = \sum_{k=1}^{|y|} \sum_{k' \neq k} p_k p_{k'} = 1 - \sum_{k=1}^{|y|} p_k^2
The smaller Gini(D), the higher the purity of D.
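And the Gini index, under the same hypothetical-data caveat:

```python
from collections import Counter

def gini(labels):
    """Gini(D) = 1 - sum_k p_k^2 over class frequencies."""
    n = len(labels)
    return 1.0 - sum((c / n) ** 2 for c in Counter(labels).values())

print(gini([1, 1, 1, 1]))  # 0.0 -> pure
print(gini([1, 1, 0, 0]))  # 0.5 -> maximally impure for two classes
```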
III. The Decision Tree Algorithm
A decision tree learning algorithm typically scans all candidate features (and split values), selects the optimal one, and partitions the training data on it so that each resulting subset is classified as well as possible. This process corresponds to partitioning the feature space and, equivalently, to constructing the tree.
Selecting the optimal feature:
1. Discrete features: compute the information gain directly from the formula and take the feature with the maximum gain.
2. Continuous features: discretize them, using the bi-partition (binary split) method to handle continuous values, as sketched below.
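A hedged sketch of that bi-partition step (values and labels invented): candidate thresholds are the midpoints of adjacent sorted values, and the midpoint with the largest information gain wins.

```python
import math
from collections import Counter

def entropy(labels):
    counts = Counter(labels).values()
    if len(counts) == 1:
        return 0.0
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in counts)

def best_threshold(values, labels):
    """Try every midpoint of adjacent sorted values; return (threshold, gain)."""
    base = entropy(labels)
    n = len(labels)
    best_t, best_gain = None, 0.0
    for lo, hi in zip(sorted(values), sorted(values)[1:]):
        t = (lo + hi) / 2
        left = [y for x, y in zip(values, labels) if x <= t]
        right = [y for x, y in zip(values, labels) if x > t]
        gain = base - len(left) / n * entropy(left) - len(right) / n * entropy(right)
        if gain > best_gain:
            best_t, best_gain = t, gain
    return best_t, best_gain

print(best_threshold([0.1, 0.2, 0.8, 0.9], [0, 0, 1, 1]))  # (0.5, 1.0)
```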
Tree building is recursive, and the recursion returns in three cases:
1. All samples in D belong to the same class.
2. D is empty, so no split is possible.
3. All samples in D take identical values on every attribute, or the attribute set is empty.
IV. Decision Tree ROC Curve
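As background: an ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) as the classification threshold varies, and the area under the curve (AUC) summarizes how well the model ranks positive samples above negative ones. In the implementation below, `roc_draw` accumulates per-leaf counts of positive and negative test samples, traces the curve from them, and returns the AUC.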
V. Implementation of a Decision Tree with Continuous Values
import numpy as np
import math
from matplotlib import pyplot as plt


def loadDataSet(trainfile):  # load the data; each line: label, then eight feature values, comma-separated
    dataMat = []
    lable = []
    lablename = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    fr = open(trainfile)
    for line in fr.readlines():
        lineArr = line.strip().split(',')
        dataMat.append([float(lineArr[1]), float(lineArr[2]), float(lineArr[3]), float(lineArr[4]),
                        float(lineArr[5]), float(lineArr[6]), float(lineArr[7]), float(lineArr[8])])
        lable.append(float(lineArr[0]))
    return dataMat, lable, lablename


def splitdata0(dataset, lable, lable_position, best_t):  # samples with value <= best_t, with the split attribute's column removed
    splitDataSet = []
    splitlable = []
    for i in range(len(dataset)):
        if dataset[i][lable_position] <= best_t:
            DataSet = dataset[i][:lable_position]  # excludes the column at lable_position
            DataSet.extend(dataset[i][lable_position + 1:])
            splitDataSet.append(DataSet)
            splitlable.append(lable[i])
    return splitDataSet, splitlable


def splitdata1(dataset, lable, lable_position, best_t):  # samples with value > best_t, with the split attribute's column removed
    splitDataSet = []
    splitlable = []
    for i in range(len(dataset)):
        if dataset[i][lable_position] > best_t:
            DataSet = dataset[i][:lable_position]  # excludes the column at lable_position
            DataSet.extend(dataset[i][lable_position + 1:])
            splitDataSet.append(DataSet)
            splitlable.append(lable[i])
    return splitDataSet, splitlable


def Ent(pt, pf):  # information entropy; pt = positive count, pf = negative count
    if pt == 0 or pf == 0:
        return 0
    else:
        return (-pt / (pt + pf) * math.log(pt / (pt + pf), 2)
                - pf / (pt + pf) * math.log(pf / (pt + pf), 2))


def Gain(A, lable):  # information gain for one attribute, with continuous-value handling
    A = np.asarray(A)
    lable = np.asarray(lable)
    T = []
    t_star, max_gain = 0.0, 0.0  # best split midpoint, maximum information gain
    pt = np.shape(np.nonzero(lable))[1]  # positive count
    pf = np.shape(lable)[0] - pt         # negative count
    B = np.sort(A)  # sorted copy used to compute the midpoints (np.sort does not modify A)
    for i in range(np.shape(B)[0] - 1):
        T.append(float((B[i] + B[i + 1]) / 2))
    m = len(T)
    for i in range(m):
        # positives among samples with value <= T[i]; np.nonzero(A <= T[i])[0] gives their indices
        p0_t = np.shape(np.nonzero(lable[np.nonzero(A <= T[i])[0]]))[1]
        p0_f = np.shape(np.nonzero(lable[np.nonzero(A <= T[i])[0]] == 0))[1]
        p1_t = np.shape(np.nonzero(lable[np.nonzero(A > T[i])[0]]))[1]
        p1_f = np.shape(np.nonzero(lable[np.nonzero(A > T[i])[0]] == 0))[1]
        n0, n1 = p0_t + p0_f, p1_t + p1_f  # weight each branch by its actual share of the samples
        gain = Ent(pt, pf) - n0 / (pt + pf) * Ent(p0_t, p0_f) - n1 / (pt + pf) * Ent(p1_t, p1_f)
        if max_gain < gain:
            max_gain = gain
            t_star = T[i]
    return t_star, max_gain


def bestAttributes(dataSet, lable, lablename):  # pick the best attribute, its best split point and its column index
    dataSet = np.asarray(dataSet)
    lable = np.asarray(lable)
    best_gain, best_t, best_lable, lable_position = 0.0, 0.0, 'A', 0
    m = np.shape(dataSet)[1]
    if len(lablename) < m:  # not enough attribute names left to choose from; m is the actual attribute count
        return best_lable, best_t, lable_position
    for i in range(m):
        t, max_gain = Gain(dataSet[:, i], lable)
        if best_gain < max_gain:
            best_gain = max_gain
            best_t = t
            best_lable = lablename[i]
            lable_position = i
    return best_lable, best_t, int(lable_position)


def MostLable(lable):  # majority vote
    lable = np.asarray(lable)
    pt = np.count_nonzero(lable)  # number of samples labelled 1
    pf = np.shape(lable)[0] - pt
    if pt > pf:
        return 1
    else:
        return 0


def TreeGenerate(dataSet, lable, lablename):  # build the tree recursively
    if len(set(lable)) == 1:  # all samples share one class
        return lable[0]
    if len(dataSet[0]) == 0:  # every attribute has been used up, i.e. zero columns left
        return MostLable(lable)
    best_lable, best_t, lable_position = bestAttributes(dataSet, lable, lablename)  # best attribute, split point and index
    if best_t == 0:  # samples take identical values on the attribute; no useful split
        return MostLable(lable)
    tree = {best_lable: {}}
    del (lablename[lable_position])
    dataSet0, lable0 = splitdata0(dataSet, lable, lable_position, best_t)
    dataSet1, lable1 = splitdata1(dataSet, lable, lable_position, best_t)
    tree[best_lable]['<=' + str(round(best_t, 3))] = TreeGenerate(dataSet0, lable0, lablename)
    tree[best_lable]['>' + str(round(best_t, 3))] = TreeGenerate(dataSet1, lable1, lablename)
    return tree


def predict(tree, feat, data, T, leaf, y, inde):
    firstFeat = list(tree.keys())[0]   # attribute tested at the current node
    secondDict = tree[firstFeat]       # subtree under the current node
    featIndex = feat.index(firstFeat)  # position of firstFeat in feat, i.e. which attribute it is
    for key in secondDict.keys():      # the two keys are '<=...' and '>...'
        if data[featIndex] <= T[featIndex]:  # data's attributes are ordered A..H, so T must be ordered A..H too
            true_key = '<=' + str(T[featIndex])
            if true_key == key:  # string comparison against the branch label
                if type(secondDict[key]).__name__ == "dict":  # recurse while the child is a dict; a leaf is a number
                    classlable = predict(secondDict[key], feat, data, T, leaf, y, inde)
                else:
                    classlable = secondDict[key]
                    # leaf reached: count it under threshold string (+ branch digit) + true-label digit;
                    # the node at threshold 0.151 has leaves on both branches, so it carries an extra branch digit
                    if str(key[2:7]) == '0.151':
                        if y[inde] == 0:
                            leaf[str(key[2:7]) + '0' + '0'] += 1
                        else:
                            leaf[str(key[2:7]) + '0' + '1'] += 1
                    else:
                        if y[inde] == 0:
                            leaf[str(key[2:7]) + '0'] += 1
                        else:
                            leaf[str(key[2:7]) + '1'] += 1
        else:
            true_key = '>' + str(T[featIndex])
            if true_key == key:
                if type(secondDict[key]).__name__ == "dict":
                    classlable = predict(secondDict[key], feat, data, T, leaf, y, inde)
                else:
                    classlable = secondDict[key]
                    if str(key[1:6]) == '0.151':
                        if y[inde] == 0:
                            leaf[str(key[1:6]) + '1' + '0'] += 1
                        else:
                            leaf[str(key[1:6]) + '1' + '1'] += 1
                    else:
                        if y[inde] == 0:
                            leaf[str(key[1:6]) + '0'] += 1
                        else:
                            leaf[str(key[1:6]) + '1'] += 1
    return classlable


def getKey(x):  # sort key helper (currently unused)
    return float(x[0])


def roc_draw(leaf):
    mat = np.zeros((9, 2))  # one row per leaf: [positive count, negative count]
    i = j = 0
    sum1 = sum0 = 0
    for key, value in leaf.items():
        if key[-1] == '0':   # keys ending in '0' hold counts of true label 0
            mat[i][1] = value
            sum0 += value
            i += 1
        else:                # keys ending in '1' hold counts of true label 1
            mat[j][0] = value
            sum1 += value
            j += 1
    col_one = mat[:, 0]
    col_two = mat[:, 1]
    col_one = np.sort(col_one)
    col_two = np.sort(col_two)
    mat = np.vstack((col_one, col_two)).T
    fpr = [0]
    tpr = [0]
    s0 = s1 = 0
    s = 0  # AUC, accumulated with the trapezoid rule
    for i, j in mat:
        print(j, i)
        temp1 = s1
        s1 += i
        s0 += j
        s += ((temp1 + s1) / sum1) * (j / sum0) / 2
        fpr.append(s0 / sum0)
        tpr.append(s1 / sum1)
    plt.plot(fpr, tpr, color='red')
    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.grid(alpha=0.4)
    plt.show()
    return s


def con_mat(true_lable, pre_lable):
    tp, fp, fn, tn = 0, 0, 0, 0
    for i in range(len(true_lable)):
        if true_lable[i] == pre_lable[i]:
            if true_lable[i] == 1:
                tp += 1
            else:
                tn += 1
        else:
            if true_lable[i] == 1:
                fn += 1
            else:
                fp += 1
    print("confusion matrix:", tp, fp)
    print("                 ", fn, tn)
    print("accuracy:       ", (tp + tn) / (tp + tn + fn + fp))
    print("precisionScore: ", tp / (tp + fp))
    print("recallScore:    ", tp / (tp + fn))
    print("F1:             ", tp / (tp + (fn + fp) / 2))
    return


traindata, trainlable, lablename = loadDataSet('classification_train.txt')
tree = TreeGenerate(traindata, trainlable, lablename)
print(tree)

# validate on the test set
testdata, testlable, testlablename = loadDataSet('classification_test.txt')
T = [0.235, 0.617, 0.607, 0.165, 0.151, 0.446, 0.036, 0.008]  # split thresholds of the trained tree, ordered A..H
pre_lable = []
feat = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
leaf = {}  # per-leaf counters: threshold string (+ branch digit) + true-label digit
leaf['0.2350'] = leaf['0.6170'] = leaf['0.6070'] = leaf['0.1650'] = leaf['0.4460'] = leaf['0.0360'] = \
    leaf['0.0080'] = 0
leaf['0.2351'] = leaf['0.6171'] = leaf['0.6071'] = leaf['0.1651'] = leaf['0.4461'] = leaf['0.0361'] = \
    leaf['0.0081'] = 0
leaf['0.15100'] = leaf['0.15101'] = leaf['0.15110'] = leaf['0.15111'] = 0
inde = 0
for data in testdata:
    pre_lable.append(predict(tree, feat, data, T, leaf, testlable, inde))
    inde += 1
con_mat(testlable, pre_lable)
print("ROC AUC: ", roc_draw(leaf))