import numpy as np
from collections import Counter
###################################
def entropy(D):
    """Return the Shannon entropy H(D) of a label sequence, in bits.

    D : iterable of hashable labels (e.g. a list or 1-D array of class ids).
    Returns a non-negative float; 0.0 when all labels are identical.
    """
    # list(...) is needed so np.array builds a 1-D numeric array on
    # Python 3 as well (Counter.values() is a dict view there, and
    # np.array(view) would produce a useless 0-d object array).
    counts = np.array(list(Counter(D).values()), dtype=float)
    P = counts / counts.sum()          # empirical probability of each label
    return float(np.dot(-P, np.log2(P)))
def condition_entropy(D, A):
    """Return the conditional entropy H(D|A), in bits.

    D : labels (e.g. class column y).
    A : feature values, same length as D; D is partitioned by the
        distinct values of A.
    Returns sum over values a of P(A=a) * H(D | A=a).
    """
    A = np.asarray(A)
    D = np.asarray(D)
    H_da = 0.0
    for value in np.unique(A):
        # Boolean-mask indexing replaces the original
        # argwhere/ravel round-trip; it selects the same rows.
        Di = D[A == value]
        pi = float(Di.size) / D.size   # P(A == value)
        H_da += pi * entropy(Di)
    return H_da
###################################
# Toy dataset (the loan-approval example from Li Hang's "Statistical
# Learning Methods"): four features x1..x4 and class label y.
x1 = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
x2 = [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0]
x3 = [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]
x4 = [0, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 1, 2, 0]
y  = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
X = np.c_[x1, x2, x3, x4]  # features stacked as columns
y = np.array(y)

Hy = entropy(y)
Hyx1 = condition_entropy(y, x1)
Hyx2 = condition_entropy(y, x2)
Hyx3 = condition_entropy(y, x3)
Hyx4 = condition_entropy(y, x4)
# Information gain g(y, xi) = H(y) - H(y|xi); the feature with the
# largest gain is the best split for an ID3 decision tree.
g_yx1 = Hy - Hyx1
g_yx2 = Hy - Hyx2
g_yx3 = Hy - Hyx3
g_yx4 = Hy - Hyx4
# print(x) with a single argument prints identically on Python 2 and 3,
# unlike the original Python-2-only "print x" statements.
print(Hy)     # entropy H(y)
print(Hyx1)   # conditional entropy H(y|x1)
print(Hyx2)
print(Hyx3)
print(Hyx4)
print(g_yx1)  # information gain g(y, x1)
print(g_yx2)
print(g_yx3)
print(g_yx4)
###############################################
yuanzhen@yuanzhen-ThinkPad-X121e:~/P_script$ python mydecisiontree.py
0.970950594455
0.887943094599
0.647300396303
0.550977500433
0.607961031918
0.0830074998558
0.323650198152
0.419973094022
0.362989562537
#################################
The computation can also be done with np.apply_along_axis, as follows:
H_X=np.apply_along_axis(condition_entropy,0,X,y)
print H_X
g_Xy=Hy-H_X
print g_Xy
print g_Xy.argmax()
###############################
[ 0.88794309 0.6473004 0.5509775 0.60796103]
[ 0.0830075 0.3236502 0.41997309 0.36298956]
2
Note, however, that to use np.apply_along_axis here the function signature must be changed from condition_entropy(D, A) to condition_entropy(A, D), because apply_along_axis passes each column of X as the FIRST argument and y as the extra argument.
Information gain ratio: a feature's information gain divided by that feature's own entropy.
HX=np.apply_along_axis(entropy,0,X)
Information gain ratio: gr_Xy = g_Xy / HX