6人阅读 评论(0)

# Decision Tree(ID3) using pandas with Python

## implementation of ID3

# 定义一个决定树

def TreeGenerate(df):
    """
    Recursively build an ID3 decision tree from a data set.

    The last column of ``df`` is treated as the class label.  Which columns
    are candidates for splitting is decided by OptArr().

    @param df : the pandas dataframe of the dataset
    @return root : the root node of decision tree
    """
    newNode = Node(None, None, {})
    # Node is defined outside this section.  This function historically wrote
    # to ``attrDown`` while Predict()/TreeToGraph() read ``attr_down``; accept
    # whichever attribute the Node class actually provides.
    children = newNode.attrDown if hasattr(newNode, "attrDown") else newNode.attr_down

    labelArr = df[df.columns[-1]]
    labelCount = NodeLabel(labelArr)
    if labelCount:
        # The majority class is the prediction if the walk stops at this node.
        newNode.label = max(labelCount, key=labelCount.get)

    # Stop when the node is pure (one class) or empty (no class at all).
    if len(labelCount) <= 1:
        return newNode

    # OptArr() only looks at df.columns[1:-1]; with nothing left to split on
    # this node stays a leaf carrying the majority label.
    if len(df.columns) <= 2:
        return newNode

    # Get the optimal attribute for a new branching.
    newNode.attr, divValue = OptArr(df)
    if newNode.attr is None:
        # No attribute improves purity: keep the node as a leaf.
        return newNode

    # NOTE(review): divValue == 0 is the "categorical" marker returned by
    # InfoGain(), so a continuous split at exactly 0 cannot be represented.
    if divValue == 0:
        # Categorical variable: one child per observed value.  The attribute
        # column is removed so it is not reused further down this path.
        for value in ValueCount(df[newNode.attr]):
            dfV = df[df[newNode.attr].isin([value])]
            dfV = dfV.drop(newNode.attr, axis=1)
            children[value] = TreeGenerate(dfV)
    else:
        # Continuous variable: binary split at divValue.  The column is kept
        # because it may be split again at another threshold deeper down.
        valueL = "<=%.3f" % divValue
        valueR = ">%.3f" % divValue
        dfVL = df[df[newNode.attr] <= divValue]
        dfVR = df[df[newNode.attr] > divValue]

        children[valueL] = TreeGenerate(dfVL)
        children[valueR] = TreeGenerate(dfVR)
    return newNode


# NodeLabel()函数

# count how many times each label appears
# 分类标签计数函数

def NodeLabel(labelArr):
    """
    Count how many times each class label occurs.

    @param labelArr: data array for class labels (any iterable of hashables)
    @return labelCount: dict mapping each label that appears to its count,
                        in order of first appearance
    """
    labelCount = {}
    for label in labelArr:
        labelCount[label] = labelCount.get(label, 0) + 1
    return labelCount
# OptArr()函数

# find the optimal attributes of current dataSet,找到数据集中可选属性
# 找到最大的信息增益的属性用来分支树的

def OptArr(df):
    """
    Pick the attribute with the largest information gain for the next split.

    Only ``df.columns[1:-1]`` are considered: the first column is skipped and
    the last column is the class label.

    @param df: pandas dataframe of the dataSet
    @return optArr: the optimal attribute for branching, or None when no
                    candidate attribute has a positive information gain
    @return divValue: 0 for a discrete variable; for a continuous variable
                      the bisection threshold t
    """
    # Defaults make the "nothing worth splitting on" case explicit instead of
    # failing with UnboundLocalError at the return statement.
    optArr = None
    divValue = 0
    infoGain = 0
    for attrId in df.columns[1:-1]:
        infoGainTmp, divValueTmp = InfoGain(df, attrId)
        if infoGainTmp > infoGain:
            infoGain = infoGainTmp
            optArr = attrId
            divValue = divValueTmp
    return optArr, divValue

# infoGain()函数
# 用来计算每个属性的信息增益

def InfoGain(df, index):
    """
    Compute the information gain of splitting ``df`` on one attribute.

    The last column of ``df`` is the class label.  Integer and float columns
    are treated as continuous and split by bisection; every other column is
    treated as discrete.

    @param df : the pandas dataframe of the dataset
    @param index : the attribute (column) ID
    @return : (infoGain, divValue); divValue is 0 for a discrete attribute
              and the best bisection threshold for a continuous one
    """
    labelArr = df[df.columns[-1]]
    dataArr = df[index]
    n = len(dataArr)

    # Start from the entropy of the whole label column, then subtract the
    # weighted entropy that remains after the split.
    infoGain = InfoEnt(labelArr)
    divValue = 0

    # dtype.kind: 'i' signed int, 'u' unsigned int, 'f' float.
    if dataArr.dtype.kind in "iuf":
        # Continuous attribute: try the midpoint between each pair of adjacent
        # distinct values.  The rows are partitioned with the same `<=` test
        # that TreeGenerate() uses, so ties cannot land on the wrong side.
        uniq = sorted(dataArr.dropna().unique())
        subInfoEnt = {}
        for low, high in zip(uniq, uniq[1:]):
            div = (low + high) / 2
            leftMask = dataArr <= div
            left = labelArr[leftMask]
            right = labelArr[~leftMask]
            subInfoEnt[div] = (len(left) * InfoEnt(left)
                               + len(right) * InfoEnt(right)) / n
        if subInfoEnt:
            # Smallest remaining entropy == largest gain.
            divValue, subInfoEntMin = min(subInfoEnt.items(),
                                          key=lambda item: item[1])
            infoGain -= subInfoEntMin
        else:
            # Fewer than two distinct values: no split is possible.
            infoGain = 0
    else:
        # Discrete attribute: weight each value's label entropy by its share.
        valueCount = ValueCount(dataArr)
        for key in valueCount:
            keyLabelArr = labelArr[dataArr == key]
            infoGain -= valueCount[key] * InfoEnt(keyLabelArr) / n
    return infoGain, divValue

# ValueCount()函数
# 计算每个特征中的属性值

def ValueCount(labelArr):
    """
    Count how many times each attribute value occurs.

    @param labelArr: the data array of one attribute (any iterable of hashables)
    @return valueCount: dict mapping each value that appears to its count,
                        in order of first appearance
    """
    valueCount = {}
    for label in labelArr:
        valueCount[label] = valueCount.get(label, 0) + 1
    return valueCount
# infoEnt()函数
# 用来计算属性的信息熵

def InfoEnt(labelArr):
    """
    Compute the information (Shannon) entropy, in bits, of a label array.

    @param labelArr: data array of class labels (sized iterable of hashables)
    @return ent: the class information entropy; 0 for an empty array
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from math import log2

    ent = 0
    n = len(labelArr)
    # An empty array yields no counts, so the loop body (and the division
    # by n) is never reached and 0 is returned.
    for count in Counter(labelArr).values():
        p = count / n
        ent -= p * log2(p)
    return ent
# Predict()　function
# make a prediction for one sample, starting from the root

def Predict(root, df_sample):
    """
    Predict the class label of one sample by walking the tree from ``root``.

    @param root : the root node of the decision tree
    @param df_sample : a one-row pandas dataframe holding the sample
    @return : the label of the node where the walk stops; this is an inner
              node's label when the sample's value has no matching branch
    """
    while root.attr is not None:
        # Node is defined outside this section and the child mapping is
        # spelled both ``attr_down`` and ``attrDown`` in this file; accept either.
        children = root.attr_down if hasattr(root, "attr_down") else root.attrDown

        column = df_sample[root.attr]
        value = column.values[0]

        # Default: categorical lookup by the raw attribute value.
        key = value
        if column.dtype.kind in "iuf":  # int / unsigned int / float column
            # Continuous split: the branches are keyed "<=%.3f" and ">%.3f".
            # Take the threshold text straight from the "<=" key so negative
            # thresholds work and the ">" key is rebuilt character-for-character.
            # The threshold is only as precise as the 3 decimals in the key.
            leKey = next((k for k in children if str(k).startswith("<=")), None)
            if leKey is not None:
                threshold = leKey[2:]
                key = leKey if value <= float(threshold) else ">" + threshold

        # Value never seen in training at this node: stop here.
        if key not in children:
            break
        root = children[key]
    return root.label


# DrawPng()　functions
# visualize the tree using graphviz

def DrawPng(root, out_file):
    """
    Render the decision tree to a PNG image with pydotplus/graphviz.

    @param root : the tree root node
    @param out_file: the output file name / path of the PNG
    """
    # Imported here so the rest of the module works without pydotplus;
    # a missing package surfaces as a plain ImportError at call time.
    from pydotplus import graphviz

    g = graphviz.Dot()  # new, empty dot graph

    TreeToGraph(0, g, root)
    # Round-trip through the dot text, as the original code does, and write
    # the parsed graph out as PNG.
    g2 = graphviz.graph_from_dot_data(g.to_string())
    g2.write_png(out_file)

# TreeToGraph()
# build a graph starting from the root

def TreeToGraph(i, g, root):
    """
    Add the subtree under ``root`` to the graph ``g``, numbering nodes from ``i``.

    ``g`` is modified in place.

    @param i: node number to give to ``root``
    @param g: pydotplus.graphviz.Dot() object
    @param root : the root node of the (sub)tree

    @return i: the highest node number used in this subtree
    @return g_node: the graph node id of ``root``
    """
    from pydotplus import graphviz

    if root.attr is None:
        # Leaf: show only the predicted label.
        g_node_label = 'Node:%d\n 好瓜:%s' % (i, root.label)
    else:
        # Inner node: also show the attribute it splits on.
        g_node_label = "Node:%d\n 好瓜:%s\n 属性:%s" % (i, root.label, root.attr)
    g_node = i
    g.add_node(graphviz.Node(g_node, label=g_node_label))

    # Node is defined outside this section and the child mapping is spelled
    # both ``attr_down`` and ``attrDown`` in this file; accept either.
    children = root.attr_down if hasattr(root, "attr_down") else root.attrDown
    for value in list(children):
        # The child takes the next free number; the recursion returns the last
        # number it consumed so siblings never collide.
        i, g_child = TreeToGraph(i + 1, g, children[value])
        g.add_edge(graphviz.Edge(g_node, g_child, label=value))
    return i, g_node
# 上面的树的基础已经完成，利用上面的函数来进行数据的处理

# NOTE(review): `df` is not defined in this section; it is assumed to be the
# data set loaded earlier in the file - confirm.
root = TreeGenerate(df)

# Accuracy estimation
from random import sample


def _Accuracy(tree, df_eval):
    """Return the fraction of rows of df_eval whose last-column label the tree predicts."""
    labelCol = df_eval.columns[-1]
    pred_true = 0
    for pos in range(len(df_eval.index)):
        # A one-row frame selected by position, so it works for any index.
        row = df_eval.iloc[[pos]]
        if Predict(tree, row) == row[labelCol].values[0]:
            pred_true += 1
    return pred_true / len(df_eval.index)


# Repeated random hold-out: 10 rounds, half of the rows for training.
# Kept in its own list so it cannot be mistaken for the k-fold results
# that are printed further down.
holdout_scores = []
for _ in range(10):
    train = sample(range(len(df.index)), len(df.index) // 2)

    df_train = df.iloc[train]            # select rows by position
    df_test = df.drop(df.index[train])   # `train` holds positions, drop() wants labels
    # generate tree
    root = TreeGenerate(df_train)
    # test the accuracy
    holdout_scores.append(_Accuracy(root, df_test))

# K-fold cross validation, a model evaluation method.
# Fold f tests on rows [f*m, f*m+m); the n % k trailing rows are never tested.
accuracy_scores = []
n = len(df.index)
k = 5
m = n // k
for fold in range(k):
    test = list(range(fold * m, fold * m + m))
    df_train = df.drop(df.index[test])
    df_test = df.iloc[test]
    root = TreeGenerate(df_train)  # generate the tree

    # test the accuracy
    accuracy_scores.append(_Accuracy(root, df_test))

# print the prediction accuracy result
accuracy_sum = 0
print("accuracy:", end="")
for score in accuracy_scores:
    print("%.3f " % score, end="")
    accuracy_sum += score
print("\n average accuracy {}".format(accuracy_sum / k))

# Decision tree visualization using pydotplus.graphviz
root = TreeGenerate(df)
DrawPng(root, "decision_tree_ID3.png")

个人资料
等级：
访问量： 19
积分： 85
排名： 279万+
文章存档