DecisionTree+using+Pandas+in+Python

标签: pandas
6人阅读 评论(0) 收藏 举报
分类:

Decision Tree(ID3) using pandas with Python

implementation of ID3

# 定义一个决定树

def TreeGenerate(df):
    """
    Recursively build an ID3 decision tree from a dataset.

    @param df : the pandas dataframe of the dataset; the last column is
                read as the class label
    @return root : the root node of decision tree
    """
    newNode = Node(None, None, {})
    labelArr = df[df.columns[-1]]
    labelCount = NodeLabel(labelArr)
    if not labelCount:
        # Empty dataset: nothing to label and nothing to split on.
        return newNode

    # The majority class of the samples reaching this node is its prediction.
    newNode.label = max(labelCount, key=labelCount.get)
    if len(labelCount) == 1:
        # Pure node: only one class left, so this is a leaf.
        return newNode

    # Get the optimal attribute for a new branching.
    newNode.attr, divValue = OptArr(df)
    if newNode.attr is None:
        # No attribute can split the data any further: keep this as a leaf.
        return newNode

    # Children are stored in `attr_down`, the attribute that Predict() and
    # TreeToGraph() read.
    # NOTE(review): the Node class is not visible here - confirm that its
    # third constructor argument is exposed as `attr_down`.
    if divValue == 0:
        # Categorical variable: one child per observed value.
        # NOTE(review): 0 is the "discrete" marker returned by OptArr(), so a
        # continuous split at exactly 0 is indistinguishable from it.
        for value in ValueCount(df[newNode.attr]):
            dfV = df[df[newNode.attr].isin([value])]  # get sub set
            # Delete the current attribute so it is not reused further down.
            dfV = dfV.drop(newNode.attr, axis=1)
            newNode.attr_down[value] = TreeGenerate(dfV)
    else:
        # Continuous variable: binary split into a left and a right child.
        valueL = "<=%.3f" % divValue
        valueR = ">%.3f" % divValue
        dfVL = df[df[newNode.attr] <= divValue]
        dfVR = df[df[newNode.attr] > divValue]

        newNode.attr_down[valueL] = TreeGenerate(dfVL)
        newNode.attr_down[valueR] = TreeGenerate(dfVR)
    return newNode

# NodeLabel()函数

# calculate the labels that appear and their counts
# 分类标签计数函数

def NodeLabel(labelArr):
    """
    Tally how many times each class label occurs.

    @param labelArr: iterable of class labels
    @return labelCount: dict mapping each label that appears to its count,
                        in order of first appearance
    """
    tally = {}
    for item in labelArr:
        # A label seen for the first time starts from an implicit count of 0.
        tally[item] = tally.get(item, 0) + 1
    return tally
# OptArr()函数

# find the optimal attributes of current dataSet,找到数据集中可选属性
# 找到最大的信息增益的属性用来分支树的

def OptArr(df):
    """
    Pick the attribute with the largest information gain for branching.

    @param df: pandas dataframe of the dataSet
    @return optArr: the optimal attribute for the branch, or None when no
                    candidate attribute has a positive information gain
    @return divValue: for discrete variable value=0
                      for continuous variable value=t, the bisection value
    """
    infoGain = 0
    # Defaults for "nothing worth splitting on"; without them the return
    # statement fails on unbound names when the loop never improves the gain.
    optArr = None
    divValue = 0
    # Candidates are every column except the first and the last (label).
    # NOTE(review): the first column is presumably a sample id - confirm.
    for attrId in df.columns[1:-1]:
        infoGainTmp, divValueTmp = InfoGain(df, attrId)
        if infoGainTmp > infoGain:
            infoGain = infoGainTmp
            optArr = attrId
            divValue = divValueTmp
    return optArr, divValue
# infoGain()函数
# 用来计算每个属性的信息增益

def InfoGain(df, index):
    """
    Compute the information gain of splitting the dataset on one attribute.

    @param df : the pandas dataframe; the last column is the class label
    @param index : the attribute ID (column name)
    @return : infoGain and divValue; divValue is 0 for a discrete attribute
              and the chosen bisection threshold for a continuous one
    """
    labelCol = df.columns[-1]
    # Start from the entropy of the whole label column and subtract the
    # weighted entropy that remains after the split.
    infoGain = InfoEnt(df[labelCol])
    divValue = 0
    n = len(df[index])

    # Float / signed / unsigned integer columns are treated as continuous.
    if df[index].dtype.kind in "fiu":
        # Continuous attribute: discretise by bi-partition over the sorted
        # values (see the "watermelon book", section 4.4).
        df = df.sort_values([index], ascending=True)
        df = df.reset_index(drop=True)
        dataArr = df[index].values
        labelArr = df[labelCol].values

        subInfoEnt = {}  # candidate threshold -> weighted entropy after split
        for i in range(n - 1):
            if dataArr[i] == dataArr[i + 1]:
                # No threshold can separate two equal values.
                continue
            div = (dataArr[i] + dataArr[i + 1]) / 2
            subInfoEnt[div] = (
                ((i + 1) * InfoEnt(labelArr[:i + 1]) / n)
                + ((n - i - 1) * InfoEnt(labelArr[i + 1:]) / n)
            )
        if not subInfoEnt:
            # Fewer than two distinct values: the attribute cannot split.
            return 0, divValue

        # key=... makes min() compare the entropies (second tuple element),
        # so the threshold with the lowest remaining entropy is selected.
        divValue, subInfoEntMin = min(subInfoEnt.items(),
                                      key=lambda item: item[1])
        infoGain -= subInfoEntMin
    else:
        # Discrete attribute: one partition per observed value.
        dataArr = df[index]
        labelArr = df[labelCol]
        valueCount = ValueCount(dataArr)
        for key in valueCount:
            keyLabelArr = labelArr[dataArr == key]
            infoGain -= valueCount[key] * InfoEnt(keyLabelArr) / n

    return infoGain, divValue
# ValueCount()函数
# 计算每个特征中的属性值

def ValueCount(labelArr):
    """
    Count the occurrences of every distinct value of an attribute.

    @param labelArr: iterable holding the values of one attribute
    @return valueCount: dict mapping each value that appears to its count,
                        in order of first appearance
    """
    occurrences = {}
    for entry in labelArr:
        try:
            occurrences[entry] += 1
        except KeyError:
            # First time this value is seen.
            occurrences[entry] = 1
    return occurrences
# infoEnt()函数
# 用来计算属性的信息熵

def InfoEnt(labelArr):
    """
    Compute the information entropy (base 2) of a collection of labels.

    @param labelArr: data array of class label
    @return ent: the class information entropy; 0 for an empty array
    """
    # Imported directly: catching ImportError and only printing would just
    # defer the failure to a NameError on the first log2() call.
    from math import log2

    ent = 0
    n = len(labelArr)
    labelCount = NodeLabel(labelArr)
    for count in labelCount.values():
        prob = count / n  # relative frequency of this label
        ent -= prob * log2(prob)
    return ent
# Predict() function
# make a prediction by walking the tree from root

def Predict(root, df_sample):
    """
    Predict the class label of one sample by walking the tree from root.

    @param root : the tree root node (reads .attr, .label and .attr_down)
    @param df_sample : dataframe whose first row is the sample to classify
    @return : the label of the node where the walk stops; for a categorical
              value with no matching branch this is the label of the last
              node reached
    """
    while root.attr is not None:
        column = df_sample[root.attr]
        value = column.values[0]
        # Float / signed / unsigned integer columns are treated as continuous.
        if column.dtype.kind in "fiu":
            # The two children are keyed "<=t" and ">t"; recover t from the
            # first key by stripping the comparison prefix (this keeps the
            # sign of a negative threshold).
            first_key = next(iter(root.attr_down))
            div_value = float(first_key.lstrip("<=>"))
            if value <= div_value:
                key = "<=%.3f" % div_value
            else:
                key = ">%.3f" % div_value
            root = root.attr_down[key]
        # categoric variable
        else:
            # Stop at the current node when the value has no child branch.
            if value not in root.attr_down:
                break
            root = root.attr_down[value]
    return root.label

# DrawPng() functions
# visualize the tree using graphviz

def DrawPng(root, out_file):
    """
    Render the decision tree to a PNG file using pydotplus.graphviz.

    @param root : the tree root node
    @param out_file: the output name&file path of file
    @raise ImportError: when pydotplus is not installed
    """
    try:
        from pydotplus import graphviz
    except ImportError as err:
        # Fail here with a clear message instead of printing and then
        # hitting a NameError on `graphviz` below.
        raise ImportError("module pydotplus.graphviz not found") from err

    g = graphviz.Dot()  # generation of new dot

    # Fill the graph with one node per tree node, numbered from 0.
    TreeToGraph(0, g, root)
    # The graph is serialised to DOT text and parsed back before rendering.
    # NOTE(review): confirm whether this round trip is required.
    g2 = graphviz.graph_from_dot_data(g.to_string())
    g2.write_png(out_file)
# TreeToGraph()
# build a graph starting from root

def TreeToGraph(i, g, root):
    """
    Add the subtree rooted at `root` to the graph, numbering nodes from i.

    @param i: node number to give to `root`
    @param g: pydotplus.graphviz.Dot() object, modified in place
    @param root : the root node of the subtree

    @return i: the highest node number used in this subtree
    @return g_node: the graph node id assigned to `root`
    @raise ImportError: when pydotplus is not installed
    """
    try:
        from pydotplus import graphviz
    except ImportError as err:
        # Fail here with a clear message instead of printing and then
        # hitting a NameError on `graphviz` below.
        raise ImportError("module pydotplus.graphviz not found") from err

    if root.attr is None:
        # Leaf: node number and label only.
        g_node_label = 'Node:%d\n 好瓜:%s' % (i, root.label)
    else:
        # Internal node: also show the splitting attribute ("属性").
        g_node_label = "Node:%d\n 好瓜:%s\n 属性:%s" % (i, root.label, root.attr)
    g_node = i
    g.add_node(graphviz.Node(g_node, label=g_node_label))
    for value in list(root.attr_down):
        # The child takes the next free number; the recursion returns the
        # last number it consumed so that siblings never collide.
        i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
        g.add_edge(graphviz.Edge(g_node, g_child, label=value))
    return i, g_node
# 上面的树的基础已经完成,利用上面的函数来进行数据的处理
# NOTE(review): `df` is not defined in the visible code; it is assumed to be
# a pandas DataFrame loaded earlier, with the class label in its last column.
root = TreeGenerate(df)

# Accuracy evaluation
from random import sample


def _accuracy(root, df_test):
    """Return the fraction of rows in df_test that the tree labels correctly."""
    total = len(df_test.index)
    if total == 0:
        # An empty test set has no meaningful accuracy; avoid dividing by 0.
        return 0.0
    labelCol = df_test.columns[-1]
    pred_true = 0
    for pos in range(total):
        row = df_test.iloc[[pos]]  # one-row dataframe, column dtypes kept
        if Predict(root, row) == row[labelCol].values[0]:
            pred_true += 1
    return pred_true / total


def _report(scores):
    """Print every accuracy score followed by their average."""
    print("accuracy:", end="")
    for score in scores:
        print("%.3f " % score, end="")
    if scores:
        print("\n average accuracy {}".format(sum(scores) / len(scores)))


# Repeated random hold-out: train on a random half, test on the other half.
holdout_scores = []
for _ in range(10):
    train = sample(range(len(df.index)), int(1 * len(df.index) / 2))

    df_train = df.iloc[train]  # select rows by position
    # `train` holds positions, so translate them to index labels before
    # dropping; this also works when the index is not 0..n-1.
    df_test = df.drop(df.index[train])
    # generate tree
    root = TreeGenerate(df_train)
    # test the accuracy
    holdout_scores.append(_accuracy(root, df_test))
_report(holdout_scores)

# K-folds cross prediction (k-fold cross validation, a model evaluation method)
n = len(df.index)
k = 5
m = int(n / k)  # fold size; the last n % k rows are never used for testing
# Kept separate from the hold-out scores so the report below really shows
# the k fold results.
accuracy_scores = []
for fold in range(k):
    test = list(range(fold * m, fold * m + m))  # positions of this fold
    df_train = df.drop(df.index[test])
    df_test = df.iloc[test]
    root = TreeGenerate(df_train)  # generate the tree

    # test the accuracy
    accuracy_scores.append(_accuracy(root, df_test))

# print the prediction accuracy result
_report(accuracy_scores)

# visualization
# decision tree visualization using pydotplus.graphviz
root = TreeGenerate(df)
DrawPng(root, "decision_tree_ID3.png")
查看评论
    个人资料
    等级:
    访问量: 19
    积分: 85
    排名: 279万+
    文章存档