Implementing C4.5 in Python Without Machine-Learning Libraries

Basics

C4.5 (known as J48 in Weka) is a decision-tree algorithm. As the name suggests, a decision tree performs classification with a tree structure. The tree is built by repeatedly selecting the best attribute (feature) of the data set to split on, and this attribute-selection step is the core of any decision-tree algorithm.

I won't derive the attribute-selection criteria here; if you want to follow the code below, see Professor Zhou Zhihua's *Machine Learning*. One point is worth highlighting, though: C4.5 does not simply pick the attribute with the largest gain ratio. It first keeps only the attributes whose information gain is above average, and then, among those, picks the one with the highest gain ratio. Enough talk; let the code speak!
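For reference, these are the quantities the code below computes, in the book's notation (D is the sample set, a is an attribute with V possible values, and D^v is the subset of D taking value v on a):

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k$$

$$\mathrm{Gain}(D,a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

$$\mathrm{Gain\_ratio}(D,a) = \frac{\mathrm{Gain}(D,a)}{\mathrm{IV}(a)}, \qquad \mathrm{IV}(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|}\log_2\frac{|D^v|}{|D|}$$

where $p_k$ is the proportion of class $k$ in $D$.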

Code Implementation

First, read in the data (the imports below are used throughout):
import math
import numpy as np
import pandas as pd

#Read the data set from a CSV file
def ReadData(path):
    #pd.read_csv already returns a DataFrame, no extra wrapping needed
    return pd.read_csv(path)
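To sanity-check the functions as we go, I'll use a tiny hypothetical frame (not the watermelon data) with the class label in the last column, which is the layout the rest of the code assumes:

#hypothetical toy data; the label column 'good' must come last
toy = pd.DataFrame({'color': ['green', 'green', 'green', 'light'],
                    'sound': ['dull', 'dull', 'crisp', 'crisp'],
                    'good':  [1, 1, 0, 0]})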
Next, compute the entropy of the current sample set:
#Compute the entropy of the current sample set
def Entropy(frame):
    #number of samples
    num_frame = frame.shape[0]
    #dict mapping every class label to 0, e.g. {0: 0, 1: 0} here
    #(label is the global list of class labels defined in the main block)
    label_dict = dict.fromkeys(label,0)
    #count how many samples belong to each class
    for i in range(num_frame):
        label_dict[frame.iloc[i, -1]] += 1
    #accumulate the entropy
    ENT = 0.0
    for key in label_dict:
        pk = label_dict[key] / num_frame
        #treat 0 * log2(0) as 0 to avoid a math domain error
        if pk != 0:
            ENT -= pk * math.log(pk,2)
    return ENT
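A quick check on the toy frame (Entropy reads the class labels from the global label list, which the main block normally sets up):

label = [1, 0]
print(Entropy(toy))  #1.0: two balanced classes give -2 * 0.5 * log2(0.5)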
Compute the information gain of splitting on attribute a:
#Compute the information gain of partitioning the set by attribute a
def InfoGain(frame,a):
    gain = 0.0
    #collect the distinct values attribute a takes
    a_attr = list(frame[a].drop_duplicates())
    #split the samples on each value and weight each subset's entropy
    for i in a_attr:
        frame_a_i = frame.loc[frame[a] == i].reset_index(drop = True)
        #branch weight: the fraction of samples with a == i
        weight_i = frame_a_i.shape[0] / frame.shape[0]
        ent_i = Entropy(frame_a_i)
        gain += weight_i * ent_i
    return Entropy(frame) - gain
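On the toy frame, splitting on 'sound' separates the two classes perfectly, while 'color' leaves the 'green' branch mixed:

print(InfoGain(toy, 'sound'))  #1.0: both branches are pure
print(InfoGain(toy, 'color'))  #about 0.311: the 'green' branch is still mixed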
Compute the gain ratio of splitting on attribute a:
#Compute the gain ratio of attribute a
def GainRate(frame,a):
    #IV is the intrinsic value of a: the entropy of the split itself
    IV = 0.0
    a_attr = list(frame[a].drop_duplicates())
    for i in a_attr:
        frame_a_i = frame.loc[frame[a] == i].reset_index(drop = True)
        weight_i = frame_a_i.shape[0] / frame.shape[0]
        #treat 0 * log2(0) as 0
        if weight_i != 0:
            IV -= weight_i * math.log(weight_i,2)
    #a constant attribute has IV == 0 and carries no information
    if IV == 0:
        return 0.0
    return InfoGain(frame,a) / IV
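Continuing the toy example: IV('sound') = 1.0 (a 2/2 split) and IV('color') is about 0.811 (a 3/1 split), so:

print(GainRate(toy, 'sound'))  #1.0 / 1.0 = 1.0
print(GainRate(toy, 'color'))  #0.311 / 0.811, about 0.384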
Select the best attribute and return it:
#Select the best attribute following the C4.5 heuristic
def ChooseBestAttr(frame):
    #attribute names (every column except the label)
    attr = list(frame.columns.values[:frame.shape[1] - 1])
    #number of attributes
    num_attr = frame.shape[1] - 1
    #information gain of each attribute
    gain_dict = dict.fromkeys(attr,0)
    sum_gain = 0.0
    for i in attr:
        gain_dict[i] = InfoGain(frame,i)
        sum_gain += gain_dict[i]
    #average information gain over all attributes
    aver_gain = sum_gain / num_attr
    #keep the attributes whose gain is at least the average
    #(>= rather than > so that ties cannot empty the candidate set)
    gain_big_aver_dict = gain_dict.copy()
    for key,value in gain_dict.items():
        if value < aver_gain:
            gain_big_aver_dict.pop(key)
    #among the candidates, pick the one with the highest gain ratio
    gain_rate = gain_big_aver_dict.copy()
    for key,value in gain_big_aver_dict.items():
        gain_rate[key] = GainRate(frame,key)
    return max(gain_rate, key=gain_rate.get)
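On the toy frame the average information gain is (1.0 + 0.311) / 2, about 0.656; only 'sound' reaches it, so it is the only candidate left for the gain-ratio comparison:

print(ChooseBestAttr(toy))  #'sound'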

The next few functions handle the special cases of the recursion, e.g. whether all current samples already belong to one class. They are exactly the base cases of the algorithm as given in *Machine Learning*, so I'll keep the commentary brief.

Check whether every sample in the current set belongs to the same class:
#Check whether every sample in the current set belongs to the same class
def SameLable(frame):
    attr_current = list(frame.iloc[:,-1].drop_duplicates())
    #True iff only one distinct label remains
    return len(attr_current) == 1
Check whether the attribute set is empty:
#Check whether the attribute set is empty
def AttrSetIsNull(attr_list):
    return len(attr_list) == 0
Check whether all samples take the same values on the attribute set:
#Check whether every sample takes identical values on all attributes in attrs
def IsSameValue(frame,attrs):
    #dropping duplicate rows leaves exactly one row iff all rows agree
    return frame[attrs].drop_duplicates().shape[0] == 1
Find and return the class with the most samples in the data set:
#Return the majority class of the data set
def MostSample(frame):
    label_list = list(frame.iloc[:,-1])
    label_current = list(frame.iloc[:,-1].drop_duplicates())
    #count the occurrences of each class
    label_count_dict = dict.fromkeys(label_current,0)
    for i in label_current:
        label_count_dict[i] = label_list.count(i)
    #argmax over the counts
    return max(label_count_dict, key=label_count_dict.get)
Split off a subset:
#Return the subset of samples whose attribute a_attr equals value
def SplitData(frame,a_attr,value):
    frame_new = frame[frame[a_attr] == value]
    #re-index so positional access starts at 0 again
    return frame_new.reset_index(drop = True)
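A quick check of the splitting helper on the toy frame:

dull = SplitData(toy, 'sound', 'dull')
print(dull.shape[0])    #2: the two 'dull' rows, re-indexed from 0
print(SameLable(dull))  #True: both remaining samples are class 1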

Now we can build the decision tree as nested dictionaries, passing in the data set and the attribute list.

Build the decision tree:
def CraetTree(frame,attrs):
    #class labels of the current sample set
    label_list = list(frame.iloc[:,-1])
    #base case 1: all samples belong to one class -> leaf with that class
    if SameLable(frame):
        return label_list[0]
    #base case 2: no attributes left, or all samples identical on the
    #remaining attributes -> leaf with the majority class
    if AttrSetIsNull(attrs) or IsSameValue(frame,attrs):
        return MostSample(frame)
    #choose the best attribute and create an internal node for it
    best_atrr = ChooseBestAttr(frame)
    best_attr_set = set(frame[best_atrr])
    my_tree = {best_atrr:{}}
    for value in best_attr_set:
        Dv = SplitData(frame,best_atrr,value)
        if Dv.shape[0] == 0:
            #empty branch: use the parent's majority class as the leaf
            my_tree[best_atrr][value] = MostSample(frame)
        else:
            #drop the used attribute from the subset and the attribute list
            left_attrs = [x for x in attrs if x != best_atrr]
            my_tree[best_atrr][value] = CraetTree(Dv.drop(best_atrr,axis = 1),left_attrs)
    return my_tree
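On the toy frame the recursion bottoms out after a single split, since 'sound' already separates the classes:

print(CraetTree(toy, ['color', 'sound']))
#{'sound': {'dull': 1, 'crisp': 0}}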
Classify a new sample with the decision tree:
#Classify a new sample by walking down the tree
def Desicion(tree,attrs,test_data):
    root_node = list(tree.keys())[0]
    second_node = tree[root_node]
    #position of the root attribute in the attribute list
    node_index = attrs.index(root_node)
    #default to None in case the sample's value was never seen in training
    label_desicion = None
    for key in second_node.keys():
        if test_data[node_index] == key:
            if type(second_node[key]).__name__ == 'dict':
                #internal node: keep walking down
                label_desicion = Desicion(second_node[key],attrs,test_data)
            else:
                #leaf node: this is the predicted class
                label_desicion = second_node[key]
    return label_desicion
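Classifying a hypothetical new sample against the toy tree (the attribute list fixes the column order of the test vector):

toy_tree = CraetTree(toy, ['color', 'sound'])
print(Desicion(toy_tree, ['color', 'sound'], ['green', 'crisp']))  #0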
The main block:
if __name__ == '__main__':
    path = 'D:\\文档\\暑期培训\\04--有监督学习\\数据\\watermelon2.0.csv'
    data = ReadData(path)

    #attribute names (all columns except the label)
    attr = list(data.columns.values[:data.shape[1] - 1])
    #class labels, here [1, 0]; Entropy reads this global
    label = list(data.iloc[:,-1].drop_duplicates())
    #build the tree once and reuse it for every prediction
    tree = CraetTree(data,attr)
    print('The generated tree as a nested dict:',tree)
    #re-classify the training samples: drop the label column first
    test_data = np.array(data.drop(data.columns.values[-1],axis = 1))
    label_desicion_list = []
    for i in test_data:
        label_desicion_list.append(Desicion(tree,attr,list(i)))
    print('Classification results:',label_desicion_list)

That's all of the code. If you'd like to draw the resulting decision tree, you can use the code below:

import matplotlib.pyplot as plt

# box styles for internal/leaf nodes and the arrow style for the edges
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

# draw one annotated node with an arrow pointing from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', \
                            xytext=centerPt, textcoords='axes fraction', \
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

# count the leaves, used to spread the nodes horizontally
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

# measure the depth of the tree, used to space the levels vertically
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = getTreeDepth(secondDict[key]) + 1
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
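A quick check of the two sizing helpers on the toy tree from earlier:

toy_tree = {'sound': {'dull': 1, 'crisp': 0}}
print(getNumLeafs(toy_tree))   #2: two leaf nodes
print(getTreeDepth(toy_tree))  #1: a single split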

# write the splitting value on the edge between parent and child
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

# recursively draw a subtree, tracking the current x/y drawing offsets
def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalw, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalw
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

# set up the axes and kick off the recursive drawing
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalw = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalw
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

Finally, let's test it and see the result.

Add the following line to the main block (the plot. prefix assumes the plotting code above is saved as a separate module named plot.py):

plot.createPlot(CraetTree(data,attr))

This produces the following decision tree:

[Figure: the C4.5 decision tree generated from the watermelon 2.0 data set]

As for testing, you can strip the label column from the original data set, feed the samples back in for classification, and compare the predicted labels against the true ones.

Finally, the data set used in this post is the watermelon data set 2.0 from *Machine Learning*; it can be found online.
