Generating a binary decision tree with hand-written Python functions and testing its accuracy (iris dataset)

Input: iris.data
Output: a binary decision tree, plus the accuracy of a held-out test set on that tree. The Gini index, Gini = 1 - sum_i(p_i^2) over the class proportions p_i, is used as the impurity measure; a pure node scores 0.
The full code follows.

import numpy as np
from sklearn.datasets import load_iris

iris=load_iris()

# Hold out samples 70-99 as the test set; the rest is the training set
removed = list(range(70, 100))
new_target = np.delete(iris.target, removed)
new_data = np.delete(iris.data, removed, axis=0)
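
# A quick sanity check on the split (illustrative, not part of the original listing):
# 120 training rows remain, and test rows 70-99 all belong to class 1
print(new_data.shape)                   # (120, 4)
print(np.unique(iris.target[removed]))  # [1]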

# Entropy calculation
def calc_entropy(p):
    # Contribution -p*log2(p) of a single class proportion p
    if p != 0:
        return -p * np.log2(p)
    else:
        return 0
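
# A minimal sanity check (illustrative values, not from the original post):
# a 50/50 split carries 1 bit of entropy in total, a pure node carries none
print(calc_entropy(0.5) + calc_entropy(0.5))  # 1.0
print(calc_entropy(1.0))                      # -0.0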

# Split-impurity calculation
def calc_info_gain(data,classes,feature):

    # Returns the weighted entropy and the weighted Gini impurity of the data
    # after splitting on `feature`; make_tree turns these into gains by
    # subtracting them from the parent node's impurity
    gain = 0
    ggain = 0
    nData = len(data)

    # List the values that feature can take

    values = []
    for datapoint in data:
        if datapoint[feature] not in values:
            values.append(datapoint[feature])

    featureCounts = np.zeros(len(values))
    entropy = np.zeros(len(values))
    gini = np.zeros(len(values))
    valueIndex = 0
    # Find where those values appear in data[feature] and the corresponding class
    for value in values:
        dataIndex = 0
        newClasses = []
        for datapoint in data:
            if datapoint[feature]==value:
                featureCounts[valueIndex]+=1
                newClasses.append(classes[dataIndex])
            dataIndex += 1

        # Get the values in newClasses
        classValues = []
        for aclass in newClasses:
            if classValues.count(aclass)==0:
                classValues.append(aclass)

        classCounts = np.zeros(len(classValues))
        classIndex = 0
        for classValue in classValues:
            for aclass in newClasses:
                if aclass == classValue:
                    classCounts[classIndex]+=1 
            classIndex += 1
        
        for classIndex in range(len(classValues)):
            entropy[valueIndex] += calc_entropy(float(classCounts[classIndex])/np.sum(classCounts))
            gini[valueIndex] += (float(classCounts[classIndex])/np.sum(classCounts))**2

        # Accumulate the entropy and the sum-of-squared-proportions term,
        # weighted by how often this value occurs
        gain = gain + float(featureCounts[valueIndex])/nData * entropy[valueIndex]
        ggain = ggain + float(featureCounts[valueIndex])/nData * gini[valueIndex]
        valueIndex += 1
    return gain, 1-ggain
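
# A small example on a hand-made dataset (illustrative only): feature 0
# separates the two classes perfectly, feature 1 carries no information.
# The function returns the post-split entropy and Gini impurity, so lower is better.
toy_data = [[0, 5], [0, 6], [1, 5], [1, 6]]
toy_classes = [0, 0, 1, 1]
print(calc_info_gain(toy_data, toy_classes, 0))  # entropy 0.0, Gini 0.0: perfect split
print(calc_info_gain(toy_data, toy_classes, 1))  # entropy 1.0, Gini 0.5: as impure as the parent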

# Build the tree
def make_tree(data,classes,featureNames,maxlevel=-1,level=0,forest=0):
    """ The main function, which recursively constructs the tree"""

    classes = np.asarray(classes)  # recursive calls pass plain lists, but the == tests below need an array
    nData = len(data)
    nFeatures = len(data[0])

    newClasses = []
    for aclass in classes:
        if newClasses.count(aclass)==0:
            newClasses.append(aclass)

    frequency = np.zeros(len(newClasses))

    totalEntropy = 0
    totalGini = 0
    index = 0
    for aclass in newClasses:
        frequency[index] = np.sum(classes == aclass)
        totalEntropy += calc_entropy(float(frequency[index])/nData)
        totalGini += (float(frequency[index])/nData)**2

        index += 1

    totalGini = 1 - totalGini
    default = classes[np.argmax(frequency)]

    if nData==0 or nFeatures == 0 or (maxlevel>=0 and level>maxlevel):
        # Have reached an empty branch
        return default
    elif np.sum(classes == classes[0]) == nData:
        # Only 1 class remains
        return classes[0]
    else:
        # Choose which feature is best
        gain = np.zeros(nFeatures)
        ggain = np.zeros(nFeatures)
        featureSet = list(range(nFeatures))  # range() cannot be shuffled in Python 3
        if forest != 0:
            np.random.shuffle(featureSet)
            featureSet = featureSet[0:forest]
        for feature in featureSet:
            g,gg = calc_info_gain(data,classes,feature)
            gain[feature] = totalEntropy - g
            ggain[feature] = totalGini - gg

        # Split on the Gini gain, as stated at the top of the post
        bestFeature = np.argmax(ggain)
        tree = {featureNames[bestFeature]:{}}

        # List the values that bestFeature can take
        values = []
        for datapoint in data:
            if datapoint[bestFeature] not in values:
                values.append(datapoint[bestFeature])

        for value in values:
            # Find the datapoints with each feature value
            newData = []
            newClasses = []
            index = 0
            for datapoint in data:
                if datapoint[bestFeature]==value:
                    if bestFeature==0:
                        newdatapoint = datapoint[1:]
                        newNames = featureNames[1:]
                    elif bestFeature==nFeatures-1:   # the last column, not nFeatures
                        newdatapoint = datapoint[:-1]
                        newNames = featureNames[:-1]
                    else:
                        newdatapoint = datapoint[:bestFeature]
                        newdatapoint = np.hstack((newdatapoint,datapoint[bestFeature+1:]))
                        newNames = featureNames[:bestFeature]
                        newNames.extend(featureNames[bestFeature+1:])
                    newData.append(newdatapoint)
                    newClasses.append(classes[index])
                index += 1

            # Now recurse to the next level	
            subtree = make_tree(newData,newClasses,newNames,maxlevel,level+1,forest)

            # And on returning, add the subtree on to the tree
            tree[featureNames[bestFeature]][value] = subtree

        return tree
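
# The returned tree is a nested dict: each internal node maps a feature name to
# a dict of observed values, each leading to a subtree or a class label.
# Reusing the toy dataset above, with hypothetical feature names 'f0'/'f1':
toy_tree = make_tree(toy_data, toy_classes, ['f0', 'f1'])
print(toy_tree)  # {'f0': {0: 0, 1: 1}} -- splits on 'f0', leaves are class labels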

def classify(tree,datapoint,featureNames):
    if not isinstance(tree,dict):
        # Have reached a leaf
        return tree
    # Each internal node is a dict with a single key: the feature it splits on
    a = list(tree.keys())[0]
    i = featureNames.index(a)
    if datapoint[i] in tree[a]:
        return classify(tree[a][datapoint[i]],datapoint,featureNames)
    # The exact value never appeared in training: follow the branch whose
    # split value is closest to this measurement
    keys = list(tree[a].keys())
    diffs = [abs(key - datapoint[i]) for key in keys]
    t = tree[a][keys[diffs.index(min(diffs))]]
    return classify(t,datapoint,featureNames)
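
# Unseen measurement values fall through to the closest trained branch;
# e.g. 0.4 is nearer to the branch for value 0 than to the one for value 1:
print(classify(toy_tree, [0.4, 5], ['f0', 'f1']))  # class 0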

# Classify the test set
def classifyAll(tree,data,featureNames):
    results = []
    for i in range(len(data)):
        results.append(classify(tree,data[i],featureNames))
    return results

# Compute the accuracy
def accuracy(a,b):
    s = 0
    for i in range(len(a)):
        if a[i] == b[i]:
            s += 1
    rate = s/len(a)
    return rate

featureNames = iris.feature_names
tree = make_tree(new_data,new_target,featureNames)

a = classifyAll(tree, iris.data[removed], featureNames)
# Print the tree and the test accuracy
print(tree)
print(accuracy(a,list(iris.target[removed])))
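
# As a cross-check (a sketch, not part of the original post): sklearn's own
# CART implementation with the same Gini criterion on the same split
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='gini')
clf.fit(new_data, new_target)
print(clf.score(iris.data[removed], iris.target[removed]))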
