数据处理
数据就这样,也没啥好处理的
qwq
决策树(删减版)
删减在于,用的字典。
所以判断 `return` 的时候少了,为了方便递归。
少了启发式算法,直接用信息增益(代码里计算的是信息增益,不是增益率)
计算信息增益
def calcShannonEnt(dataSet):
    """Shannon entropy (natural log, i.e. nats) of the class labels.

    Each example's last element is taken as its class label.
    Returns 0 for an empty dataSet.
    """
    labels = [row[-1] for row in dataSet]  # last column is the class
    total = len(labels)
    entropy = 0.0
    for label in set(labels):
        frac = labels.count(label) / total
        entropy -= frac * log(frac)
    return entropy
根据信息增益找最佳划分特征
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    The last column of each row is the class label and is never considered.
    Returns -1 when no feature achieves a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestFeature, bestGain = -1, 0.0
    for featIdx in range(featureCount):
        # Weighted average entropy of the partitions induced by this feature.
        splitEntropy = 0.0
        for val in {row[featIdx] for row in dataSet}:
            part = splitDataSet(dataSet, featIdx, val)
            weight = len(part) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(part)
        gain = baseEntropy - splitEntropy  # reduction in entropy
        if gain > bestGain:
            bestGain, bestFeature = gain, featIdx
    return bestFeature
生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels:  feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureLabel: {featureValue: subtree}}.

    Fix: the original del'd labels[bestFeat], mutating the caller's list;
    we now drop the consumed feature with slicing instead.
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    # All examples share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features remain: fall back to the majority class.
    if len(labels) == 0:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # feature with max info gain
    bestFeatLabel = labels[bestFeat]
    # Remove the consumed feature without touching the caller's list.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        # Copy per branch so sibling recursions don't share one list.
        subLabels = remainingLabels[:]
        subset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subset, subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)
总代码
# Load the lenses dataset: tab-separated rows, last column is the lens type.
# `with` closes the file even on error (the original leaked the handle).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']
#计算原始数据的香农熵
from collections import Counter
from math import log

import numpy as np
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (natural log, nats) of the class labels.

    dataSet: list of examples; the last element of each example is the class.
    Returns 0.0 for an empty dataSet.

    Improvement: one Counter pass instead of calling list.count once per
    distinct label (was O(k*n), now O(n)).
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    total = len(classList)
    shannonEnt = 0.0
    for count in Counter(classList).values():
        p = count / total
        shannonEnt -= p * log(p)
    return shannonEnt
#print(calcShannonEnt(lenses))
def majorityCnt(classList):
    """Return the most frequent label in classList.

    On ties the label met first while scanning the distinct labels wins,
    matching the original loop. Returns 0 for an empty list (the original
    initial value).
    """
    if not classList:
        return 0
    return max(set(classList), key=classList.count)
#classList = [example[-1] for example in lenses]#最后一个是类别
#print(majorityCnt(classList))
#划分数据集
def splitDataSet(dataSet,feature_index,feature_value):
    """Select the rows whose feature_index-th value equals feature_value,
    returning each with that column removed (the feature is consumed by
    the split).

    Slicing builds brand-new rows, so dataSet itself is never modified.
    """
    return [row[:feature_index] + row[feature_index + 1:]
            for row in dataSet
            if row[feature_index] == feature_value]
#print(splitDataSet(lenses,0,"pre"))
#选择特征划分
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    The last column of each row is the class label and is never considered.
    Returns -1 when no feature achieves a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestFeature, bestGain = -1, 0.0
    for featIdx in range(featureCount):
        # Weighted average entropy of the partitions induced by this feature.
        splitEntropy = 0.0
        for val in {row[featIdx] for row in dataSet}:
            part = splitDataSet(dataSet, featIdx, val)
            weight = len(part) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(part)
        gain = baseEntropy - splitEntropy  # reduction in entropy
        if gain > bestGain:
            bestGain, bestFeature = gain, featIdx
    return bestFeature
#生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels:  feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureLabel: {featureValue: subtree}}.

    Fix: the original del'd labels[bestFeat], mutating the caller's list
    (which is why the script rebuilds lensesLabels before classify);
    we now drop the consumed feature with slicing instead.
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    # All examples share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features remain: fall back to the majority class.
    if len(labels) == 0:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # feature with max info gain
    bestFeatLabel = labels[bestFeat]
    # Remove the consumed feature without touching the caller's list.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        # Copy per branch so sibling recursions don't share one list.
        subLabels = remainingLabels[:]
        subset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subset, subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)
#构造分类器
def classify(inputTree,featLabels,testVec):
    """Walk a nested-dict decision tree and return the predicted class.

    inputTree: {featureLabel: {featureValue: subtree-or-leaf}} as produced
               by createTree.
    featLabels: feature names aligned with the values in testVec.
    testVec:   feature values of the example to classify.
    Raises ValueError if a node's feature label is not in featLabels, and
    KeyError if the example's value has no branch at a node.

    Fixes vs the original: the `while cnt < len(featLabels)` cap could
    return an inner subtree dict instead of a label, and the `val = 0`
    sentinel produced a confusing KeyError when a label was missing.
    """
    node = inputTree
    while isinstance(node, dict):
        feature = next(iter(node))  # the single feature this node splits on
        value = testVec[featLabels.index(feature)]
        node = node[feature][value]
    return node
# Rebuild the label list: the createTree call above del's entries from the
# list it was given, so lensesLabels may no longer be complete.
lensesLabels = ['age', 'prescript', 'astigmatic','tearRate']
# Classify one example; prints the predicted lens type.
print(classify(myTree, ['age','prescript','astigmatic','tearRate'],['young','myope','yes','normal']))
sklearn 实例
唯一麻烦的就是,需要实现离散化特征,字符串不能训练
需要用 LabelEncoder。
LabelEncoder实现
# Load the lenses dataset again for the sklearn experiment.
# `with` closes the file even on error (the original leaked the handle).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']
import numpy as np
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

# DecisionTreeClassifier cannot train on strings, so label-encode every
# feature column to integers in place.
le = preprocessing.LabelEncoder()
x_train = np.array([x[:-1] for x in lenses])
y_train = np.array([x[-1] for x in lenses])
for i in range(x_train.shape[1]):
    x_train[:, i] = le.fit_transform(x_train[:, i])
print(x_train)
print(y_train)

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
# Score on the training set itself — measures fit, not generalization.
print('Train score:{:.3f}'.format(tree.score(x_train, y_train)))