决策树分类实例

数据处理

在这里插入图片描述
数据就这样,也没啥好处理的 qwq

决策树(删减版)

删减在于,用的字典。
所以判断 return 的时候少了,为了方便递归。
少了启发式算法,直接用信息增益

计算香农熵
def calcShannonEnt(dataSet):
    """Shannon entropy (natural log) of the class labels.

    dataSet -- list of samples; the last element of each sample is its class.
    Returns 0 for an empty dataset.
    """
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(dataSet)
    return -sum((c / total) * log(c / total) for c in counts.values())
根据信息增益找最佳划分特征
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no split yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1   # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestIndex = 0.0, -1
    for featIndex in range(featureCount):
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for v in set(sample[featIndex] for sample in dataSet):
            subset = splitDataSet(dataSet, featIndex, v)
            splitEntropy += (len(subset) / float(len(dataSet))) * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, featIndex
    return bestIndex
生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree.

    dataSet -- list of samples; each sample ends with its class label.
    labels  -- feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or a nested dict
    {featureName: {featureValue: subtree, ...}}.
    """
    # Work on a copy: the original code ran `del(labels[bestFeat])` on the
    # caller's list, destroying it (which forced callers to rebuild it).
    labels = labels[:]
    classList = [example[-1] for example in dataSet]  # last column is the class
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class -> leaf
    if len(labels) == 0:
        return majorityCnt(classList)  # no features left -> majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # best feature by info gain
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # each distinct value becomes a branch
    for value in uniqueVals:
        # Fresh copy per branch so recursive calls don't share the list.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)

总代码



# Load the lenses dataset: one tab-separated sample per line, last column
# is the class label. `with` closes the handle (the original open() leaked it).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']

#计算原始数据的香农熵
import numpy as np
from math import log
def calcShannonEnt(dataSet):
    """Shannon entropy (natural log) of the class labels.

    dataSet -- list of samples; the last element of each sample is its class.
    Returns 0 for an empty dataset.
    """
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(dataSet)
    return -sum((c / total) * log(c / total) for c in counts.values())
#print(calcShannonEnt(lenses))

def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Replaces the hand-rolled scan (which called classList.count twice per
    distinct label) with an idiomatic max(). Ties still break by set
    iteration order, and an empty list still returns the 0 sentinel,
    matching the original behavior.
    """
    if not classList:
        return 0
    # max() keeps the first maximum seen, mirroring the original strict '>'.
    return max(set(classList), key=classList.count)

#classList = [example[-1] for example in lenses]#最后一个是类别
#print(majorityCnt(classList))
#划分数据集
def splitDataSet(dataSet,feature_index,feature_value):
    """Select the samples whose feature_index-th value equals feature_value,
    dropping that feature column (it has already been used for a split).

    Slice concatenation builds brand-new rows, so dataSet is left untouched.
    """
    return [row[:feature_index] + row[feature_index + 1:]
            for row in dataSet
            if row[feature_index] == feature_value]

#print(splitDataSet(lenses,0,"pre"))

#选择特征划分
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no split yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1   # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestIndex = 0.0, -1
    for featIndex in range(featureCount):
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for v in set(sample[featIndex] for sample in dataSet):
            subset = splitDataSet(dataSet, featIndex, v)
            splitEntropy += (len(subset) / float(len(dataSet))) * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, featIndex
    return bestIndex

#生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree.

    dataSet -- list of samples; each sample ends with its class label.
    labels  -- feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or a nested dict
    {featureName: {featureValue: subtree, ...}}.
    """
    # Work on a copy: the original code ran `del(labels[bestFeat])` on the
    # caller's list, destroying it (which forced callers to rebuild it).
    labels = labels[:]
    classList = [example[-1] for example in dataSet]  # last column is the class
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class -> leaf
    if len(labels) == 0:
        return majorityCnt(classList)  # no features left -> majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # best feature by info gain
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # each distinct value becomes a branch
    for value in uniqueVals:
        # Fresh copy per branch so recursive calls don't share the list.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)

#构造分类器
def classify(inputTree,featLabels,testVec):
    """Walk the decision tree and return the predicted class for testVec.

    inputTree  -- nested dict {featureName: {featureValue: subtree}} or a leaf.
    featLabels -- feature name for each position of testVec.
    testVec    -- feature values of the sample to classify.
    """
    node = inputTree
    for _ in range(len(featLabels)):
        featName = str(list(node.keys())[0])
        # Look up the sample's value for this node's feature by name;
        # fall back to 0 when the name is absent (original default).
        testValue = next(
            (v for name, v in zip(featLabels, testVec) if name == featName), 0)
        node = node[featName][testValue]
        if not isinstance(node, dict):
            return node  # reached a leaf
    return node


# Rebuild the full label list before classifying: the createTree call above
# may have consumed entries from lensesLabels via del(labels[bestFeat]).
lensesLabels = ['age', 'prescript', 'astigmatic','tearRate']
# Predict the lens type for one sample (age=young, prescript=myope,
# astigmatic=yes, tearRate=normal).
print(classify(myTree, ['age','prescript','astigmatic','tearRate'],['young','myope','yes','normal']))

sklearn 实例

唯一麻烦的就是,需要实现离散化特征,字符串不能训练
需要用 LabelEncoder 实现



# Load the lenses dataset: one tab-separated sample per line, last column
# is the class label. `with` closes the handle (the original open() leaked it).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']

import numpy as np
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

# Encode the string-valued features column by column; DecisionTreeClassifier
# needs numeric input.
le = preprocessing.LabelEncoder()
x_train=np.array([x[:-1] for x in lenses])
y_train=np.array([x[-1] for x in lenses])

for i in range(len(x_train[0])):
    # fit_transform replaces the original separate fit() + transform().
    x_train[:,i] = le.fit_transform(x_train[:,i])

# The in-place assignment above stored the encoded labels back into a string
# dtype array ('0', '1', ...); convert to real integers before fitting.
x_train = x_train.astype(int)

print(x_train)
print(y_train)
tree = DecisionTreeClassifier()
tree.fit(x_train,y_train)

print('Train score:{:.3f}'.format(tree.score(x_train,y_train)))

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值