Decision Tree

1. Using sklearn to predict computer purchases
import csv

from sklearn import preprocessing, tree
from sklearn.feature_extraction import DictVectorizer

featureList = []  # feature container -- note: a list of dicts mapping feature name to feature value
labelList = []    # label container
data = []         # raw data set
header = []

try:
    with open(r'D:\machineLearning\src\decisionTree\AllElectronics.csv', 'r') as allElectronics:
        reader = csv.reader(allElectronics)
        header = next(reader)  # ['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer']
        data = [row for row in reader]  # list comprehension
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))

# sample row: ['1', 'youth', 'high', 'no', 'fair', 'no']

# Convert each row into a dict of {feature_name: feature_value}
for row in data:
    labelList.append(row[len(row) - 1])  # the last column of each row is the label
    rowDict = {}
    for i in range(1, len(row) - 1):     # skip the RID column and the label column
        rowDict[header[i]] = row[i]
    featureList.append(rowDict)

# One-hot encode featureList into a numeric matrix
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

print("dummyX: " + str(dummyX))
print(vec.get_feature_names_out())  # use vec.get_feature_names() on sklearn < 1.0
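DictVectorizer orders its one-hot columns alphabetically by "feature=value" name, which matters when editing encoded rows by hand later. A quick sanity check, assuming the classic 14-record AllElectronics table whose first record is ['1', 'youth', 'high', 'no', 'fair', 'no']:

# Pair each column name with its value in the first encoded row
for name, value in zip(vec.get_feature_names_out(), dummyX[0]):
    print(name, value)
# Expected column order (alphabetical): age=middle_aged, age=senior,
# age=youth, credit_rating=excellent, credit_rating=fair, income=high,
# income=low, income=medium, student=no, student=yes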

# Binarize labelList into a numeric vector
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Build the decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy')  # entropy as the split criterion, i.e. an ID3-style measure
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))

# Plot the decision tree
# with open('allElectronicInformationGainOri.dot', 'w') as f:
#     f = tree.export_graphviz(clf, feature_names=vec.get_feature_names_out(), out_file=f)
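If the export lines above are uncommented, the resulting .dot file can be rendered to a PDF. A minimal sketch, assuming Graphviz's dot executable is installed and on PATH; the output file name is arbitrary:

# Render the exported .dot file to PDF (assumes Graphviz is installed)
import subprocess
subprocess.run(['dot', '-Tpdf', 'allElectronicInformationGainOri.dot',
                '-o', 'allElectronics.pdf'], check=True)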

# Prediction experiment: take the first encoded row and modify it
oneRow = dummyX[0, :]

newRowX = oneRow.copy()  # copy, so we do not mutate dummyX in place

newRowX[0] = 1  # age=middle_aged -> 1 (alphabetical column order from DictVectorizer)
newRowX[2] = 0  # age=youth -> 0 (the original row was a youth)
print("newRowX: " + str(newRowX))

willBePredict = []
willBePredict.append(newRowX)

predictedY = clf.predict(willBePredict)
print("predictedY: " + str(predictedY[0]))  # expected: 1, i.e. will buy

2. The principle behind it

# _*_ coding:utf-8 _*_
import operator
from math import log

# Build the sample data set
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


# Compute the Shannon entropy of a data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of records
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    # Entropy: -sum(p * log2(p)) over all class labels
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
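As a sanity check, the entropy of the five-record sample set (2 'yes', 3 'no') can be worked out by hand:

# H(D) = -(2/5)*log2(2/5) - (3/5)*log2(3/5)
#      ≈ 0.529 + 0.442 = 0.971
# i.e. calcShannonEnt(dataSet) returns 0.9709505944546686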


# Split the data set on a given feature: keep every record whose value at
# position `axis` equals `value`, and strip that column out of the result.
# This prepares the subsets needed to compute each feature's information gain.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]          # columns before `axis`
            reducedFeatVec.extend(featVec[axis+1:])  # columns after `axis`
            retDataSet.append(reducedFeatVec)
    return retDataSet
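For example, splitting the sample set on feature 0 with value 1 keeps the three records whose first column is 1 and strips that column out:

# splitDataSet(dataSet, 0, 1)
# -> [[1, 'yes'], [1, 'yes'], [0, 'no']]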


# Compute each feature's information gain and return the feature with the largest gain
def chooseBestFeatureToSplit(dataSet):
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole data set
    numFeatures = len(dataSet[0]) - 1      # number of features (the last column is the label)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)  # distinct values of feature i (duplicates removed)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)  # records where feature i == value
            prob = len(subDataSet) / float(len(dataSet))  # empirical probability of that value
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:          # keep the index of the largest gain so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
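On the sample set the two gains work out as follows, which is why feature 0 ('no surfacing') ends up as the root of the tree:

# Gain(feature 0) = 0.971 - (3/5 * H([yes, yes, no]) + 2/5 * H([no, no]))
#                 = 0.971 - (0.6 * 0.918 + 0.4 * 0.0) ≈ 0.420
# Gain(feature 1) = 0.971 - (4/5 * H([yes, yes, no, no]) + 1/5 * H([no]))
#                 = 0.971 - (0.8 * 1.0 + 0.2 * 0.0)  ≈ 0.171
# so chooseBestFeatureToSplit(dataSet) returns 0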

# Majority vote: return the most common class label
def majorityCnt(classList):
    classCount = {}
    for vote in classList:  # classList is a list, so iterate over it directly
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
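For example:

# majorityCnt(['yes', 'no', 'no']) -> 'no'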

# Recursively build the decision tree
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # e.g. ['yes', 'yes', 'no', 'no', 'no']
    if classList.count(classList[0]) == len(classList):  # case 1: all records share one class, stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:  # case 2: all features used up, fall back to a majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # feature with the largest information gain
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # note: this mutates the caller's labels list
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so sibling branches see the same label list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


dataSet, labels = createDataSet()
# ent = calcShannonEnt(dataSet)
# print(ent)

# data_set = splitDataSet(dataSet, 1, 1)
# print(data_set)

# bestFeature = chooseBestFeatureToSplit(dataSet)
# print(bestFeature)

myTree = createTree(dataSet, labels)
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
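Once built, the nested-dict tree can be used for prediction by walking it from the root. A minimal sketch; classify() is not part of the original code, and the label list is passed in fresh because createTree mutated the original:

# Hypothetical helper: walk the nested-dict tree to classify one record
def classify(inputTree, featLabels, testVec):
    featName = list(inputTree.keys())[0]      # feature tested at this node
    secondDict = inputTree[featName]
    featIndex = featLabels.index(featName)    # position of that feature in a record
    subTree = secondDict[testVec[featIndex]]  # follow the branch for this record's value
    if isinstance(subTree, dict):             # internal node: keep walking
        return classify(subTree, featLabels, testVec)
    return subTree                            # leaf node: the class label

print(classify(myTree, ['no surfacing', 'flippers'], [1, 1]))  # 'yes'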
