Decision Trees Implemented in R and Python (Without Calling Libraries)

R:

# loadData: build the sample data set (three binary features a, b, c plus a label column)

loadData <- function(){
  dataSet = matrix(c(1,1,0,"yes",1,1,1,"yes",1,0,1,"no",0,1,0,"no",0,1,1,"no"),byrow = T, nrow =5)
  colnames(dataSet) = c("a","b","c","labels")
  return (dataSet)
}
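
Note that mixing numbers and strings in matrix(c(...)) coerces every entry to character, so the feature values are stored as "0"/"1" and all comparisons below are string comparisons; that is harmless here because the splits only test equality.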


# compute the Shannon entropy of the label column
calShannonEnt <- function(dataSet){
  numEntries = nrow(dataSet)
  labels = levels(factor(dataSet[,"labels"]))
  labelCount = NULL
  labelCount[labels] = rep(0,length(labels))
  for(i in 1:numEntries){
    if(dataSet[i,"labels"] %in% labels){
      temp = dataSet[i,"labels"]
      labelCount[temp] = labelCount[temp] + 1
    }
  }
  shannonEnt = 0
  for(i in 1:length(labelCount)){
    prob = labelCount[i] / numEntries
    shannonEnt = shannonEnt - prob * log2(prob)
  }
  return (shannonEnt)
}
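
For reference, the quantity computed here is the Shannon entropy of the class distribution,

H(D) = -\sum_k p_k \log_2 p_k,

where p_k is the proportion of samples in class k. For the five-row sample above (two "yes", three "no") this gives -(2/5)\log_2(2/5) - (3/5)\log_2(3/5) ≈ 0.971.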


# split the data set: keep the rows where column `axis` equals `value` (the column itself is kept; the caller drops it)
splitDataSet <- function(dataSet,axis,value){
  retDataSet = NULL
  for(i in 1:nrow(dataSet)){
    if(dataSet[i,axis] == value){
      tempDataSet = dataSet[i,]
      retDataSet = rbind(retDataSet,tempDataSet)
    }
  }
  rownames(retDataSet) = NULL
  return (retDataSet)
}


# choose the best feature to split on (the one with the highest information gain)
chooseBestFeatureToSplit <- function(dataSet){
  numFeatures = ncol(dataSet) - 1
  baseEntropy = calShannonEnt(dataSet)
  # best information gain found so far
  bestInfoGain = 0.0
  bestFeature = -1
  for(i in 1:numFeatures){
    featureValues = levels(factor(dataSet[,i]))
    newEntropy = 0.0
    for(j in 1:length(featureValues)){
      subDataSet = splitDataSet(dataSet, i, featureValues[j])
      prob = nrow(subDataSet) / nrow(dataSet)
      newEntropy = newEntropy + prob * calShannonEnt(subDataSet)
    }
    infoGain = baseEntropy - newEntropy
    if(infoGain > bestInfoGain){
      bestInfoGain = infoGain
      bestFeature = i
    }
  }
  return (bestFeature)
}
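
The loop above implements the ID3 information-gain criterion: for each candidate feature a,

Gain(D, a) = H(D) - \sum_v \frac{|D_v|}{|D|} H(D_v),

where D_v is the subset of D whose value of feature a is v; the feature with the largest gain is chosen.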



# majority vote: decide which class a node belongs to
majorityCnt <- function(classList){
  classCount = NULL
  count = as.numeric(table(classList))
  majorityList = levels(as.factor(classList))
  if(length(count) == 1){
    return (majorityList[1])
  }else{
    f = max(count)
    return (majorityList[which(count == f)][1])
  }
}


# small helper: test whether all samples belong to a single class
trick <- function(classList){
  count = as.numeric(table(classList))
  return (length(count) == 1)
}


# recursively build the decision tree
creatTree <- function(dataSet){
  decision_tree = list()
  classList = dataSet[,"labels"]
  # stop if all samples belong to the same class
  if(trick(classList))
    return (rbind(decision_tree, classList[1]))
  # stop if only the label column is left: every feature has been used up
  if(ncol(dataSet) == 1){
    decision_tree = rbind(decision_tree, majorityCnt(classList))
    return (decision_tree)
  }
  # choose bestFeature as the splitting attribute; it is consumed by this split
  bestFeature = chooseBestFeatureToSplit(dataSet)
  labelFeature = colnames(dataSet)[bestFeature]
  # record the chosen feature as this node of the tree
  decision_tree = rbind(decision_tree, labelFeature)
  # recurse into the subset for each value of the chosen feature, e.g. "0" and "1"
  featureValues = levels(as.factor(dataSet[,bestFeature]))
  for(j in 1:length(featureValues)){
    subDataSet = splitDataSet(dataSet, bestFeature, featureValues[j])
    # drop the used feature column before recursing (drop = FALSE keeps the matrix shape)
    subDataSet = subDataSet[, -bestFeature, drop = FALSE]
    temp_tree = creatTree(subDataSet)
    decision_tree = rbind(decision_tree, temp_tree)
  }
  return (decision_tree)
}
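
Note that this R version returns the tree flattened by rbind into a depth-first column of node labels rather than a nested structure, so unlike the Python version below it records the splits but cannot be fed directly into a classifier.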

--------------------------------------------------------------------------------------------------------------------------------

Python:

# -*- coding: utf-8 -*-
from math import log
import operator


# create the sample data set


def createDataSet():
    dataSet =[[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']]
    labels =['no surfacing','flippers']
    return dataSet, labels


myDat,labels = createDataSet()
print myDat
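
(The two features appear to be the "is it a fish?" toy set from Machine Learning in Action: whether the animal can survive without surfacing, and whether it has flippers.)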


# compute the Shannon entropy


def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    # count how many samples fall into each class
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


print calcShannonEnt(myDat)
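
With two 'yes' and three 'no' labels this prints about 0.9710, matching the entropy formula shown in the R section.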


# split the data set: keep the rows where feature `axis` equals `value`, dropping that column

def splitDataSet(dataSet, axis, value):
    retDataSet =[]
    for featVec in dataSet:
        if featVec[axis]==value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet






#print splitDataSet(myDat,0,1)    # would print [[1, 'yes'], [1, 'yes'], [0, 'no']]


# choose the best feature to split on:


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0])-1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain =0.0
    bestFeature =-1
    for i in range(numFeatures):
        #print 'i: ',i
        featList =[example[i] for example in dataSet]
        uniqueVals = set(featList)
        
        newEntropy =0.0
        for value in uniqueVals:
            #print 'value: ',value
            subDataSet = splitDataSet(dataSet, i, value)
            #print 'subDataSet: ', subDataSet
            prob = len(subDataSet)/float(len(dataSet))
            #print 'prob: ',prob
            newEntropy += prob* calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i 
    return bestFeature


print chooseBestFeatureToSplit(myDat)
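
For the sample data this prints 0: splitting on feature 0 ('no surfacing') yields an information gain of about 0.420, versus about 0.171 for 'flippers'.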




# majority vote


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # sort the classes by count, descending, and return the most common one
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
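
A quick sanity check of the voting helper (this test call is my addition, not part of the original post):

print majorityCnt(['yes', 'no', 'yes'])    # prints 'yes'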


# recursively build the tree


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # stop if all samples belong to the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop if no features are left: fall back to a majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # the chosen feature is consumed by this split, so drop its label
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # copy the remaining labels so sibling branches don't interfere
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


myTree = createTree(myDat,labels)
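
For the sample data this builds the nested dictionary {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}.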


# classify a new sample by walking the tree


def classify(inputTree, featLabels, testVec):
    # the root key is the feature tested at this node
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                # internal node: recurse into the matching branch
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf: this is the predicted class
                classLabel = secondDict[key]
    return classLabel


# createTree consumed entries of labels, so rebuild them before classifying
myDat,labels = createDataSet()
print '-----------------------------------------------'
print classify(myTree, labels, [1,0])
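
The test vector [1,0] takes the 'no surfacing' = 1 branch and then 'flippers' = 0, so this prints 'no'.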