『机器学习实战』决策树

代码:

#! /usr/bin/env python
# coding: utf-8

from math import log
import operator

def calcShannonEnt(dataSet):
	"""Return the Shannon entropy (base 2) of the class labels in dataSet.

	Args:
		dataSet: sized collection of rows; the last element of each row is
			treated as that row's class label.

	Returns:
		The entropy in bits as a float. An empty data set yields 0.0.
	"""
	numEntries = len(dataSet)
	labelCounts = {}
	# First pass: count how many rows carry each class label.
	for featVec in dataSet:
		currentLabel = featVec[-1]
		labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
	# Second pass (after counting is complete): accumulate -p * log2(p).
	shannonEnt = 0.0
	for count in labelCounts.values():
		prob = float(count) / numEntries
		shannonEnt -= prob * log(prob, 2)
	return shannonEnt
		


def createDataSet():
	"""Build the small sample data set together with its feature names.

	Returns:
		A (dataSet, labels) pair. dataSet is a new list of five rows, each
		holding two integer feature values followed by a 'yes'/'no' class
		label; labels is a new list naming the two feature columns.
	"""
	samples = (
		(1, 1, 'yes'),
		(1, 1, 'yes'),
		(1, 0, 'no'),
		(0, 1, 'no'),
		(0, 1, 'no'),
	)
	featureNames = ['no surfacing', 'flippers']
	# Rows are materialised as fresh lists so callers may edit them freely.
	return [list(sample) for sample in samples], featureNames
	
	
def splitDataSet(dataSet, axis, value):
	"""Select the rows whose column `axis` equals `value`, dropping that column.

	Args:
		dataSet: iterable of list rows.
		axis: index of the column to test and remove.
		value: value the column must equal for a row to be kept.

	Returns:
		A new list of new rows; the input rows are left untouched.
	"""
	def withoutAxis(row):
		# Everything before the column, then everything after it.
		kept = row[:axis]
		kept.extend(row[axis + 1:])
		return kept

	return [withoutAxis(row) for row in dataSet if row[axis] == value]
	

def chooseBestFeatureToSplit(dataSet):
	"""Pick the feature column whose split gives the largest information gain.

	Args:
		dataSet: non-empty list of rows; every element except the last is a
			feature value and the last is the class label.

	Returns:
		The index of the best feature column, or -1 when no column produces
		a gain strictly greater than zero.
	"""
	totalRows = float(len(dataSet))
	featureCount = len(dataSet[0]) - 1
	parentEntropy = calcShannonEnt(dataSet)
	winner, winnerGain = -1, 0.0
	for column in range(featureCount):
		# Entropy after the split, weighted by the share of rows per value.
		conditionalEntropy = 0.0
		for candidate in set(row[column] for row in dataSet):
			partition = splitDataSet(dataSet, column, candidate)
			weight = len(partition) / totalRows
			conditionalEntropy += weight * calcShannonEnt(partition)
		gain = parentEntropy - conditionalEntropy
		if gain > winnerGain:
			winner, winnerGain = column, gain
	return winner
	
	
def majorityCnt(classList):
	"""Return the class label that occurs most often in classList.

	Args:
		classList: non-empty iterable of hashable class labels.

	Returns:
		The most frequent label.

	Raises:
		ValueError: if classList is empty.
	"""
	classCount = {}
	for vote in classList:
		# Start unseen labels at zero, then count every vote.
		classCount[vote] = classCount.get(vote, 0) + 1
	# items() exists on both Python 2 and 3, unlike iteritems().
	return max(classCount.items(), key=operator.itemgetter(1))[0]
	
	
def createTree(dataSet, labels):
	"""Recursively build a decision tree as nested dictionaries.

	At each node the feature chosen by chooseBestFeatureToSplit is used to
	partition the rows, and a subtree is built for every value it takes.

	Args:
		dataSet: non-empty list of rows; each row holds feature values
			followed by the class label in the last position.
		labels: feature names, one per feature column of dataSet. The
			sequence is not modified.

	Returns:
		A class label when the node is a leaf, otherwise a dict of the form
		{featureName: {featureValue: subtree, ...}}.
	"""
	classList = [example[-1] for example in dataSet]
	# Every row has the same class: this branch is a pure leaf.
	if classList.count(classList[0]) == len(classList):
		return classList[0]
	# Only the class column is left: fall back to a majority vote.
	if len(dataSet[0]) == 1:
		return majorityCnt(classList)
	bestFeat = chooseBestFeatureToSplit(dataSet)
	# -1 means no feature gives a positive gain. Using it as an index would
	# silently split on the class-label column, so vote instead.
	if bestFeat < 0:
		return majorityCnt(classList)
	bestFeatLabel = labels[bestFeat]
	myTree = {bestFeatLabel: {}}
	# Build the reduced label list without mutating the caller's list.
	subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
	uniqueVals = set(example[bestFeat] for example in dataSet)
	for value in uniqueVals:
		myTree[bestFeatLabel][value] = createTree(
			splitDataSet(dataSet, bestFeat, value), subLabels)

	return myTree
		
	

运行代码:

import trees_L

# Build the sample data set and the names of its feature columns.
myDat, labels = trees_L.createDataSet()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(myDat)
print(labels)

# Entropy of the class labels, printed before and after the optional edit.
print(trees_L.calcShannonEnt(myDat))
# Uncomment to introduce a third class label and see the entropy change.
#myDat[0][-1] = 'maybe'
print(trees_L.calcShannonEnt(myDat))

# Pass a copy so the label list stays intact even if createTree mutates it.
myTree = trees_L.createTree(myDat, labels[:])
print(myTree)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值