Machine Learning in Action: Decision Trees

1. Measuring the disorder of a dataset: Shannon entropy and Gini impurity.

2. Information gain: the reduction in disorder after a split; the larger, the better.

3. Choose the feature to split on so that the information gain is maximized.
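
The entropy measure is implemented in calcShannonEnt below. Gini impurity, the other disorder measure mentioned above, is not implemented in this post; a minimal sketch under the same data layout (class label in the last column) could look like this (the name calcGini is my own, not from the book):

def calcGini(dataSet):
    """Gini impurity: 1 - sum(p_i^2) over all class labels."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        gini -= prob * prob
    return gini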

from math import log

def calcShannonEnt(dataSet):
    """Return the Shannon entropy of a dataset whose last column is the class label."""
    numEntries = len(dataSet)
    labelCounts = {}
    # Count occurrences of each class label.
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    # H = -sum(p_i * log2(p_i)) over all class labels.
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Return a small toy dataset (last column is the class label) and the feature names."""
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
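
A quick sanity check: the toy labels are ['yes', 'yes', 'no', 'no', 'no'], so the entropy should be -(2/5)log2(2/5) - (3/5)log2(3/5) ≈ 0.971:

myDat, labels = createDataSet()
print(calcShannonEnt(myDat))    # 0.9709505944546686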


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index `axis` equals `value`, with that column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Keep everything before and after the split column.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
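
For example, splitting the toy dataset on feature 0 keeps only the matching rows and strips that column:

myDat, labels = createDataSet()
print(splitDataSet(myDat, 0, 1))    # [[1, 'yes'], [1, 'yes'], [0, 'no']]
print(splitDataSet(myDat, 0, 0))    # [[1, 'no'], [1, 'no']]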


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split gives the largest information gain."""
    numFeatures = len(dataSet[0]) - 1     # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]   # all values of feature i
        uniqueVals = set(featList)                       # distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            # Accumulate the weighted entropy of each subset produced by the split.
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
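
On the toy dataset, splitting on feature 0 ('no surfacing') gives an information gain of about 0.420, versus about 0.171 for feature 1, so the best feature is index 0:

myDat, labels = createDataSet()
print(chooseBestFeatureToSplit(myDat))    # 0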

