0.目录
1.算法思想
AdaBoost:全称 adaptive boosting(自适应 boosting),属于元算法(meta-algorithm)。它通过组合多个弱分类器来构建一个强分类器:每一轮训练都会提高上一轮被错分样本的权值,使后续的弱分类器更关注这些难分的样本。
2.实现代码
from numpy import *
def loadSimpData():
    """Return the five-point toy dataset used to demonstrate AdaBoost.

    Returns:
        tuple: ``(datMat, classLabels)`` where ``datMat`` is a 5x2 numpy
        matrix holding one sample per row and ``classLabels`` is a list
        of five labels, each either ``1.0`` or ``-1.0``.
    """
    datMat = matrix([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples with a single-feature threshold (decision stump).

    Args:
        dataMatrix: 2-D sample data, one sample per row. Anything that
            ``mat()`` accepts (numpy matrix, 2-D ndarray, nested list).
        dimen: Column index of the feature to threshold on.
        threshVal: Threshold value compared against the feature column.
        threshIneq: ``'lt'`` labels samples with ``feature <= threshVal``
            as -1.0; any other value (conventionally ``'gt'``) labels
            samples with ``feature > threshVal`` as -1.0.

    Returns:
        An ``(m, 1)`` ndarray of predictions, each 1.0 or -1.0.
    """
    # Coerce to a matrix so column slicing always yields an (m, 1) column,
    # then drop to a plain ndarray for boolean masking.
    column = asarray(mat(dataMatrix)[:, dimen])
    retArray = ones((column.shape[0], 1))
    if threshIneq == 'lt':
        retArray[column <= threshVal] = -1.0
    else:
        retArray[column > threshVal] = -1.0
    return retArray
def buildStump(dataArr, classLabels, D, numSteps=10.0):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned over evenly spaced thresholds (from one step
    below the feature minimum up to the feature maximum) with both
    inequality directions, and the stump minimising the weighted
    misclassification error ``D.T * errArr`` is kept.

    Args:
        dataArr: Training samples, one per row (anything ``mat()`` accepts).
        classLabels: Sequence of class labels, one per sample.
        D: ``(m, 1)`` column of sample weights.
        numSteps: Number of threshold steps across each feature's range.
            Must be positive. Defaults to 10.0.

    Returns:
        tuple: ``(bestStump, minError, bestClasEst)`` where ``bestStump``
        is a dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``,
        ``minError`` is the lowest weighted error found (the result of
        ``D.T * errArr``, i.e. a 1x1 matrix when ``D`` is a numpy matrix),
        and ``bestClasEst`` holds the predictions of the best stump.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # start at +infinity so the first candidate always wins
    for i in range(n):  # every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 places one threshold below the minimum so that a stump
        # assigning every sample to the same class is also considered.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction,
            # so compute it once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
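在上面的函数中,错误率的定义为:

$$\epsilon = \frac{\text{未正确分类的样本数目}}{\text{所有样本数目}}$$

由于 AdaBoost 中每个样本都带有权值 $D_i$,代码实际计算的是加权错误率 $\epsilon = \sum_i D_i \cdot \mathbb{1}[h(x_i) \neq y_i]$,即 `weightedError = D.T * errArr`(当所有权值都等于 $1/m$ 时,它与上面的定义一致)。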
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump under the current sample weights,
    computes its vote weight ``alpha`` from its weighted error, then
    re-weights the samples so misclassified ones count more next round.
    Training stops early once the ensemble's training error reaches zero.

    Args:
        dataArr: Training samples, one per row.
        classLabels: Sequence of class labels (+1.0 / -1.0), one per sample.
        numIt: Maximum number of boosting rounds. Defaults to 40.

    Returns:
        tuple: ``(weakClassArr, aggClassEst)`` where ``weakClassArr`` is
        the list of stump dicts (keys ``'dim'``, ``'thresh'``, ``'ineq'``,
        ``'alpha'``) and ``aggClassEst`` is the ``(m, 1)`` matrix of
        alpha-weighted vote sums on the training samples.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    labelMat = mat(classLabels).T  # loop-invariant, so build it once
    D = mat(ones((m, 1)) / m)  # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump may hand back a 1x1 matrix; extract the scalar
        # explicitly rather than relying on implicit size-1 array to
        # scalar conversion, which NumPy has deprecated.
        error = float(asarray(error).item())
        # max(error, 1e-16) guards against division by zero when error == 0.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Exponent is -alpha where prediction == label, +alpha otherwise.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalise so the weights sum to 1
        # Training error of the ensemble built so far.
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
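在上面的函数中,最重要的是对权值进行更新:
其中,alpha计算公式为:$\alpha = \frac{1}{2}\ln\left(\frac{1-\epsilon}{\epsilon}\right)$
如果某个样本被正确分类,权值更新为:$D_i^{(t+1)} = \dfrac{D_i^{(t)}\, e^{-\alpha}}{\mathrm{Sum}(D)}$
如果被错分,权值更新为:$D_i^{(t+1)} = \dfrac{D_i^{(t)}\, e^{\alpha}}{\mathrm{Sum}(D)}$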
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained AdaBoost ensemble.

    Args:
        datToClass: One sample or several samples, one per row (anything
            ``mat()`` accepts).
        classifierArr: List of stump dicts with keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``, as produced by
            ``adaBoostTrainDS``.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the alpha-weighted vote
        sum for each sample.

    Side effects:
        Prints the running vote sum after each weak classifier is applied.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for stump in classifierArr:
        # Apply each weak classifier and add its alpha-weighted vote.
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        print(aggClassEst)
    return sign(aggClassEst)
def main():
    """Train AdaBoost on the toy dataset and classify the point [0, 0]."""
    datMat, classLabels = loadSimpData()
    # Only the trained stumps are needed here; the training-set vote sums
    # returned alongside them are discarded.
    classifierArr, _ = adaBoostTrainDS(datMat, classLabels, 30)
    classifyResult = adaClassify([0, 0], classifierArr)
    print(classifyResult)


if __name__ == '__main__':
    main()
3.参考文献
[1] Peter Harrington(哈林顿)著, 李锐 等译. 机器学习实战(Machine Learning in Action)[M]. 北京: 人民邮电出版社, 2013.