adaBoost.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8
from numpy import *
def loadSimpleData():
    """Return the five-point toy data set used by the examples.

    Returns:
        A ``(datMat, classLabels)`` pair: a 5 x 2 numpy matrix of feature
        values and a list of five class labels, each +1.0 or -1.0.
    """
    # Each entry is (feature 0, feature 1, class label).
    labelledSamples = [
        (1., 2.1, 1.0),
        (2., 1.1, 1.0),
        (1.3, 1., -1.0),
        (1., 1., -1.0),
        (2., 1., 1.0),
    ]
    featureRows = [[first, second] for first, second, _ in labelledSamples]
    labels = [label for _, _, label in labelledSamples]
    return matrix(featureRows), labels
# Classify samples by comparing one feature against a threshold.
# Samples on one side of the threshold get class -1, the others class +1.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label every row of ``dataMatrix`` with a single-feature decision stump.

    Args:
        dataMatrix: 2-D sample matrix, one row per sample.
        dimen: column index of the feature to test.
        threshVal: threshold the feature value is compared against.
        threshIneq: ``'lt'`` labels rows with ``value <= threshVal`` as -1;
            any other value labels rows with ``value > threshVal`` as -1.

    Returns:
        An ``(m, 1)`` array of +1.0 / -1.0 predictions, where ``m`` is the
        number of rows in ``dataMatrix``.
    """
    featureColumn = dataMatrix[:, dimen]
    # Work out which rows fall on the "negative" side of the threshold.
    if threshIneq == 'lt':
        negativeMask = featureColumn <= threshVal
    else:
        negativeMask = featureColumn > threshVal
    # Start with every prediction at +1, then flip the masked rows to -1.
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negativeMask] = -1.0
    return predictions
# Try every stumpClassify configuration and keep the best decision stump.
# "Best" means the lowest error weighted by the sample weight vector D.
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned over evenly spaced thresholds, starting one
    step below the feature's minimum and ending at its maximum, with both
    inequality directions tried at each threshold.

    Args:
        dataArr: 2-D sample data, one row per sample.
        classLabels: sequence of class labels, one per sample.
        D: column vector of sample weights, multiplied as ``D.T * errors``.

    Returns:
        A ``(bestStump, minError, bestClasEst)`` tuple:
        ``bestStump`` is a dict with keys ``'dim'`` (feature index),
        ``'thresh'`` and ``'ineq'``; ``minError`` is the weighted error of
        that stump as a 1 x 1 matrix; ``bestClasEst`` holds its predictions
        as an ``(m, 1)`` array.
    """
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    dataMat = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMat)
    numSteps = 10.0
    bestStump = {}  # description of the best stump found for this D
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):  # every feature
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # every candidate threshold
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:  # both directions
                predictedVals = stumpClassify(dataMat, i, threshVal, inequal)
                # 1.0 where the prediction is wrong, 0.0 where it is right.
                errArr = asmatrix(predictedVals != labelMat, dtype=float)
                # This is where AdaBoost's weights reach the weak learner.
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    # Store the feature index i, not the step index j:
                    # adaClassify uses 'dim' to select the column to test.
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# AdaBoost training loop built on single-level decision trees (stumps).
def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round prints the stump's predictions, the running aggregate
    estimate and the training error rate.

    Args:
        dataArr: 2-D sample data, one row per sample.
        classLabels: sequence of +1 / -1 class labels, one per sample.
        numIt: maximum number of boosting rounds.

    Returns:
        A ``(weakClassArr, aggClassEst)`` tuple: the list of stump dicts
        (each extended with its ``'alpha'`` weight) and the ``(m, 1)``
        matrix of alpha-weighted prediction sums for the training samples.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    labelMat = asmatrix(classLabels).T  # built once, reused every round
    # D is a probability distribution over the samples (it sums to 1).
    # All weights start equal; each round raises the weight of misclassified
    # samples and lowers the weight of correctly classified ones.
    D = asmatrix(ones((m, 1)) / m)
    aggClassEst = asmatrix(zeros((m, 1)))  # running weighted vote per sample
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump reports the error as a 1 x 1 matrix; extract the scalar
        # explicitly rather than relying on float() of a 2-D array.
        error = asarray(error, dtype=float).item()
        # Weight of this weak classifier; max() guards against division by
        # zero when the stump makes no mistakes.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Single-argument print(...) is valid on Python 2 and Python 3 and
        # emits the same text as the original print statements.
        print("classEst:  %s" % (classEst.T,))
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalise so D stays a distribution
        aggClassEst += alpha * classEst
        print("aggClassEst:  %s" % (aggClassEst.T,))
        misclassified = sign(aggClassEst) != labelMat
        errorRate = float(misclassified.sum()) / m
        print("total error : %s \n" % (errorRate,))
        if errorRate == 0.0:  # training set fully separated: stop early
            break
    return weakClassArr, aggClassEst
测试:
>>> from adaBoost import *
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray, aggClassEst = adaBoostTrainDS(dataMat, classLabels, 9)
classEst: [[-1. 1. -1. -1. 1.]]
aggClassEst: [[-0.69314718 0.69314718 -0.69314718 -0.69314718 0.69314718]]
total error : 0.2
classEst: [[ 1. 1. -1. -1. -1.]]
aggClassEst: [[ 0.27980789 1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error : 0.2
classEst: [[ 1. 1. 1. 1. 1.]]
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
total error : 0.0
>>> classifierArray
[{'dim': 3, 'ineq': 'lt', 'thresh': 1.3, 'alpha': 0.6931471805599453}, {'dim': 0, 'ineq': 'lt', 'thresh': 1.0, 'alpha': 0.9729550745276565}, {'dim': -1, 'ineq': 'lt', 'thresh': 0.90000000000000002, 'alpha': 0.8958797346140273}]
测试算法:基于AdaBoost的分类
# AdaBoost classification function.
# Inputs: the samples to classify and the list of trained weak classifiers.
def adaClassify(dataToClass, classifierArr):
    """Classify samples with a trained list of decision stumps.

    The running aggregate estimate is printed after each weak classifier
    is applied.

    Args:
        dataToClass: one sample or a 2-D collection of samples.
        classifierArr: list of stump dicts with the keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the alpha-weighted vote
        for each of the ``m`` samples.
    """
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    dataMat = asmatrix(dataToClass)
    m = shape(dataMat)[0]  # number of samples
    aggClassEst = asmatrix(zeros((m, 1)))  # cumulative weighted class estimate
    for stump in classifierArr:
        classEst = stumpClassify(dataMat, stump['dim'],
                                 stump['thresh'],
                                 stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(aggClassEst)
    return sign(aggClassEst)
测试:
>>> from adaBoost import *
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray, aggClassEst = adaBoostTrainDS(dataMat, classLabels, 30)
classEst: [[-1. 1. -1. -1. 1.]]
aggClassEst: [[-0.69314718 0.69314718 -0.69314718 -0.69314718 0.69314718]]
total error : 0.2
classEst: [[ 1. 1. -1. -1. -1.]]
aggClassEst: [[ 0.27980789 1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error : 0.2
classEst: [[ 1. 1. 1. 1. 1.]]
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
total error : 0.0
>>> adaClassify([0, 0], classifierArray)
[[-0.69314718]]
[[-1.66610226]]
[[-2.56198199]]
matrix([[-1.]])
>>> adaClassify([[5, 5],[0, 0]], classifierArray)
[[ 0.69314718]
[-0.69314718]]
[[ 1.66610226]
[-1.66610226]]
[[ 2.56198199]
[-2.56198199]]
matrix([[ 1.],
[-1.]])
度量分类器性能
ROC曲线下方的面积(AUC)越大,性能越好
完美分类器的AUC为1,随机猜测的AUC为0.5
# Classifier performance measure: the ROC curve.
# Plots the ROC curve and computes the area under it (AUC).
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a set of predictions and print its AUC.

    The curve is drawn from (1, 1) towards (0, 0), visiting the samples in
    ascending order of prediction strength.

    Args:
        predStrengths: prediction strength per sample. Expected to be a 2-D
            row (1 x m), since only the first row of its argsort is used.
        classLabels: sequence of class labels; 1.0 marks a positive sample.

    Raises:
        ZeroDivisionError: if ``classLabels`` contains no positive sample
            or no non-positive sample.
    """
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)  # current drawing cursor position
    ySum = 0.0  # sum of curve heights, one per horizontal step
    numPosClas = sum(array(classLabels) == 1.0)  # number of positive samples
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(classLabels) - numPosClas)
    sortedIndicies = predStrengths.argsort()  # ascending order
    plt.figure()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            # Positive sample: step down along the true-positive axis.
            delX, delY = 0, yStep
        else:
            # Negative sample: step left along the false-positive axis and
            # add the height of the rectangle this step sweeps out.
            delX, delY = xStep, 0
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Every rectangle has width xStep, so the area is ySum * xStep.
    # Single-argument print(...) is valid on Python 2 and Python 3.
    print("the Area under the Curve is:  %s" % (ySum * xStep,))
测试:
>>> from adaBoost import *
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray, aggClasEst = adaBoostTrainDS(dataMat, classLabels, 30)
>>> plotROC(aggClasEst.T, classLabels)
the Area under the Curve is: 1.0