利用AdaBoost元算法提高分类性能

adaBoost.py

#!/usr/bin/python  
# -*- coding: utf-8 -*-  
#coding=utf-8

from numpy import *

def loadSimpleData():
    """Return a tiny 5x2 toy data set and its +1/-1 class labels."""
    features = [[1.0, 2.1],
                [2.0, 1.1],
                [1.3, 1.0],
                [1.0, 1.0],
                [2.0, 1.0]]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(features), labels

# Classify samples by comparing a single feature against a threshold.
# Everything on one side of the threshold is labelled -1, the rest +1.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Decision-stump prediction on one feature column.

    dataMatrix : samples as rows; dimen : feature column index;
    threshVal : cut point; threshIneq : 'lt' marks values <= threshVal
    as -1, any other value marks values > threshVal as -1.
    Returns an (m, 1) array of +1/-1 predictions.
    """
    numSamples = shape(dataMatrix)[0]
    # Start with everything +1, then flip the side that fails the test.
    predictions = ones((numSamples, 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        predictions[column <= threshVal] = -1.0
    else:
        predictions[column > threshVal] = -1.0
    return predictions

# Iterate over every stump configuration (feature, threshold, inequality
# direction) stumpClassify can express and return the best single-level
# decision tree, where "best" means lowest error weighted by the vector D.
def buildStump(dataArr, classLabels, D):
    """Find the minimum weighted-error decision stump for the data.

    Parameters
    ----------
    dataArr : array-like, shape (m, n) -- training samples.
    classLabels : sequence of +1/-1 labels, length m.
    D : (m, 1) matrix of sample weights.

    Returns
    -------
    bestStump : dict with 'dim' (feature index), 'thresh', 'ineq'.
    minError : 1x1 matrix, weighted error of the best stump.
    bestClasEst : (m, 1) matrix, the best stump's predictions.
    """
    dataMat = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMat)
    numSteps = 10.0  # how many threshold steps to try per feature
    bestStump = {}   # best stump found so far under weights D
    bestClasEst = mat(zeros((m, 1)))
    minError = inf
    for i in range(n):  # every feature of the data set
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # every candidate threshold
            for inequal in ['lt', 'gt']:  # both inequality directions
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMat, i, threshVal, inequal)
                # Error vector: 1 where misclassified, 0 where correct.
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # This is where AdaBoost interacts with the classifier:
                # the error is weighted by the distribution D.
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    # BUG FIX: record the feature index i, not the step
                    # counter j -- the original stored j, producing
                    # impossible values such as 'dim': -1 and making
                    # adaClassify index the wrong feature column.
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

#基于单层决策树的AdaBoost训练过程
def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    weakClassArr = []
    m = shape(dataArr)[0]
    #D为概率分布向量,所有元素和为1。
    #开始时所有权重相等,然后增大错分类数据的权重,降低正确分类数据的权重
    D = mat(ones((m,1))/m)  
    aggClassEst = mat(zeros((m, 1))) #记录每个数据点的估计累计值
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        alpha = float(0.5 * log((1.0-error)/max(error, 1e-16))) #基于弱分类器的错误率计算权重值
        bestStump['alpha'] = alpha  #alpha加到字典中
        weakClassArr.append(bestStump)  #字典添加到列表中
        print "classEst: ", classEst.T
        expon = multiply(-1*alpha*mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  #更新D
        aggClassEst += alpha * classEst #更新累计类别估计值
        print "aggClassEst: ", aggClassEst.T
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum()/m
        print "total error :", errorRate, "\n"
        if errorRate == 0.0:    #如果训练错误率=0.0,退出循环
            break
    return weakClassArr, aggClassEst

测试:

>>> import adaBoost
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray = adaBoostTrainDS(dataMat, classLabels, 9)
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
total error : 0.2 

classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error : 0.2 

classEst:  [[ 1.  1.  1.  1.  1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
total error : 0.0 

>>> classifierArray
[{'dim': 3, 'ineq': 'lt', 'thresh': 1.3, 'alpha': 0.6931471805599453}, {'dim': 0, 'ineq': 'lt', 'thresh': 1.0, 'alpha': 0.9729550745276565}, {'dim': -1, 'ineq': 'lt', 'thresh': 0.90000000000000002, 'alpha': 0.8958797346140273}]

测试算法:基于AdaBoost的分类

#adaBoost分类函数
#输入为待分类样例,多个弱分类器促成的数组
def adaClassify(dataToClass, classifierArr):
    dataMat = mat(dataToClass)  
    m = shape(dataMat)[0]  #样例数
    aggClassEst = mat(zeros((m, 1)))  #累计错误率
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMat, classifierArr[i]['dim'],\
                                 classifierArr[i]['thresh'],\
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
        print aggClassEst
    return sign(aggClassEst)

测试:

>>> import adaBoost
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray = adaBoostTrainDS(dataMat, classLabels, 30)
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
total error : 0.2 

classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error : 0.2 

classEst:  [[ 1.  1.  1.  1.  1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
total error : 0.0 

>>> adaClassify([0, 0], classifierArray)
[[-0.69314718]]
[[-1.66610226]]
[[-2.56198199]]
matrix([[-1.]])

>>> adaClassify([[5, 5],[0, 0]], classifierArray)
[[ 0.69314718]
 [-0.69314718]]
[[ 1.66610226]
 [-1.66610226]]
[[ 2.56198199]
 [-2.56198199]]
matrix([[ 1.],
        [-1.]])

度量分类器性能
ROC曲线下方的面积(AUC)越大,性能越好
完美分类器的AUC为1,随机猜测的AUC为0.5

#Measure classifier performance: ROC curve.
#Plots the ROC curve and computes/prints the AUC.
#First argument is the classifier's prediction strength for each sample.
def plotROC(predStrengths, classLabels):
    """Draw the ROC curve for predStrengths and print the AUC.

    predStrengths : row matrix of classifier scores (indexed via
        .tolist()[0], so a 1xm numpy matrix is expected -- higher score
        means more confidently positive); classLabels : +1/-1 labels.
    Side effects: opens a matplotlib figure and prints the AUC.
    """
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)  # plotting cursor; start at the top-right corner
    ySum = 0.0  # running sum of rectangle heights for the AUC
    numPosClas = sum(array(classLabels) == 1.0)  # number of +1 samples
    yStep = 1/float(numPosClas)  # step down TPR axis per true positive
    xStep = 1/float(len(classLabels)- numPosClas)  # step along FPR axis per false positive
    sortedIndicies = predStrengths.argsort()  # indices by ascending score
    fig = plt.figure()
    ax = plt.subplot(111)
    # Walk samples from weakest to strongest score, moving the cursor
    # one step per sample: down for a positive, left for a negative.
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]  # accumulate the height only on x-steps (AUC rectangles)
        ax.plot([cur[0],cur[0]-delX], [cur[1], cur[1]-delY], c='b')
        cur = (cur[0]-delX, cur[1]-delY)
    ax.plot([0,1], [0,1], 'b--')  # diagonal = random-guess baseline
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection Sytem')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print "the Area under the Curve is: ", ySum * xStep

测试:

>>> import adaBoost
>>> dataMat, classLabels = loadSimpleData()
>>> classifierArray, aggClasEst = adaBoostTrainDS(dataMat, classLabels, 30)
>>> plotROC(aggClasEst.T, classLabels)
the Area under the Curve is:  1.0

这里写图片描述

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值