机器学习 adaboost算法

# -- coding: utf-8 --
#kNN
from adaboost import*
from numpy import*
import operator
from os import listdir

# Driver: train an AdaBoost stump ensemble on the horse-colic training
# file, then report the number of misclassified test samples.
path = r'F:\file\python\py3test\venv\adaboost\horseColicTraining2.txt'
# NOTE(review): toy data set is loaded but never used below — kept for
# quick interactive experiments.
dataMat, classLabels = loadSimpData()
path1 = r'F:\file\python\py3test\venv\adaboost\horseColicTest2.txt'


# Train an ensemble of 10 decision stumps on the training data.
datArr, labelArr = loadDataSet(path)
classifierArray = abaBoostTrainDS(datArr, labelArr, 10)

# Classify the held-out test set and count errors.
testdat, label = loadDataSet(path1)
result = adaClassify(testdat, classifierArray)
# Fix: size the error vector from the actual number of test samples
# instead of the hard-coded 67, so a different test file still works.
errorArr = mat(ones((len(label), 1)))
print(errorArr[result != mat(label).T].sum())
 
# -- coding: utf-8 --
#adaboost

from numpy import*
import operator
from os import listdir
import matplotlib.pyplot as plt

def loadSimpData():
    """Return a tiny hand-built 2-D data set and its +1/-1 class labels."""
    points = [[1.0, 2.1],
              [2.0, 1.1],
              [1.3, 1.0],
              [1.0, 1.0],
              [2.0, 1.0]]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(points), labels

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding a single feature column.

    All rows start as +1; rows on the side selected by ``threshIneq``
    ('It' = at or below the threshold, anything else = above it) are
    flipped to -1. Returns an m x 1 column of +1/-1 predictions.
    """
    predictions = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'It':
        predictions[column <= threshVal] = -1.0
    else:
        predictions[column > threshVal] = -1.0
    return predictions

def buidlStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest D-weighted error.

    Scans every feature, a grid of candidate thresholds per feature, and
    both inequality directions, scoring each candidate by its error rate
    weighted by the sample-weight column vector D.

    Returns (stump dict with 'dim'/'thresh'/'ineq', minimum weighted
    error, m x 1 class estimates of the best stump).
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # lowest weighted error seen so far
    for dim in range(n):
        lo = dataMatrix[:, dim].min()
        hi = dataMatrix[:, dim].max()
        stepSize = (hi - lo) / numSteps
        # Thresholds run one step below the min up to one step past the max.
        for step in range(-1, int(numSteps) + 1):
            thresh = lo + float(step) * stepSize
            for inequal in ['It', 'gt']:
                predicted = stumpClassify(dataMatrix, dim, thresh, inequal)
                errIndicator = mat(ones((m, 1)))
                errIndicator[predicted == labelMat] = 0  # 0 where correct
                weightedError = D.T * errIndicator  # 1x1 weighted error rate
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predicted.copy()
                    bestStump['dim'] = dim
                    bestStump['thresh'] = thresh
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

def abaBoostTrainDS(dataArr, classLabels, numIT=40):
    """Train an AdaBoost ensemble of decision stumps.

    dataArr:     m x n training features.
    classLabels: m true labels (+1.0 / -1.0).
    numIT:       maximum number of boosting rounds.

    Returns the list of weak-classifier dicts (each carries 'dim',
    'thresh', 'ineq' and its vote weight 'alpha'). Also plots the ROC
    curve of the aggregate training scores as a side effect.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # uniform initial sample weights
    aggClassEst = mat(zeros((m, 1)))  # running alpha-weighted vote of all stumps
    for i in range(numIT):
        bestStump, error, classEst = buidlStump(dataArr, classLabels, D)
        print('D:', D.T)
        # Bug fix: a perfect stump gives error == 0, which made the
        # original log((1-error)/error) raise ZeroDivisionError.
        # max(error, 1e-16) keeps alpha finite in that case.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print('classEst:', classEst.T)
        # Reweight samples: shrink weights where the stump was right
        # (label * estimate = +1), grow them where it was wrong.
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalize to a probability distribution
        aggClassEst += alpha * classEst
        print('aggClassEst:', aggClassEst.T)
        # Ensemble error rate: sign of the aggregate vote vs. true labels.
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print('total error:', errorRate, '\n')
        if errorRate == 0.0:
            break  # training data perfectly separated; stop early
    plotRoc(aggClassEst.T, classLabels)
    return weakClassArr

def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained stump ensemble.

    Accumulates each stump's alpha-weighted vote and returns the sign
    (+1/-1) of the total per sample, as an m x 1 column.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for stump in classifierArr:
        vote = stumpClassify(dataMatrix,
                             stump['dim'],
                             stump['thresh'],
                             stump['ineq'])
        aggClassEst += stump['alpha'] * vote
        print(aggClassEst)  # running weighted vote after each stump
    return sign(aggClassEst)

def loadDataSet(fileName):
    """Parse a tab-separated data file into (features, labels).

    Each line holds numFeat-1 float features followed by one float
    label; the field count is taken from the first line.

    Fix: the original opened the file twice and never closed either
    handle (resource leak); this version reads once under `with` and
    tolerates an empty file.
    """
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return [], []
    numFeat = len(lines[0].split('\t'))  # feature count + 1 label column
    dataMat = []
    labelMat = []
    for line in lines:
        curline = line.strip().split('\t')
        dataMat.append([float(curline[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curline[-1]))
    return dataMat, labelMat

def plotRoc(predStrengths, classLabels):
    """Draw the ROC curve for classifier scores and print the AUC.

    predStrengths: 1 x N row matrix of raw classifier scores.
    classLabels:   true labels (+1.0 / -1.0).
    """
    numPos = sum(array(classLabels) == 1.0)  # number of positive samples
    yStep = 1 / float(numPos)  # true-positive-rate step per positive
    xStep = 1 / float(len(classLabels) - numPos)  # false-positive-rate step
    cursor = (1.0, 1.0)  # start top-right, walk toward the origin
    ySum = 0.0  # accumulated curve height, used for the AUC
    sortedIndicies = predStrengths.argsort()  # score order, ascending
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            dx, dy = 0, yStep  # true positive: step down in TPR
        else:
            dx, dy = xStep, 0  # false positive: step left in FPR
            ySum += cursor[1]  # add rectangle height toward the AUC
        ax.plot([cursor[0], cursor[0] - dx], [cursor[1], cursor[1] - dy], c='b')
        cursor = (cursor[0] - dx, cursor[1] - dy)
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal = random-guess baseline
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print('the Area Under the Curve is:', ySum * xStep)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值