本文参考自《机器学习实战》
注意：其中 adaBoostTrainDS() 函数的第二个返回值要改为 aggClassEst.T（转置为行向量），否则 plotROC() 无法画出 ROC 曲线。
#coding=utf-8
from numpy import *

# AdaBoost built on decision stumps (single-level decision trees).
#
# The code deliberately runs on both Python 2 and Python 3: every print uses
# the single-argument parenthesised form, and `asmatrix` is used instead of
# its alias `mat`, which newer NumPy releases no longer provide.


def loadSimpData():
    """Return a tiny toy data set: a 5x2 feature matrix and its +1/-1 labels."""
    dataMat = matrix([[1.0, 2.1],
                      [2.0, 1.1],
                      [1.3, 1.0],
                      [1.0, 1.0],
                      [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels


def loadDataSet(filename):
    """Load a tab-separated numeric data file.

    The number of columns is taken from the first non-blank line.  On every
    row all columns but the last are read as float features and the last
    field is read as the float label.  Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        (dataMat, labelMat): a list of feature rows (lists of floats) and the
        list of labels, in file order.

    Raises:
        ValueError: if a field cannot be parsed as a float.
        IndexError: if a row has fewer fields than the first row.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # One pass, and the handle is closed even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue  # blank line (e.g. trailing newline at end of file)
            if numFeat is None:
                numFeat = len(fields)
            # Index explicitly so a short row raises instead of silently
            # producing a ragged data set.
            dataMat.append([float(fields[i]) for i in range(numFeat - 1)])
            labelMat.append(float(fields[-1]))
    return dataMat, labelMat


def stumpClassify(dataMAtrix, dimen, threshVal, threshIneq):
    """Classify every row with a single threshold test on one feature.

    Args:
        dataMAtrix: data set, one sample per row.
        dimen: index of the column (feature) to test.
        threshVal: threshold the feature value is compared against.
        threshIneq: 'lt' labels rows with value <= threshVal as -1; any other
            value labels rows with value > threshVal as -1.

    Returns:
        An (m, 1) array of +1.0 / -1.0 predictions.
    """
    retArray = ones((shape(dataMAtrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMAtrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMAtrix[:, dimen] > threshVal] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned over evenly spaced thresholds (one step below
    the column minimum up to the column maximum) with both inequality
    directions.  On ties the first candidate found is kept.

    Args:
        dataArr: data set, one sample per row.
        classLabels: sequence of +1/-1 labels, one per sample.
        D: (m, 1) matrix of sample weights.

    Returns:
        (bestStump, minError, bestClassEnt): a dict with keys 'dim',
        'thresh' and 'ineq'; the weighted error of that stump (a 1x1
        matrix); and the stump's (m, 1) predictions.
    """
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T  # column vector, aligned with predictions
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEnt = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClassEnt = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEnt


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: data set, one sample per row.
        classLabels: sequence of +1/-1 labels, one per sample.
        numIt: maximum number of boosting rounds; training stops early once
            the ensemble's training error reaches zero.

    Returns:
        (weakClassArr, aggClassEst.T): the list of stump dicts (keys 'dim',
        'thresh', 'ineq', 'alpha') and the ensemble's weighted vote for each
        training sample as a 1 x m row matrix.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    labelCol = asmatrix(classLabels).T  # loop-invariant, built once
    D = asmatrix(ones((m, 1)) / m)      # start from uniform sample weights
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back a 1x1 matrix; pull out the plain scalar.
        errorValue = float(asarray(error).ravel()[0])
        # Floor the denominator so a perfect stump does not divide by zero.
        # (A conditional is used instead of max() because `from numpy import *`
        # may or may not shadow the builtin depending on the NumPy version.)
        denominator = errorValue if errorValue > 1e-16 else 1e-16
        alpha = float(0.5 * log((1.0 - errorValue) / denominator))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Re-weight: misclassified samples gain weight, correct ones lose it.
        expon = multiply(-1 * alpha * labelCol, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: %s \n" % errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst.T


# Quick check on the toy data:
#   dataMat, classLabels = loadSimpData()
#   D = asmatrix(ones((5, 1)) / 5)
#   bestStump, minError, bestClassEnt = buildStump(dataMat, classLabels, D)
#   classifierArray, aggClassEst = adaBoostTrainDS(dataMat, classLabels, 9)


def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained ensemble of stumps.

    Args:
        datToClass: one sample or a sequence of samples (one per row).
        classifierArr: list of stump dicts as produced by adaBoostTrainDS.

    Returns:
        An (m, 1) matrix holding the sign of the alpha-weighted vote.
    """
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for classifier in classifierArr:
        classEst = stumpClassify(dataMatrix,
                                 classifier['dim'],
                                 classifier['thresh'],
                                 classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
    return sign(aggClassEst)


def plotROC(predStrengths, classLabels):
    """Plot the ROC curve and print the area under it.

    The curve is traced from (1, 1) towards (0, 0), visiting the samples in
    order of increasing score: a positive sample moves one step down, any
    other sample moves one step left.

    Args:
        predStrengths: one score per sample.  A 1 x m row matrix, an m x 1
            column matrix or a flat sequence are all accepted.
        classLabels: labels, one per sample; 1.0 marks the positive class.

    Raises:
        ZeroDivisionError: if the labels are all positive or all non-positive.
    """
    import matplotlib.pyplot as plt

    labels = asarray(classLabels).ravel()
    numPosClas = int((labels == 1.0).sum())
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(labels.size - numPosClas)
    # Flatten first: argsort on an m x 1 column would sort along the length-1
    # axis and return all zeros, giving a meaningless curve.
    sortedIndicies = asarray(predStrengths).ravel().argsort()

    cur = (1.0, 1.0)
    ySum = 0.0  # sum of curve heights at every horizontal step
    xs = [cur[0]]
    ys = [cur[1]]
    for index in sortedIndicies.tolist():
        if labels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]
        cur = (cur[0] - delX, cur[1] - delY)
        xs.append(cur[0])
        ys.append(cur[1])

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    ax.plot(xs, ys, c='b')  # one polyline instead of one plot call per sample
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print("the Area under the Curve is : %s" % (ySum * xStep))
# Driver: train a 10-round AdaBoost ensemble on the horse-colic training set,
# report the number of misclassified test samples, then plot the training ROC.
dataArr, labelArr = loadDataSet('horseColicTraining2.txt')
classifierArray, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 10)

testArr, testLabelArr = loadDataSet('horseColicTest2.txt')
prediction10 = adaClassify(testArr, classifierArray)

# One entry per test sample; summing the entries selected by the mismatch mask
# counts the errors.  Sized from the data rather than a hard-coded 67 so the
# script works with a test file of any length.
errArr = asmatrix(ones((len(testLabelArr), 1)))
print(errArr[prediction10 != asmatrix(testLabelArr).T].sum())

plotROC(aggClassEst, labelArr)