[机器学习][源码]机器学习实战ch7 AdaBoost

最新推荐文章于 2022-09-18 10:18:44 发布

Lucky和小白

最新推荐文章于 2022-09-18 10:18:44 发布

阅读量420

点赞数

分类专栏：机器学习 python

本文链接：https://blog.csdn.net/zhouxinxin0202/article/details/79502642

版权

机器学习同时被 2 个专栏收录

35 篇文章 6 订阅

订阅专栏

python

13 篇文章 0 订阅

订阅专栏

把代码保存于此，python3实现，详解就参考《机器学习实战》（Peter Harrington）啦...

boost.py :

from numpy import *

def loadSimpData():
    """Return the five-sample toy dataset used to try out the boosting code.

    Returns:
        tuple: ``(datMat, classLabels)``. ``datMat`` is a 5x2 numpy
        ``matrix`` with one sample per row; ``classLabels`` is a list of
        five labels, each either 1.0 or -1.0.
    """
    samples = [
        [1., 2.1],
        [2., 1.1],
        [1.3, 1.],
        [1., 1.],
        [2., 1.],
    ]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(samples), labels

#7-1 单层决策树生成函数
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample with a single-feature threshold test.

    Args:
        dataMatrix: Samples, one per row.
        dimen: Index of the column (feature) to compare.
        threshVal: Threshold the selected column is compared against.
        threshIneq: ``'lt'`` labels samples whose value is ``<= threshVal``
            as -1.0; any other value labels samples whose value is
            ``> threshVal`` as -1.0.

    Returns:
        An (m, 1) array of 1.0 / -1.0 predictions, m being the row count.
    """
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negative = column <= threshVal
    else:
        negative = column > threshVal

    # Start with every sample predicted +1, then flip the selected ones.
    labels = ones((shape(dataMatrix)[0], 1))
    labels[negative] = -1.0
    return labels

def buildStump(dataArr, classLabels, D):
    """Search for the decision stump with the lowest weighted error.

    For every feature, thresholds are stepped from one step below the
    feature's minimum up to its maximum (10 steps across the range), and
    both comparison directions are tried at each threshold. The first
    candidate reaching the lowest weighted error is kept.

    Args:
        dataArr: Samples, one per row; converted with ``mat``.
        classLabels: One label per sample; converted to a column matrix.
        D: Per-sample weights. Its transpose is multiplied with the (m, 1)
            0/1 error column, so it is presumably an (m, 1) matrix —
            confirm against the caller.

    Returns:
        tuple: ``(bestStump, minError, bestClasEst)`` where ``bestStump``
        is a dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``,
        ``minError`` is the weighted error of that stump (the result of
        ``D.T * errors``), and ``bestClasEst`` holds the stump's
        predictions for every sample.
    """
    samples = mat(dataArr)
    targets = mat(classLabels).T
    m, n = shape(samples)  # samples, features
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf

    for feature in range(n):
        column = samples[:, feature]
        lowest = column.min()
        stepSize = (column.max() - lowest) / numSteps
        for step in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the direction being tried.
            threshVal = lowest + float(step) * stepSize
            for direction in ('lt', 'gt'):
                predicted = stumpClassify(samples, feature, threshVal, direction)
                # 1 where the prediction is wrong, 0 where it is right.
                misses = mat(ones((m, 1)))
                misses[predicted == targets] = 0
                weightedError = D.T * misses

                if not weightedError < minError:
                    continue
                minError = weightedError
                bestClasEst = predicted.copy()
                bestStump['dim'] = feature
                bestStump['thresh'] = threshVal
                bestStump['ineq'] = direction

    return bestStump, minError, bestClasEst
'''
d:
cd pythonwp
cd ch07
python
import boost
from importlib import reload
reload(boost)
datMat,classLabels=boost.loadSimpData()
from numpy import *
D=mat(ones((5,1))/5) #初始权重集合
boost.buildStump(datMat,classLabels,D)
'''

#7-2 基于单层决策树的AdaBoost训练过程
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump for the current sample weights, gives
    it a coefficient ``alpha`` derived from its weighted error, reweights
    the samples, and adds the stump's weighted vote to the running
    estimate. Training stops early once the running estimate classifies
    every training sample correctly.

    Args:
        dataArr: Training samples, one per row.
        classLabels: One label per sample.
        numIt: Maximum number of boosting rounds.

    Returns:
        tuple: ``(weakClassArr, aggClassEst)``. ``weakClassArr`` is the
        list of stump dicts (keys ``'dim'``, ``'thresh'``, ``'ineq'``,
        ``'alpha'``); ``aggClassEst`` is the (m, 1) matrix of accumulated
        weighted votes on the training samples. Callers that only need the
        classifiers must unpack the tuple.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    labelCol = mat(classLabels).T  # loop-invariant, so built once
    D = mat(ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = mat(zeros((m, 1)))

    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands the error back as a 1x1 matrix. Extract the
        # scalar explicitly: implicit float() conversion of arrays with
        # ndim > 0 is deprecated in NumPy >= 1.25.
        err = float(asarray(error).item())
        # The 1e-16 floor avoids division by zero for a perfect stump.
        alpha = float(0.5 * log((1.0 - err) / max(err, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)

        # Increase the weight of misclassified samples, decrease the rest,
        # then renormalize so the weights sum to one.
        expon = multiply(-1 * alpha * labelCol, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()

        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
'''
classifierArray,aggClassEst=boost.adaBoostTrainDS(datMat,classLabels,9)
'''

#7-3 adaBoost分类函数
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        datToClass: Samples to classify, one per row; converted with ``mat``.
        classifierArr: Sequence of stump dicts, each providing ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        The sign of the accumulated weighted votes, one row per sample.

    Side effects:
        Prints the running vote total after each stump is applied.
    """
    samples = mat(datToClass)
    aggClassEst = mat(zeros((shape(samples)[0], 1)))
    for stump in classifierArr:
        votes = stumpClassify(samples, stump['dim'], stump['thresh'], stump['ineq'])
        # Weight this stump's vote by its alpha before accumulating.
        aggClassEst += stump['alpha'] * votes
        print(aggClassEst)
    return sign(aggClassEst)
'''
reload(boost)
datArr,labelArr=boost.loadSimpData()
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,30) #train
boost.adaClassify( [[5,5],[0,0]],classifierArr) #test
'''

# 7-4 自适应数据加载函数
def loadDataSet(fileName):
    """Load a tab-separated numeric dataset whose last column is the label.

    The number of columns is taken from the first non-blank line. For each
    line, the first ``columns - 1`` fields become the feature row and the
    last field becomes the label. Blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        tuple: ``(dataMat, labelMat)`` where ``dataMat`` is a list of
        feature rows (lists of floats) and ``labelMat`` is a list of float
        labels, in file order.

    Raises:
        ValueError: If a field cannot be parsed as a float.
        IndexError: If a line has fewer fields than the first line.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single managed handle: the file is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            if numFeat is None:
                numFeat = len(line.split('\t'))
            curLine = line.strip().split('\t')
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
'''
reload(boost)
datArr,labelArr=boost.loadDataSet('horseColicTraining2.txt')
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,10) #train the model
testArr,testLabelArr=boost.loadDataSet('horseColicTest2.txt')
prediction10=boost.adaClassify(testArr,classifierArr)
errArr=mat(ones((67,1)))
error=errArr[prediction10!=mat(testLabelArr).T].sum()
errorRate=error/67 #错误率
print(errorRate)
'''

#7-5 ROC+AUC
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a set of prediction strengths and print its AUC.

    The curve is drawn from (1, 1) towards (0, 0): samples are visited in
    ascending order of prediction strength, stepping down for each sample
    labelled 1.0 and stepping left for every other sample.

    Args:
        predStrengths: Prediction strengths. ``argsort().tolist()[0]`` is
            used to obtain the visiting order, so this is presumably a
            1 x m matrix (e.g. the transposed vote totals from training) —
            confirm against the caller.
        classLabels: True labels, indexable by sample position.

    Side effects:
        Opens a matplotlib figure, shows it, and prints the area under
        the curve.
    """
    import matplotlib.pyplot as plt

    numPosClas = sum(array(classLabels) == 1.0)  # samples labelled 1.0
    yStep = 1 / float(numPosClas)  # vertical step: 1 / number of positives
    xStep = 1 / float(len(classLabels) - numPosClas)  # horizontal step: 1 / number of negatives
    order = predStrengths.argsort()  # sample indices, weakest prediction first

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)

    curX, curY = 1.0, 1.0  # cursor starts at the top-right corner
    ySum = 0.0  # summed heights of the unit-width rectangles under the curve
    for index in order.tolist()[0]:
        if classLabels[index] == 1.0:
            # Positive sample: move down.
            nextX, nextY = curX, curY - yStep
        else:
            # Negative sample: move left and record the current height.
            nextX, nextY = curX - xStep, curY
            ySum += curY
        ax.plot([curX, nextX], [curY, nextY], c='b')
        curX, curY = nextX, nextY

    ax.plot([0, 1], [0, 1], 'b--')  # dashed diagonal for reference
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Every rectangle has width xStep, so the area is xStep * sum of heights.
    print("the Area Under the Curve is: ",ySum*xStep)
'''
from importlib import reload
reload(boost)
datArr,labelArr=boost.loadDataSet('horseColicTraining2.txt')
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,50) #train the model
boost.plotROC(aggClassEst.T,labelArr)
'''