An interesting theoretical question is: can we take a weak classifier and use multiple
instances of it to create a strong classifier? By “weak” I mean the classifier does a better
job than random guessing, but not by much. That is to say, its error rate is below 50%
in the two-class case, but only slightly. The “strong” classifier will have a much lower error
rate. The AdaBoost algorithm was born out of this question.
Pros: Low generalization error, easy to code, works with most classifiers, no parameters
to adjust
Cons: Sensitive to outliers
Works with: Numeric values, nominal values
Train: improve the classifier by focusing on the errors
AdaBoost (xi is a sample, i.e. a vector of features; yi is its label; m is the number of samples)
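In symbols, the quantities the training loop keeps updating are summarized below (these are the same formulas used in the code that follows; \varepsilon_t is the D-weighted error of the stump h_t picked at iteration t, and H is the combined classifier):

\varepsilon_t = \sum_i D_i \,[\, h_t(x_i) \ne y_i \,], \qquad \alpha_t = \frac{1}{2}\ln\frac{1-\varepsilon_t}{\varepsilon_t}

D_i \leftarrow \frac{D_i \, e^{-\alpha_t y_i h_t(x_i)}}{\sum_j D_j \, e^{-\alpha_t y_j h_t(x_j)}}, \qquad H(x) = \operatorname{sign}\Bigl(\sum_t \alpha_t h_t(x)\Bigr)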
def stumpClassify(dataMatrix, whichFeaCol, thresholdValue, thresholdInequality):
    # start with every sample (one per row) predicted as +1
    predictedLabelArr = np.ones((np.shape(dataMatrix)[0], 1))
    if thresholdInequality == 'less than':
        # samples at or below the threshold get labelled -1
        predictedLabelArr[dataMatrix[:, whichFeaCol] <= thresholdValue] = -1.0
    else:
        # samples above the threshold get labelled -1
        predictedLabelArr[dataMatrix[:, whichFeaCol] > thresholdValue] = -1.0
    return predictedLabelArr
############### choosing the thresholdValue ###############
# (walkthrough fragment of buildStump; dataMat, labelMat, D, sampleRows,
#  featureCols and minError are all defined in the full listing further down)
numSteps = 10.0     # number of steps to take across each feature's range
for eachFeaCol in range(featureCols):
    ###########################################################
    # reference: the toy data set used in this walkthrough
    # dataMat = np.matrix([
    #     [1. , 2.1],
    #     [2. , 1.1],
    #     [1.3, 1. ],
    #     [1. , 1. ],
    #     [2. , 1. ]
    # ])
    ###########################################################
    # calculate the minimum and maximum of this feature to see
    # how large your step size should be
    rangeMin = dataMat[:, eachFeaCol].min()
    rangeMax = dataMat[:, eachFeaCol].max()
    stepSize = (rangeMax - rangeMin) / numSteps
    # it makes sense to also set the threshold outside the extremes of the range;
    # there are two extra steps outside the range: j = -1 and j = numSteps
    for j in range(-1, int(numSteps) + 1):
        for inequal in ['less than', 'greater than']:
            # threshold used to split the data points (the samples)
            thresholdValue = rangeMin + float(j) * stepSize
            predictedLabelArr = stumpClassify(dataMat, eachFeaCol,
                                              thresholdValue, inequal)
            # column vector: 1 where the prediction is wrong, 0 where it is right
            errorMat = np.mat(np.ones((sampleRows, 1)))
            errorMat[predictedLabelArr == labelMat] = 0
            # D = np.mat(np.ones((sampleRows, 1)) / sampleRows)   # initially 1/m
            weightedErrorMat = D.T * errorMat                     # D-weighted error
            print("split: dimFeature %d, thresh %.2f, thresh ineqal: %s, "
                  "the weighted error is %.3f" %
                  (eachFeaCol, thresholdValue, inequal, float(weightedErrorMat)))
            if weightedErrorMat < minError:
                minError = weightedErrorMat
################### process output ###################
split: dimFeature 0, thresh 0.90, thresh ineqal: less than, the weighted error is 0.400
split: dimFeature 0, thresh 0.90, thresh ineqal: greater than, the weighted error is 0.600
split: dimFeature 0, thresh 1.00, thresh ineqal: less than, the weighted error is 0.400
split: dimFeature 0, thresh 1.00, thresh ineqal: greater than, the weighted error is 0.600
split: dimFeature 0, thresh 1.10, thresh ineqal: less than, the weighted error is 0.400
split: dimFeature 0, thresh 1.10, thresh ineqal: greater than, the weighted error is 0.600
split: dimFeature 0, thresh 1.20, thresh ineqal: less than, the weighted error is 0.400
split: dimFeature 0, thresh 1.20, thresh ineqal: greater than, the weighted error is 0.600
split: dimFeature 0, thresh 1.30, thresh ineqal: less than, the weighted error is 0.200
split: dimFeature 0, thresh 1.30, thresh ineqal: greater than, the weighted error is 0.800
split: dimFeature 0, thresh 1.40, thresh ineqal: less than, the weighted error is 0.200
split: dimFeature 0, thresh 1.40, thresh ineqal: greater than, the weighted error is 0.800
split: dimFeature 0, thresh 1.50, thresh ineqal: less than, the weighted error is 0.200
.......................
return bestStumpDict, minError, bestClassEstimated
({'dimFeature': 0, 'thresh': 1.3, 'ineq': 'less than'},
matrix([[0.2]]),
array([[-1.],
[ 1.],
[-1.],
[-1.],
[ 1.]]))
###########################################################
###
# The statement max(error, 1e-16) is there to make sure you don't get a
# divide-by-zero error in the case where there is no error
alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
# expon is the column vector -alpha * y_i * h(x_i), one entry per sample
expon = np.multiply(-1 * alpha * np.mat(classLabelList).T, bestClassEstimated)
D = np.multiply(D, np.exp(expon))
D = D / D.sum()
alpha is defined as \alpha = \frac{1}{2}\ln\frac{1-\varepsilon}{\varepsilon}, where \varepsilon is the weighted error of the chosen stump (the max(error, 1e-16) in the code only guards against division by zero).
After you calculate \alpha, you can update the weight vector D so that the examples that are
correctly classified will decrease in weight and the misclassified examples will increase
in weight. D is given by
D_i^{(t+1)} = \frac{D_i^{(t)}\, e^{-\alpha}}{\mathrm{Sum}(D)} if the example is correctly classified, and
D_i^{(t+1)} = \frac{D_i^{(t)}\, e^{\alpha}}{\mathrm{Sum}(D)} if it is misclassified,
or, in one expression, D_i^{(t+1)} = D_i^{(t)}\exp(-\alpha\, y_i\, h(x_i)) / \mathrm{Sum}(D), which is exactly what the three lines of code above compute.
After D is calculated, AdaBoost starts on the next iteration. The AdaBoost algorithm
repeats the training and weight-adjusting iterations until the training error is 0
or until the number of weak classifiers reaches a user-defined value.
# aggClassEst gives you the aggregate estimate of the class for every data point
aggClassEst += alpha * bestClassEstimated        # H(x) = sign( sum_t alpha_t * h_t(x) )
print("aggClassEst: ", aggClassEst.T)
########################################
print("np.sign(aggClassEst) != np.mat(classLabelList).T : \n",
      np.sign(aggClassEst) != np.mat(classLabelList).T)
# 1 where the aggregate prediction disagrees with the true label, 0 elsewhere
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabelList).T,
                        np.ones((sampleRows, 1)))
print("aggErrors: ", aggErrors)
########################################
errorRate = aggErrors.sum() / sampleRows
print("total error: ", errorRate, "\n")
if errorRate == 0.0:
    break
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
np.sign(aggClassEst) != np.mat(classLabelList).T :
[[False]
[False]
[False]
[False]
[False]]
aggErrors: [[0.]
[0.]
[0.]
[0.]
[0.]]
total error: 0.0
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 29 04:51:13 2019
@author: LlQ
"""
# np.mat(data)    == np.matrix(data, copy=False)  # reference (no copy)
# np.matrix(data) == np.matrix(data, copy=True)   # copy (the default)
import numpy as np
import matplotlib.pyplot as plt
def loadSimpleData():
    dataMat = np.matrix([
        [1. , 2.1],
        [2. , 1.1],
        [1.3, 1. ],
        [1. , 1. ],
        [2. , 1. ]
    ])
    classLabelList = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabelList
# performs a threshold comparison to "classify data"
def stumpClassify(dataMatrix, whichFeaCol, thresholdValue, thresholdInequality):
    # start with every sample (one per row) predicted as +1
    predictedLabelArr = np.ones((np.shape(dataMatrix)[0], 1))
    if thresholdInequality == 'less than':
        predictedLabelArr[dataMatrix[:, whichFeaCol] <= thresholdValue] = -1.0
    else:
        predictedLabelArr[dataMatrix[:, whichFeaCol] > thresholdValue] = -1.0
    return predictedLabelArr
def buildStump(dataMat, classLabelList, D):
    # np.mat(data) == np.matrix(data, copy=False)   # reference, no copy
    dataMat = np.mat(dataMat)
    labelMat = np.mat(classLabelList).T              # column vector of labels
    sampleRows, featureCols = np.shape(dataMat)
    numSteps = 10.0                                  # steps across each feature's range
    # stores the classifier information corresponding to the best choice
    # of a decision stump given this weight vector D
    bestStumpDict = {}
    bestClassEstimated = np.mat(np.zeros((sampleRows, 1)))
    minError = np.inf
    # go over all the features in the dataset
    for eachFeaCol in range(featureCols):
        rangeMin = dataMat[:, eachFeaCol].min()
        rangeMax = dataMat[:, eachFeaCol].max()
        # calculate the minimum and maximum to see how large
        # your step size should be
        stepSize = (rangeMax - rangeMin) / numSteps
        # it makes sense to also set the threshold outside the extremes of the range;
        # there are two extra steps outside the range: j = -1 and j = numSteps
        for j in range(-1, int(numSteps) + 1):
            for inequal in ['less than', 'greater than']:
                # threshold used to split the data points (the samples)
                thresholdValue = rangeMin + float(j) * stepSize
                predictedLabelArr = stumpClassify(dataMat, eachFeaCol,
                                                  thresholdValue, inequal)
                # column vector: 1 where the prediction is wrong, 0 where it is right
                errorMat = np.mat(np.ones((sampleRows, 1)))
                errorMat[predictedLabelArr == labelMat] = 0
                weightedErrorMat = D.T * errorMat                 # D-weighted error
                print("split: dimFeature %d, thresh %.2f, thresh ineqal: %s, "
                      "the weighted error is %.3f" %
                      (eachFeaCol, thresholdValue, inequal, float(weightedErrorMat)))
                if weightedErrorMat < minError:
                    minError = weightedErrorMat
                    bestClassEstimated = predictedLabelArr.copy()
                    bestStumpDict['dimFeature'] = eachFeaCol
                    bestStumpDict['thresh'] = thresholdValue
                    bestStumpDict['ineq'] = inequal
    return bestStumpDict, minError, bestClassEstimated
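# A quick check of buildStump (a minimal sketch, e.g. from an interactive session),
# using the toy data and uniform weights D = 1/m:
#   dataMat, classLabelList = loadSimpleData()
#   D = np.mat(np.ones((5, 1)) / 5)
#   buildStump(dataMat, classLabelList, D)
# expected best stump (matching the output shown earlier):
#   ({'dimFeature': 0, 'thresh': 1.3, 'ineq': 'less than'}, matrix([[0.2]]), ...)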
# The DS at the end of the function names stands for decision stump.
def adaBoostTrainsDS(dataMat, classLabelList, numIterations=40):
    weakClassList = []
    sampleRows = np.shape(dataMat)[0]
    D = np.mat(np.ones((sampleRows, 1)) / sampleRows)   # start with equal weights, 1/m each
    aggClassEst = np.mat(np.zeros((sampleRows, 1)))     # column vector of zeros
    for i in range(numIterations):
        bestStumpDict, error, bestClassEstimated = buildStump(dataMat, classLabelList, D)
        print("D:", D.T)
        # The statement max(error, 1e-16) is there to make sure you don't get a
        # divide-by-zero error in the case where there is no error
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStumpDict['alpha'] = alpha
        weakClassList.append(bestStumpDict)
        print("classEst: ", bestClassEstimated.T)
        # -alpha * y_i * h(x_i) for every sample (both factors are column vectors)
        expon = np.multiply(-1 * alpha * np.mat(classLabelList).T, bestClassEstimated)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        # aggClassEst gives you the aggregate estimate of the class for every data point
        aggClassEst += alpha * bestClassEstimated        # H(x) = sign( sum_t alpha_t * h_t(x) )
        print("aggClassEst: ", aggClassEst.T)
        ########################################
        print("np.sign(aggClassEst) != np.mat(classLabelList).T : \n",
              np.sign(aggClassEst) != np.mat(classLabelList).T)
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabelList).T,
                                np.ones((sampleRows, 1)))
        print("aggErrors: ", aggErrors)
        ########################################
        errorRate = aggErrors.sum() / sampleRows
        print("total error: ", errorRate, "\n")
        if errorRate == 0.0:
            break
    return weakClassList
...
...
...
Let’s look at the intermediate output. Remember, our class labels were [1.0, 1.0, -1.0,
-1.0, 1.0]. In the first iteration, all the D values were equal; then only one value, the first
data point, was misclassified. So, in the next iteration, the D vector puts 0.5 weight on
the first data point because it was misclassified previously. You can read the overall class by
looking at the sign of aggClassEst. After the second iteration, you can see that the first
data point is correctly classified, but the last data point is now wrong. The D value now
becomes 0.5 for the last element, and the other values in the D vector are much smaller.
Finally, in the third iteration the sign of all the values in aggClassEst matches your class
labels and the training error becomes 0, so you can quit.
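As a sanity check on the first iteration, the D update can be worked out by hand (a small sketch; the numbers follow from the output above, where the first stump has weighted error 0.2 and misclassifies only the first point):

import numpy as np

eps = 0.2                                     # weighted error of the first stump
alpha = 0.5 * np.log((1 - eps) / eps)         # 0.5 * ln(4) ≈ 0.693
D = np.array([0.2, 0.2, 0.2, 0.2, 0.2])       # initial weights, 1/m each
y = np.array([ 1.0, 1.0, -1.0, -1.0, 1.0])    # true labels
h = np.array([-1.0, 1.0, -1.0, -1.0, 1.0])    # first stump's predictions
D = D * np.exp(-alpha * y * h)                # raise the wrong point, lower the rest
print(D / D.sum())                            # ≈ [0.5, 0.125, 0.125, 0.125, 0.125]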
This array contains three dictionaries, which contain all of the information you’ll need
for classification. You’ve now built a classifier, and the classifier will reduce the training
error to 0 if you wish.
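For reference, a minimal run that produces this three-dictionary array on the toy data (alpha of the first stump is 0.5 * ln(0.8 / 0.2) ≈ 0.69; the later dictionaries depend on which stumps get picked in iterations two and three):

dataMat, classLabelList = loadSimpleData()
weakClassList = adaBoostTrainsDS(dataMat, classLabelList, numIterations=9)
print(weakClassList)
# a list of three stump dictionaries, the first one roughly
# {'dimFeature': 0, 'thresh': 1.3, 'ineq': 'less than', 'alpha': 0.693...}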
How does the test error look? In order to see the test error, you
need to write some code for classification. The next section will discuss classification.
If you look at the Test Error column in table 7.1, you’ll see that the test error reaches
a minimum and then starts to increase. This sort of behavior is known as overfitting. It
has been claimed in the literature that for well-behaved datasets the test error for AdaBoost
reaches a plateau and won't increase with more classifiers. Perhaps this dataset isn't
"well behaved." It did start off with 30% missing values, and the assumptions made for
the missing values were valid for logistic regression, but they may not work for a decision
tree. If you went back to the dataset and replaced all the 0s with other values, perhaps
the average for a given class, would you have better performance? A sketch of that idea follows.
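A minimal sketch of that idea, assuming 0.0 is the placeholder used for the missing values in the horse colic data (fillMissingWithClassMean is a hypothetical helper, not part of the book's code):

import numpy as np

def fillMissingWithClassMean(dataArr, labelArr):
    # replace the 0.0 placeholders with the per-class mean of the
    # observed (non-zero) values of each feature
    X = np.array(dataArr, dtype=float)
    y = np.array(labelArr)
    for label in np.unique(y):
        rows = (y == label)
        for col in range(X.shape[1]):
            observed = X[rows, col][X[rows, col] != 0.0]
            if observed.size > 0:
                colVals = X[rows, col]
                X[rows, col] = np.where(colVals == 0.0, observed.mean(), colVals)
    return X.tolist()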
AdaBoost and support vector machines are considered by many to be the most powerful
algorithms in supervised learning. You can draw a number of similarities between
the two. You can think of the weak learner in AdaBoost as a kernel in support vector
machines. You can also write the AdaBoost algorithm in terms of maximizing a minimum
margin. The way these margins are calculated is different and can lead to different
results, especially with higher dimensions.
# The DS at the end of the function names stands for decision stump.
# We use this function to get an array (list) of weak classifiers.
def adaBoostTrainsDS(dataMat, classLabelList, numIterations=40):
    weakClassifierList = []
    sampleRows = np.shape(dataMat)[0]
    D = np.mat(np.ones((sampleRows, 1)) / sampleRows)   # start with equal weights, 1/m each
    aggClassEst = np.mat(np.zeros((sampleRows, 1)))     # column vector of zeros
    for i in range(numIterations):
        bestStumpDict, error, bestClassEstimated = buildStump(dataMat, classLabelList, D)
        print("D:", D.T)
        # The statement max(error, 1e-16) is there to make sure you don't get a
        # divide-by-zero error in the case where there is no error
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStumpDict['alpha'] = alpha
        weakClassifierList.append(bestStumpDict)
        print("classEst: ", bestClassEstimated.T)
        # -alpha * y_i * h(x_i) for every sample (both factors are column vectors)
        expon = np.multiply(-1 * alpha * np.mat(classLabelList).T, bestClassEstimated)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        # aggClassEst is the strength of the classifier's predictions:
        # the aggregate estimate of the class for every data point
        aggClassEst += alpha * bestClassEstimated        # H(x) = sign( sum_t alpha_t * h_t(x) )
        print("aggClassEst: ", aggClassEst.T)
        ########################################
        print("np.sign(aggClassEst) != np.mat(classLabelList).T : \n",
              np.sign(aggClassEst) != np.mat(classLabelList).T)
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabelList).T,
                                np.ones((sampleRows, 1)))
        print("aggErrors: ", aggErrors)
        ########################################
        errorRate = aggErrors.sum() / sampleRows
        print("total error: ", errorRate, "\n")
        if errorRate == 0.0:
            break
    # return weakClassifierList   # earlier version returned only the list
    return weakClassifierList, aggClassEst
## aggClassEst: the strength of the classifier's predictions
def adaClassify(datToClass, classifierArr):
    dataMatrix = np.mat(datToClass)
    rows = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((rows, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dimFeature'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
        print(aggClassEst)
    return np.sign(aggClassEst)
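# A usage sketch (e.g. from an interactive session), following the book's toy example;
# note that this version of adaBoostTrainsDS returns a tuple, so the classifier
# list is its first element:
#   dataMat, classLabelList = loadSimpleData()
#   classifierArr, aggClassEst = adaBoostTrainsDS(dataMat, classLabelList, 30)
#   adaClassify([[5, 5], [0, 0]], classifierArr)
# expected signs: matrix([[ 1.], [-1.]])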
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# The area under the curve (AUC) gives an average value of the
# classifier's performance.
# predStrengths is a NumPy array or matrix in row-vector form: the strength
# of the classifier's predictions (aggClassEst).
def plotROC(predStrengths, classLabels):
    cur = (1.0, 1.0)        # holds your cursor position for plotting
    ySum = 0.0              # used for calculating the AUC
    numPosClass = sum(np.array(classLabels) == 1.0)
    # you're going to plot in the range of 0.0 to 1.0 on both the x- and y-axes
    yStep = 1 / float(numPosClass)                       # y step size
    xStep = 1 / float(len(classLabels) - numPosClass)    # x step size
    sortedIndiceArray = predStrengths.argsort()          # ascending prediction strength
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    print("predStrengths: ", predStrengths)
    print("sortedIndiceArray: ", sortedIndiceArray.tolist())
    for index in sortedIndiceArray.tolist()[0]:
        # take a step down in the y direction (true positive rate)
        # every time you hit a class of 1.0
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            # take a step backward in the x direction (false positive rate)
            # for every other class
            delX = xStep
            delY = 0
            ySum += cur[1]        # accumulate the height of each x step for the AUC
        ax.plot([cur[0], cur[0] - delX],    # x coords
                [cur[1], cur[1] - delY],    # y coords
                c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False Positive Rate = FP/(FP+TN)')
    plt.ylabel('True Positive Rate = TP/(TP+FN)')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print("the Area under the Curve is: ", ySum * xStep)