写在开头的话:在学习《机器学习实战》的过程中发现书中很多代码并没有注释,这对新入门的同学是一个挑战,特此贴出我对代码做出的注释,仅供参考,欢迎指正。
1、基于单层决策树构建弱分类器
#coding:gbk
from numpy import *
def loadSimpData():
    """Return a tiny hard-coded data set for experimenting with AdaBoost.

    Returns:
        (datMat, classLabels): a 5x2 numpy matrix of feature values and a
        list of +1.0/-1.0 class labels, one per row.
    """
    features = [[1.0, 2.1],
                [2.0, 1.1],
                [1.3, 1.0],
                [1.0, 1.0],
                [2.0, 1.0]]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(features), labels
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into data and labels.

    The number of features is taken from the first line; every line's last
    field is treated as the class label, all preceding fields as features.

    Args:
        fileName: path to a tab-delimited text file.

    Returns:
        (dataMat, labelMat): list of feature rows (lists of floats) and the
        list of float labels.
    """
    dataMat = []
    labelMat = []
    # Original opened the file twice and never closed either handle (leak);
    # read everything once under a context manager instead.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:  # empty file -> empty data set, same as the original loop
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t'))  # field count from the first line
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding a single feature.

    Args:
        dataMatrix: m x n matrix of samples.
        dimen: index of the feature column to threshold on.
        threshVal: threshold value.
        threshIneq: 'lt' marks samples with feature <= threshVal as -1.0;
            any other value ('gt') marks samples with feature > threshVal
            as -1.0.

    Returns:
        m x 1 array of +1.0/-1.0 predicted labels (defaults to +1.0).
    """
    labels = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        labels[column <= threshVal] = -1.0
    else:
        labels[column > threshVal] = -1.0
    return labels
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error on the data.

    Sweeps every feature, a range of thresholds per feature, and both
    inequality directions, keeping the combination whose D-weighted
    misclassification error is smallest.

    Args:
        dataArr: m x n feature data (anything mat() accepts).
        classLabels: sequence of m class labels (+1.0 / -1.0).
        D: m x 1 matrix of sample weights.

    Returns:
        (bestStump, minError, bestClasEst): dict with keys 'dim',
        'thresh', 'ineq'; the 1x1 matrix of the minimal weighted error;
        and the m x 1 predictions of the winning stump.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0  # number of threshold steps per feature
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # best weighted error found so far
    for dim in range(n):
        colMin = dataMatrix[:, dim].min()
        colMax = dataMatrix[:, dim].max()
        stepSize = (colMax - colMin) / numSteps
        # stepIdx runs -1..numSteps so thresholds span just below the
        # column minimum up to exactly the column maximum.
        for stepIdx in range(-1, int(numSteps) + 1):
            thresh = colMin + float(stepIdx) * stepSize
            for inequal in ('lt', 'gt'):
                predicted = stumpClassify(dataMatrix, dim, thresh, inequal)
                # 1 where misclassified, 0 where correct
                errArr = mat(ones((m, 1)))
                errArr[predicted == labelMat] = 0
                weightedError = D.T * errArr  # weight each mistake by D
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predicted.copy()
                    bestStump = {'dim': dim, 'thresh': thresh,
                                 'ineq': inequal}
    return bestStump, minError, bestClasEst
2、完整AdaBoost算法的实现、测试算法及应用算法
def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: m x n feature data.
        classLabels: sequence of m class labels (+1.0 / -1.0).
        numIt: maximum number of boosting rounds (default 40).

    Returns:
        (weakClassArr, aggClassEst): the list of stump dicts (each with an
        added 'alpha' weight) and the m x 1 aggregated class estimates.
    """
    weakClassArr = []  # accumulated weak classifiers
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # sample weights, start uniform
    aggClassEst = mat(zeros((m, 1)))  # running weighted vote per sample
    labelCol = mat(classLabels).T  # labels as a column, reused each round
    for _ in range(numIt):
        stump, error, classEst = buildStump(dataArr, classLabels, D)
        # Classifier weight; max(error, 1e-16) avoids division by zero
        # when the stump is perfect.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        stump['alpha'] = alpha
        weakClassArr.append(stump)
        # Reweight samples: correctly classified shrink, mistakes grow,
        # then renormalize to a distribution.
        D = multiply(D, exp(multiply(-1 * alpha * labelCol, classEst)))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        # Stop early once the ensemble classifies everything correctly.
        misclassified = multiply(sign(aggClassEst) != labelCol,
                                 ones((m, 1)))
        if misclassified.sum() / m == 0.0:
            break
    return weakClassArr, aggClassEst
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained AdaBoost stump ensemble.

    Args:
        datToClass: samples to classify (anything mat() accepts).
        classifierArr: list of stump dicts from adaBoostTrainDS, each with
            'dim', 'thresh', 'ineq' and 'alpha' keys.

    Returns:
        m x 1 matrix of +1/-1 class predictions (sign of the weighted
        vote; 0 only if the vote is exactly zero).
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    # Sum each stump's prediction scaled by its alpha weight.
    for stump in classifierArr:
        vote = stumpClassify(dataMatrix, stump['dim'],
                             stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * vote
    return sign(aggClassEst)
3、ROC曲线的绘制及AUC计算函数
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a classifier's scores and print the AUC.

    Args:
        predStrengths: row matrix of prediction strengths (e.g. the
            transposed aggClassEst from adaBoostTrainDS) — .tolist()[0]
            assumes a 1 x m matrix.
        classLabels: sequence of true labels (+1.0 / -1.0).
    """
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)  # start at (1,1): every sample predicted positive
    ySum = 0.0  # accumulates TPR heights for AUC (rectangle rule)
    numPosClas = sum(array(classLabels) == 1.0)  # true-positive denominator
    yStep = 1 / float(numPosClas)  # TPR decrement per real positive
    xStep = 1 / float(len(classLabels) - numPosClas)  # FPR decrement
    sortedIndicies = predStrengths.argsort()  # ascending score order
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # Walk samples from weakest to strongest score, flipping one
    # prediction to -1 at a time.
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            # Real positive now predicted negative: TPR drops, FPR unchanged.
            delX = 0
            delY = yStep
        else:
            # Real negative now predicted negative: FPR drops, TPR unchanged.
            delX = xStep
            delY = 0
            ySum += cur[1]  # add the rectangle height for this x-step
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b')
        cur = (cur[0] - delX, cur[1] - delY)  # next segment's start point
    ax.plot([0, 1], [0, 1], 'b--')  # the random-guess diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Was a Python-2-only print statement (syntax error on Python 3).
    print("the Area Under the Curve is: ", ySum * xStep)