# -- coding: utf-8 --
# Driver script: train an AdaBoost ensemble of decision stumps on the
# horse-colic training file, classify the held-out test file, and print
# the number of misclassified test samples.
from adaboost import *
from numpy import *
import operator
from os import listdir

if __name__ == '__main__':
    # data file locations (tab-separated, last column is the +1/-1 label)
    path = r'F:\file\python\py3test\venv\adaboost\horseColicTraining2.txt'
    path1 = r'F:\file\python\py3test\venv\adaboost\horseColicTest2.txt'

    # train 10 boosting rounds on the training set
    datArr, labelArr = loadDataSet(path)
    classifierArray = abaBoostTrainDS(datArr, labelArr, 10)

    # classify the test set with the trained ensemble
    testdat, label = loadDataSet(path1)
    result = adaClassify(testdat, classifierArray)

    # size the error vector from the data itself instead of hard-coding
    # the 67-sample test-set size; each mismatching row contributes 1
    errorArr = mat(ones((len(label), 1)))
    print(errorArr[result != mat(label).T].sum())
# -- coding: utf-8 --
# adaboost: AdaBoost meta-algorithm built on one-level decision trees
# ("decision stumps"), following "Machine Learning in Action" ch. 7.
from numpy import *
import operator
from os import listdir


def loadSimpData():
    """Return a tiny hand-made 2-D data set and its +1/-1 class labels."""
    dataMat = matrix([[1.0, 2.1],
                      [2.0, 1.1],
                      [1.3, 1.0],
                      [1.0, 1.0],
                      [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples with a single decision stump.

    dataMatrix -- m x n matrix of samples
    dimen      -- index of the feature to threshold on
    threshVal  -- threshold value
    threshIneq -- 'It' marks samples <= threshVal as -1; any other value
                  marks samples > threshVal as -1.  NOTE: this file spells
                  the flag 'It' (capital I), not 'lt', consistently.
    Returns an m x 1 array of +1/-1 predictions.
    """
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'It':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray


def buidlStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    (The 'buidl' typo in the name is preserved for caller compatibility.)

    dataArr     -- training samples
    classLabels -- +1/-1 labels
    D           -- m x 1 matrix of per-sample weights
    Returns (bestStump dict with 'dim'/'thresh'/'ineq',
             minimum weighted error as a 1x1 matrix,
             m x 1 predictions of the best stump).
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0           # number of threshold steps per feature
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf            # start at +infinity so any stump beats it
    for i in range(n):                        # every feature
        rangeMin = dataMatrix[:, i].min()     # column minimum
        rangeMax = dataMatrix[:, i].max()     # column maximum
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):    # j runs -1 .. 10
            for inequal in ['It', 'gt']:          # both threshold directions
                threshVal = rangeMin + float(j) * stepSize
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0   # 0 where correct
                weightedError = D.T * errArr            # weighted error rate
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


def abaBoostTrainDS(dataArr, classLabels, numIT=40):
    """Train an AdaBoost ensemble of decision stumps.

    numIT -- maximum number of boosting rounds; training stops early once
             the aggregate training error hits zero.
    Returns the list of weak-classifier dicts, each carrying
    'dim'/'thresh'/'ineq'/'alpha'.
    Side effects: prints progress each round and plots the training ROC.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)        # uniform initial sample weights
    aggClassEst = mat(zeros((m, 1))) # running weighted vote of all stumps
    for i in range(numIT):
        bestStump, error, classEst = buidlStump(dataArr, classLabels, D)
        print('D:', D.T)
        # max(error, 1e-16) guards against division by zero / log(inf)
        # when a stump classifies the weighted set perfectly
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print('classEst:', classEst.T)
        # exponent is -alpha where the stump was right, +alpha where wrong
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()              # renormalize for the next round
        aggClassEst += alpha * classEst
        print('aggClassEst:', aggClassEst.T)
        # 1 wherever the aggregate sign disagrees with the true label
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,
                             ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print('total error:', errorRate, '\n')
        if errorRate == 0.0:
            break
    plotRoc(aggClassEst.T, classLabels)
    return weakClassArr


def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained ensemble.

    datToClass    -- samples to classify
    classifierArr -- list of stump dicts from abaBoostTrainDS
    Returns an m x 1 matrix of +1/-1 decisions (prints the running vote).
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix,
                                 classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
        print(aggClassEst)
    return sign(aggClassEst)


def loadDataSet(fileName):
    """Load a tab-separated data file; the last column is the class label.

    Returns (list of float feature rows, list of float labels).
    """
    dataMat = []
    labelMat = []
    # 'with' closes the handle; the original opened the file twice and
    # leaked both handles
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t'))   # feature count + 1 label column
    for line in lines:
        curline = line.strip().split('\t')
        dataMat.append([float(curline[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curline[-1]))
    return dataMat, labelMat


def plotRoc(predStrengths, classLabels):
    """Plot the ROC curve for the given prediction strengths; print the AUC.

    predStrengths -- 1 x m row matrix of real-valued classifier outputs
    classLabels   -- true +1/-1 labels
    """
    import matplotlib.pyplot as plt   # lazy import: plotting is optional
    cur = (1.0, 1.0)                  # cursor starts at the top-right corner
    ySum = 0.0                        # accumulated heights for the AUC sum
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1 / float(numPosClas)                     # true-positive step
    xStep = 1 / float(len(classLabels) - numPosClas)  # false-positive step
    sortedIndicies = predStrengths.argsort()  # ascending strength order
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            de1X = 0
            de1Y = yStep      # drop a true positive: move down
        else:
            de1X = xStep      # drop a false positive: move left
            de1Y = 0
            # add this rectangle's height only on horizontal moves so
            # ySum * xStep sums the area under the curve
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - de1X], [cur[1], cur[1] - de1Y], c='b')
        cur = (cur[0] - de1X, cur[1] - de1Y)
    ax.plot([0, 1], [0, 1], 'b--')    # diagonal = random-guess baseline
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print('the Area Under the Curve is:', ySum * xStep)