First, a quick look at the decision tree algorithm:
A decision tree (Decision Tree) is a decision-analysis method that, given the probabilities of various outcomes, builds a tree to compute the probability that the expected net present value is greater than or equal to zero, in order to evaluate project risk and judge feasibility; it is a graphical method that applies probability analysis intuitively. Because the decision branches, when drawn, look like the branches of a tree, it is called a decision tree.
I previously implemented the decision tree algorithm in Java; the post is here:
https://blog.csdn.net/luohualiushui1/article/details/86767465
Next, a look at AdaBoost. The concept is as follows:
AdaBoost is an iterative algorithm. Its core idea is to train different classifiers (weak classifiers) on the same training set and then combine these weak classifiers into a stronger final classifier (strong classifier). The algorithm works by changing the data distribution: based on whether each sample in the training set was classified correctly in the current round, together with the overall accuracy of the previous round, it assigns a weight to each sample. The re-weighted data set is then handed to the next classifier for training, and finally the classifiers obtained in every round are fused into the final decision classifier. Using an AdaBoost classifier can screen out some unnecessary training-data features and concentrate the effort on the key training data.
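Concretely, in each round t the algorithm keeps the weak classifier h_t with the smallest weighted error \epsilon_t, gives it the vote weight

\alpha_t = \frac{1}{2} \ln \frac{1 - \epsilon_t}{\epsilon_t}

and then re-weights every training sample i according to whether it was classified correctly:

D_i \leftarrow \frac{D_i \cdot e^{-\alpha_t y_i h_t(x_i)}}{Z_t}

where y_i is the true label and Z_t is a normalizer that keeps D summing to 1. These two formulas correspond exactly to the alpha and D computations in the code below.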
The Python code for AdaBoost based on single-level decision trees (decision stumps) can be found in the literature, as follows:
from numpy import *

# classification logic for a single decision stump
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):  # just classify the data
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray

# build the best decision stump under the current sample weights D
def buildStump(dataArr, classLabels, D):
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # init error sum, to +infinity
    for i in range(n):  # loop over all dimensions
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # loop over all range in current dimension
            for inequal in ['lt', 'gt']:  # go over less than and greater than
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal,
                                              inequal)  # call stump classify with i, j, lessThan
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

# train AdaBoost: build the parameters of multiple weak classifiers
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # init D to all equal
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)  # build Stump
        # print("D:", D.T)
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))  # calc alpha, throw in max(error, eps) to account for error=0
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # store Stump Params in Array
        # print("classEst: ", classEst.T)
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)  # exponent for D calc, getting messy
        D = multiply(D, exp(expon))  # Calc New D for next iteration
        D = D / D.sum()
        # calc training error of all classifiers, if this is 0 quit for loop early (use break)
        aggClassEst += alpha * classEst
        # print("aggClassEst: ", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0: break
    return weakClassArr

# AdaBoost classifier: combine all weak classifiers to classify new samples
def adaClassify(datToClass, classifierArr):
    dataMatrix = mat(datToClass)  # do stuff similar to last aggClassEst in adaBoostTrainDS
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])  # call stump classify
        aggClassEst += classifierArr[i]['alpha'] * classEst
        print(aggClassEst)
    return sign(aggClassEst)
Now let's implement it in Java. First, build the parameter object for a weak classifier:
package com.algorithm;

import java.util.HashMap;
import java.util.Map;

public class StumpInfo {

    private Map<String, Object> bestStump = new HashMap<String, Object>();
    private double minError;
    private double[] bestClasEst;

    public Map<String, Object> getBestStump() {
        return bestStump;
    }

    public void setBestStump(Map<String, Object> bestStump) {
        this.bestStump = bestStump;
    }

    public double getMinError() {
        return minError;
    }

    public void setMinError(double minError) {
        this.minError = minError;
    }

    public double[] getBestClasEst() {
        return bestClasEst;
    }

    public void setBestClasEst(double[] bestClasEst) {
        this.bestClasEst = bestClasEst;
    }
}
Next, the decision stump classification logic:
// Classify every row of dataMatrix with a single threshold on one feature (a decision stump).
// 'lt': values <= threshVal are labelled -1; 'gt': values > threshVal are labelled -1; everything else gets +1.
public static double[] stumpClassify(DenseMatrix64F dataMatrix, int dimen, double threshVal, String threshIneq) {
    double[] retArray = new double[dataMatrix.numRows];
    if (threshIneq.equals("lt")) {
        for (int i = 0; i < dataMatrix.numRows; i++) {
            if (dataMatrix.get(i, dimen) <= threshVal) {
                retArray[i] = -1;
            } else {
                retArray[i] = 1;
            }
        }
    } else {
        for (int i = 0; i < dataMatrix.numRows; i++) {
            if (dataMatrix.get(i, dimen) > threshVal) {
                retArray[i] = -1;
            } else {
                retArray[i] = 1;
            }
        }
    }
    return retArray;
}
Next, computing the optimal stump parameters:
// Find the best decision stump (feature index, threshold, inequality) under the current sample weights D.
public static StumpInfo buildStump(DenseMatrix64F dataMatrix, double[] classLabels, double[] D) {
    double numSteps = 10.0;
    Map<String, Object> bestStump = new HashMap<String, Object>();
    double[] bestClasEst = new double[dataMatrix.numRows];
    double minError = Double.MAX_VALUE;
    for (int i = 0; i < dataMatrix.numCols; i++) {
        // find the min and max of feature i so we can step over its range
        double rangeMin = Double.MAX_VALUE;
        double rangeMax = -Double.MAX_VALUE; // Double.MIN_VALUE is a tiny positive number, not the most negative double
        for (int j = 0; j < dataMatrix.numRows; j++) {
            if (rangeMin > dataMatrix.get(j, i)) {
                rangeMin = dataMatrix.get(j, i);
            }
            if (rangeMax < dataMatrix.get(j, i)) {
                rangeMax = dataMatrix.get(j, i);
            }
        }
        double stepSize = (rangeMax - rangeMin) / numSteps;
        for (int j = -1; j < (int) numSteps + 1; j++) {
            for (String inequal : new String[] {"lt", "gt"}) {
                double threshVal = rangeMin + (double) j * stepSize;
                double[] predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal);
                // weighted error: sum of D over the misclassified samples
                double weightedError = 0;
                for (int k = 0; k < dataMatrix.numRows; k++) {
                    if (predictedVals[k] != classLabels[k]) {
                        weightedError += D[k];
                    }
                }
                if (weightedError < minError) {
                    minError = weightedError;
                    bestClasEst = predictedVals;
                    bestStump.put("dim", i);            // put() replaces any previous value
                    bestStump.put("thresh", threshVal);
                    bestStump.put("ineq", inequal);
                }
            }
        }
    }
    StumpInfo si = new StumpInfo();
    si.setMinError(minError);
    si.setBestClasEst(bestClasEst);
    si.setBestStump(bestStump);
    return si;
}
Next, AdaBoost builds the multiple weak classifiers:
// Train AdaBoost: repeatedly build the best stump under the current weights D,
// compute its vote weight alpha, and re-weight the samples for the next round.
public static List<Map<String, Object>> adaBoostTrainDS(DenseMatrix64F dataArr, double[] classLabels, int numIt) {
    List<Map<String, Object>> weakClassArr = new ArrayList<Map<String, Object>>();
    double[] D = new double[dataArr.numRows];
    double[] aggClassEst = new double[dataArr.numRows];
    for (int i = 0; i < dataArr.numRows; i++) {
        D[i] = 1.0 / dataArr.numRows; // init D to all equal
        aggClassEst[i] = 0;
    }
    System.out.println("initial D: " + Arrays.toString(D)); // requires java.util.Arrays
    for (int i = 0; i < numIt; i++) {
        StumpInfo si = buildStump(dataArr, classLabels, D);
        // alpha = 0.5 * ln((1 - error) / error); the max(..., 1e-16) guards against error == 0
        double alpha = 0.5 * Math.log((1.0 - si.getMinError()) / Math.max(si.getMinError(), 1e-16));
        si.getBestStump().put("alpha", alpha);
        weakClassArr.add(si.getBestStump());
        double D_sum = 0;
        double aggErrors = 0;
        for (int j = 0; j < dataArr.numRows; j++) {
            // D[j] shrinks if sample j was classified correctly, grows otherwise
            D[j] = D[j] * Math.exp(-1 * alpha * classLabels[j] * si.getBestClasEst()[j]);
            D_sum += D[j];
            aggClassEst[j] += alpha * si.getBestClasEst()[j];
            if (Math.signum(aggClassEst[j]) != classLabels[j]) {
                aggErrors += 1;
            }
        }
        for (int j = 0; j < dataArr.numRows; j++) {
            D[j] = D[j] / D_sum; // normalize D back to a distribution
        }
        double errorRate = aggErrors / dataArr.numRows;
        System.out.println("total error: " + errorRate);
        if (errorRate == 0.0)
            break;
    }
    return weakClassArr;
}
Finally, the main AdaBoost classifier:
// Classify new samples by the weighted vote of all trained stumps, then take the sign.
public static double[] adaClassify(DenseMatrix64F dataMatrix, List<Map<String, Object>> classifierArr) {
    double[] aggClassEst = new double[dataMatrix.numRows];
    for (int j = 0; j < dataMatrix.numRows; j++) {
        aggClassEst[j] = 0;
    }
    for (int i = 0; i < classifierArr.size(); i++) {
        double[] classEst = stumpClassify(dataMatrix,
                (int) classifierArr.get(i).get("dim"),
                (double) classifierArr.get(i).get("thresh"),
                (String) classifierArr.get(i).get("ineq"));
        for (int j = 0; j < dataMatrix.numRows; j++) {
            aggClassEst[j] += ((double) classifierArr.get(i).get("alpha")) * classEst[j];
        }
    }
    for (int j = 0; j < dataMatrix.numRows; j++) {
        aggClassEst[j] = Math.signum(aggClassEst[j]);
    }
    return aggClassEst;
}
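In formula form, what adaClassify computes is the standard AdaBoost strong classifier, the weighted vote of all stumps:

H(x) = \mathrm{sign}\left(\sum_t \alpha_t h_t(x)\right)

Each stump votes -1 or +1, its vote is scaled by its alpha, and only the sign of the sum is returned.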
OK, that completes the implementation. Let's test it:
double[][] data = {
        {1.0, 2.1},
        {2, 1.1},
        {1.3, 1},
        {1.0, 1},
        {2, 1}
};
DenseMatrix64F matdatas = new DenseMatrix64F(data);
double[] labelMat = {1.0, 1.0, -1.0, -1.0, 1.0};
double[] D = {0.2, 0.2, 0.2, 0.2, 0.2};

StumpInfo si = buildStump(matdatas, labelMat, D);
System.out.println(si.getMinError());
System.out.println(si.getBestStump().toString());

List<Map<String, Object>> weakClassArr = adaBoostTrainDS(matdatas, labelMat, 9);
System.out.println(weakClassArr.toString());

double[][] testdata = {
        {0, 0}
};
System.out.println(adaClassify(new DenseMatrix64F(testdata), weakClassArr)[0]);
The results are as follows:
You can see that three weak classifiers were built in total. Thanks to the alpha weights, the error rate on the training set gradually drops to 0, and the AdaBoost classifier also gives the correct result on the test sample.