用java实现基于单层决策树的AdaBoost分类器

20 篇文章 0 订阅
18 篇文章 0 订阅

首先大家了解一下决策树算法,如下:

决策树(Decision Tree)是在已知各种情况发生概率的基础上,通过构成决策树来求取净现值的期望值大于等于零的概率,评价项目风险,判断其可行性的决策分析方法,是直观运用概率分析的一种图解法。由于这种决策分支画成图形很像一棵树的枝干,故称决策树。

我之前使用java实现过决策树算法,链接如下:

https://blog.csdn.net/luohualiushui1/article/details/86767465

然后大家了解一下AdaBoost,概念如下:

Adaboost是一种迭代算法,其核心思想是针对同一个训练集训练不同的分类器(弱分类器),然后把这些弱分类器集合起来,构成一个更强的最终分类器(强分类器)。其算法本身是通过改变数据分布来实现的,它根据每次训练集之中每个样本的分类是否正确,以及上次的总体分类的准确率,来确定每个样本的权值。将修改过权值的新数据集送给下层分类器进行训练,最后将每次训练得到的分类器最后融合起来,作为最后的决策分类器。使用adaboost分类器可以排除一些不必要的训练数据特征,并放在关键的训练数据上面。

我们可以查阅到基于单层决策树的AdaBoost的python代码如下:

#分类逻辑
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample with a one-feature decision stump.

    Returns an (m, 1) array of +1.0 / -1.0 labels: samples falling on the
    "negative" side of the threshold (as selected by threshIneq) get -1.0,
    all other samples get +1.0.
    """
    m = shape(dataMatrix)[0]
    predictions = ones((m, 1))
    column = dataMatrix[:, dimen]
    # 'lt' marks values at or below the threshold; any other value of
    # threshIneq marks values strictly above it.
    mask = (column <= threshVal) if threshIneq == 'lt' else (column > threshVal)
    predictions[mask] = -1.0
    return predictions

#构建最优决策树分类
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted training error.

    Scans every feature, numSteps + 2 candidate thresholds per feature, and
    both inequality directions.

    D is an (m, 1) column vector of sample weights.
    Returns (bestStump dict with 'dim'/'thresh'/'ineq', minError as a 1x1
    matrix, best class-estimate (m, 1) column vector).
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # lowest weighted error seen so far
    for dim in range(n):
        lo = dataMatrix[:, dim].min()
        hi = dataMatrix[:, dim].max()
        stepSize = (hi - lo) / numSteps
        # j = -1 and j = numSteps place the threshold just outside the range,
        # so "everything +1" / "everything -1" stumps are also candidates.
        for j in range(-1, int(numSteps) + 1):
            threshVal = lo + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, dim, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0  # zero out correct predictions
                weightedError = D.T * errArr  # 1x1 matrix: sum of weights of mistakes
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump = {'dim': dim, 'thresh': threshVal, 'ineq': inequal}
    return bestStump, minError, bestClasEst

#多个弱分类器构建AdaBoost分类参数
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Runs up to numIt boosting rounds, reweighting samples after each round,
    and stops early once the aggregate training error reaches zero.
    Returns the list of weak-classifier dicts, each holding
    'dim'/'thresh'/'ineq'/'alpha'.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))
    labelCol = mat(classLabels).T
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # alpha is this stump's vote weight; 1e-16 guards against error == 0.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Boost weights of misclassified samples, shrink the rest, renormalize.
        D = multiply(D, exp(multiply(-1 * alpha * labelCol, classEst)))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr


#AdaBoost分类器
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained AdaBoost ensemble.

    Sums the alpha-weighted votes of every stump in classifierArr and
    returns the sign (+1 / -1 per sample) as an (m, 1) matrix.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for stump in classifierArr:
        votes = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * votes
        print(aggClassEst)
    return sign(aggClassEst)

现在我们开始用java实现,首先构建弱分类器参数对象

package com.algorithm;

import java.util.HashMap;
import java.util.Map;

/**
 * Result of a decision-stump search: the stump's parameters, its weighted
 * training error, and its class estimate for every training sample.
 */
public class StumpInfo {

	// Lowest weighted classification error found during the search.
	private double minError;

	// Predicted class (+1.0 / -1.0) for each training sample.
	private double[] bestClasEst;

	// Stump parameters: "dim" (feature index), "thresh" (threshold),
	// "ineq" ("lt"/"gt"), and — after training — "alpha" (vote weight).
	private Map<String, Object> bestStump = new HashMap<>();

	public double getMinError() {
		return minError;
	}

	public void setMinError(double minError) {
		this.minError = minError;
	}

	public double[] getBestClasEst() {
		return bestClasEst;
	}

	public void setBestClasEst(double[] bestClasEst) {
		this.bestClasEst = bestClasEst;
	}

	public Map<String, Object> getBestStump() {
		return bestStump;
	}

	public void setBestStump(Map<String, Object> bestStump) {
		this.bestStump = bestStump;
	}
}

然后是决策树分类逻辑

/**
 * Applies a one-feature decision stump to every row of the data matrix.
 *
 * With threshIneq "lt", rows whose value in column dimen is at or below
 * threshVal are labeled -1 and the rest +1; otherwise rows strictly above
 * threshVal are labeled -1 and the rest +1.
 */
public static double[] stumpClassify(DenseMatrix64F dataMatrix, int dimen, double threshVal, String threshIneq) {
		int rows = dataMatrix.numRows;
		double[] retArray = new double[rows];
		boolean lessThan = "lt".equals(threshIneq);
		for (int i = 0; i < rows; i++) {
			double v = dataMatrix.get(i, dimen);
			boolean negative = lessThan ? (v <= threshVal) : (v > threshVal);
			retArray[i] = negative ? -1 : 1;
		}
		return retArray;
	}

然后是决策树最优参数计算

/**
 * Exhaustively searches for the decision stump with the lowest weighted
 * training error.
 *
 * For every feature, numSteps + 2 candidate thresholds (including one just
 * below the minimum and one just above the maximum) are tried in both
 * inequality directions; the error of each candidate is the sum of the
 * weights D of the misclassified samples.
 *
 * @param dataMatrix  m x n training data
 * @param classLabels length-m array of +1/-1 labels
 * @param D           length-m sample weights
 * @return a StumpInfo holding the best stump's parameters, its weighted
 *         error, and its per-sample predictions
 */
public static StumpInfo buildStump(DenseMatrix64F dataMatrix, double[] classLabels, double[] D) {

	    double numSteps = 10.0;
	    Map<String, Object> bestStump = new HashMap<String, Object>();
	    double[] bestClasEst = new double[dataMatrix.numRows];
	    double minError = Double.POSITIVE_INFINITY;

	    for (int i = 0; i < dataMatrix.numCols; i++) {

	        // Find the value range of feature i.
	        // BUG FIX: rangeMax was initialized to Double.MIN_VALUE, which is
	        // the smallest POSITIVE double (~4.9e-324), not a very negative
	        // number — on an all-negative column the maximum was never found.
	        double rangeMin = Double.POSITIVE_INFINITY;
	        double rangeMax = Double.NEGATIVE_INFINITY;
	        for (int j = 0; j < dataMatrix.numRows; j++) {
	            rangeMin = Math.min(rangeMin, dataMatrix.get(j, i));
	            rangeMax = Math.max(rangeMax, dataMatrix.get(j, i));
	        }

	        double stepSize = (rangeMax - rangeMin) / numSteps;
	        // j = -1 and j = numSteps place the threshold just outside the data
	        // range, so "all +1" / "all -1" stumps are also candidates.
	        for (int j = -1; j < (int) numSteps + 1; j++) {
	            for (String inequal : new String[] {"lt", "gt"}) {
	                double threshVal = rangeMin + (double) j * stepSize;
	                double[] predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal);

	                // Weighted error: total weight of misclassified samples.
	                double weightedError = 0;
	                for (int k = 0; k < dataMatrix.numRows; k++) {
	                    if (predictedVals[k] != classLabels[k]) {
	                        weightedError += D[k];
	                    }
	                }

	                if (weightedError < minError) {
	                    minError = weightedError;
	                    bestClasEst = predictedVals;
	                    // Map.put replaces any existing value, so the previous
	                    // remove()-before-put calls were redundant.
	                    bestStump.put("dim", i);
	                    bestStump.put("thresh", threshVal);
	                    bestStump.put("ineq", inequal);
	                }
	            }
	        }
	    }

	    StumpInfo si = new StumpInfo();
	    si.setMinError(minError);
	    si.setBestClasEst(bestClasEst);
	    si.setBestStump(bestStump);
	    return si;
	}

然后是AdaBoost构建多个弱分类器

	/**
	 * Trains an AdaBoost ensemble of decision stumps.
	 *
	 * Runs up to numIt boosting rounds. Each round fits the best stump under
	 * the current sample weights D, stores it with its vote weight alpha, then
	 * reweights the samples (misclassified up, correct down) and renormalizes.
	 * Stops early once the aggregate training error reaches zero.
	 *
	 * @param dataArr     m x n training data
	 * @param classLabels length-m array of +1/-1 labels
	 * @param numIt       maximum number of boosting rounds
	 * @return list of weak classifiers, each a map with dim/thresh/ineq/alpha
	 */
	public static List<Map<String, Object>> adaBoostTrainDS(DenseMatrix64F dataArr, double[] classLabels, int numIt) {

		List<Map<String, Object>> weakClassArr = new ArrayList<Map<String, Object>>();

		double[] D = new double[dataArr.numRows];           // sample weights
		double[] aggClassEst = new double[dataArr.numRows]; // running weighted vote per sample

		for (int i = 0; i < dataArr.numRows; i++) {
			D[i] = (double) 1 / dataArr.numRows; // uniform initial weights
			aggClassEst[i] = 0;
		}
		// BUG FIX: this printed "total error: " + D.toString(), which shows the
		// array's identity hash under a misleading label; print the actual
		// initial weights instead.
		System.out.println("initial D: " + java.util.Arrays.toString(D));

		for (int i = 0; i < numIt; i++) {
			StumpInfo si = buildStump(dataArr, classLabels, D);
			// BUG FIX: the zero-error guard used Double.MIN_VALUE (~4.9e-324),
			// which yields an absurdly large alpha (~372) and underflows the
			// exp() below; the reference algorithm uses 1e-16.
			double alpha = 0.5 * Math.log((1.0 - si.getMinError()) / Math.max(si.getMinError(), 1e-16));
			si.getBestStump().put("alpha", alpha);
			weakClassArr.add(si.getBestStump());

			double D_sum = 0;
			double aggErrors = 0;

			for (int j = 0; j < dataArr.numRows; j++) {
				// Boost weights of misclassified samples, shrink the rest.
				D[j] = D[j] * Math.exp(-1 * alpha * classLabels[j] * si.getBestClasEst()[j]);
				D_sum += D[j];

				aggClassEst[j] += alpha * si.getBestClasEst()[j];
				if (Math.signum(aggClassEst[j]) != classLabels[j]) {
					aggErrors += 1;
				}
			}

			// Renormalize D into a probability distribution.
			for (int j = 0; j < dataArr.numRows; j++) {
				D[j] = D[j] / D_sum;
			}

			double errorRate = aggErrors / dataArr.numRows;
			System.out.println("total error: " + errorRate);
			if (errorRate == 0.0)
				break;
		}

		return weakClassArr;
	}

然后是AdaBoost主分类器

/**
 * Classifies samples with a trained AdaBoost ensemble.
 *
 * Accumulates the alpha-weighted votes of every stump in classifierArr and
 * returns Math.signum of the aggregate per sample (+1.0 / -1.0, or 0.0 for
 * an exact tie, matching the Python reference's sign()).
 *
 * @param dataMatrix    m x n samples to classify
 * @param classifierArr stumps from adaBoostTrainDS (dim/thresh/ineq/alpha)
 * @return length-m array of class labels
 */
public static double[] adaClassify(DenseMatrix64F dataMatrix, List<Map<String, Object>> classifierArr) {
	    double[] aggClassEst = new double[dataMatrix.numRows]; // Java zero-initializes

	    for (int i = 0; i < classifierArr.size(); i++) {
	        Map<String, Object> stump = classifierArr.get(i); // hoist repeated get(i)
	        double[] classEst = stumpClassify(dataMatrix,
	                (int) stump.get("dim"),
	                (double) stump.get("thresh"),
	                (String) stump.get("ineq"));
	        double alpha = (double) stump.get("alpha");
	        for (int j = 0; j < dataMatrix.numRows; j++) {
	            aggClassEst[j] += alpha * classEst[j];
	        }
	        // BUG FIX: printing the array directly shows its identity hash
	        // (e.g. "[D@1b6d3586"); print the aggregated votes as the Python
	        // reference does.
	        System.out.println(java.util.Arrays.toString(aggClassEst));
	    }

	    for (int j = 0; j < dataMatrix.numRows; j++) {
	        aggClassEst[j] = Math.signum(aggClassEst[j]);
	    }
	    return aggClassEst;
	}

ok到这里便写完了,可以开始测试:

		// Training data: 5 samples, 2 features each.
		double data[][] = {
				   {1.0,2.1},
		           {2,1.1},
		           {1.3,1},
		           {1.0,1},
		           {2,1}
		};
		
		
		DenseMatrix64F matdatas = new DenseMatrix64F(data);
		
		// Class label (+1.0 / -1.0) for each training sample.
		double [] labelMat = {1.0, 1.0, -1.0, -1.0, 1.0};
		
		// Uniform initial sample weights (1/m with m = 5).
		double [] D = {0.2,0.2,0.2,0.2,0.2};
		
		// Single stump search under uniform weights.
		StumpInfo si = buildStump(matdatas,labelMat,D);
		
		System.out.println(si.getMinError());
		
		System.out.println(si.getBestStump().toString());
		
		// Full AdaBoost training: at most 9 rounds, stops early at zero error.
		List<Map<String,Object>> weakClassArr= adaBoostTrainDS(matdatas,labelMat,9);
		
		System.out.println(weakClassArr.toString());
		
		// One unseen point to classify; the article's run reports -1 for it.
		double testdata[][] = {
				   {0,0}
		};
		
		System.out.println(adaClassify(new DenseMatrix64F(testdata),weakClassArr)[0]);

 

结果如下:

大家可以看到共构建了三个弱分类器:随着各弱分类器 alpha 权重的迭代优化,训练集上的错误率逐渐下降到 0,AdaBoost 分类器对测试样本的分类结果也没有问题。

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

路边草随风

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值