Weka算法Classifier-meta-Bagging源码分析

最新推荐文章于 2023-03-02 17:10:25 发布

smilehehe110

最新推荐文章于 2023-03-02 17:10:25 发布

阅读量1.2k

点赞数 2

分类专栏：数据挖掘之WEKA 机器学习之随机森林文章标签： weka Bagging 源码分析分类

数据挖掘之WEKA 同时被 2 个专栏收录

11 篇文章 0 订阅

订阅专栏

机器学习之随机森林

6 篇文章 0 订阅

订阅专栏

Weka算法Classifier-meta-Bagging源码分析

Bagging部分比较简单，算法和代码放到一起写了。

一、Bagging算法

严格来看Bagging并不能算是一种分类算法，Bagging和Boosting一样，是一种组合基本分类器的方法，也就是使用多个基分类器来获取更为强大的分类器，其核心思想是有放回的抽样。

Bagging算法的训练流程：

1、从样本集中有放回的抽样M个样本。

2、用这M个样本训练基分类器C。

3、重复这个过程X次，得到X个基分类器。

Bagging算法的预测流程：

1、对于新传入实例A，用这X个新分类器得到一个分类结果的列表。

2、若待分类属性是数值型（回归），求这个列表的算术平均值作为结果返回。

3、若待分类属性是枚举类型（分类），按这个列表对分类结果进行投票，返回票数最高的。

二、Weka代码实现

（1）基分类器

Weka中的默认基分类器使用的是REPTree，也就是Fast decision tree learner，至于这个具体是个什么，后面我再写文章进行分析。

[cpp]view plaincopy 
    
 /**
  * Creates a bagging meta-classifier whose base learner is set to a new
  * {@code weka.classifiers.trees.REPTree}.
  */
 public Bagging() {
   super();
   this.m_Classifier = new weka.classifiers.trees.REPTree();
 }

（2）构建过程BuildClassifier

整个buildClassifier都是围绕标识m_CalcOutOfBag来展开的，这个m_CalcOutOfBag标识的意思是：是否计算Out-of-Bag（袋外）样本的错误比例。

假如我们对训练集M进行抽样，抽样的数量和M的数量是一样的，那么通常会有一些样本并没有被抽到（为什么？因为是有放回的抽样，同一个样本可能被重复抽中），这个标识就是用来评测这些没抽到的样本上的错误率，如果没有这个标识，那么这个错误率到后面就不会被计算了。

[java]view plaincopy 
    
 // The out-of-bag error can only be requested when the bag size is 100%;
 // any other percentage is rejected up front.
 if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
       throw new IllegalArgumentException("Bag size needs to be 100% if "
           + "out-of-bag error is to be calculated!");
     }

     // Number of instances per bag, as a percentage of the training set size.
     // NOTE(review): the product is formed before dividing by 100; if both
     // operands are int this could overflow for very large datasets - confirm.
     int bagSize = data.numInstances() * m_BagSizePercent / 100;
     // A single generator seeded with m_Seed drives all resampling below.
     Random random = new Random(m_Seed);

     // inBag[j][i] records whether instance i was drawn into the bag used for
     // classifier j; it is only allocated when the out-of-bag error is wanted.
     boolean[][] inBag = null;
     if (m_CalcOutOfBag)
       inBag = new boolean[m_Classifiers.length][];

     for (int j = 0; j < m_Classifiers.length; j++) {
       Instances bagData = null;

       // create the in-bag dataset
       if (m_CalcOutOfBag) {
         // Resample and let the sampler mark the drawn instances in inBag[j].
         inBag[j] = new boolean[data.numInstances()];
         // bagData = resampleWithWeights(data, random, inBag[j]);
         bagData = data.resampleWithWeights(random, inBag[j]);
       } else {
         // Resample first, then cut the bag down if a smaller size is requested.
         bagData = data.resampleWithWeights(random);
         if (bagSize < data.numInstances()) {
           // Shuffle, then build the bag from the range (0, bagSize) of the
           // shuffled data - presumably the first bagSize instances; confirm
           // against the Instances(Instances, int, int) constructor.
           bagData.randomize(random);
           Instances newBagData = new Instances(bagData, 0, bagSize);
           bagData = newBagData;
         }
       }

这一部分是抽样，首先如果有m_CalcOutOfBag标，则必须要求抽样比例是100%。

其次算出要抽样的大小。

inBag数组是用来记录Instances中哪些样本被抽到了哪些没被抽到。

data.resampleWithWeights就是进行有放回的抽样。

[java]view plaincopy 
    
 // Give each randomizable ensemble member its own seed drawn from the shared
 // generator. The type test is made on the template m_Classifier while the
 // cast is applied to m_Classifiers[j].
 // NOTE(review): presumably every m_Classifiers[j] is a copy of m_Classifier,
 // which is what makes the cast safe - confirm where the array is filled.
 if (m_Classifier instanceof Randomizable) {
   ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
 }

 // build the classifier
 m_Classifiers[j].buildClassifier(bagData);

接着是构建基分类器的过程（默认的基分类器是REPTree分类树），调用具体classifier的buildClassifier方法。

最后是计算OutOfBag的过程，代码我已写注释。

[java]view plaincopy 
    
 if (getCalcOutOfBag()) { //如果有这个标就计算  
       double outOfBagCount = 0.0; //错误的权重和  
       double errorSum = 0.0;//错误的偏差值的和  
       boolean numeric = data.classAttribute().isNumeric();//是否是连续数值  
       for (int i = 0; i < data.numInstances(); i++) {  
         double vote;//代表投票结果  
         double[] votes;//代表投票  
         if (numeric)  
           votes = new double[1];//如果是数值，则取平均数，计算平均数的过程一个数组单元就够了  
         else  
           votes = new double[data.numClasses()];//否则则要进行投票  
   
         // determine predictions for instance  
         int voteCount = 0;  
         for (int j = 0; j < m_Classifiers.length; j++) {  
           if (inBag[j][i])  
             continue;//如果已经被采样，就忽略，因为要计算的是OutOfBag  
   
           voteCount++;//记录有多少样本被计算  
           if (numeric) {  
             votes[0] = m_Classifiers[j].classifyInstance(data.instance(i));//数值型则直接把预测结果累加  
           } else {  

[java]view plaincopy 
    
         double[] newProbs = m_Classifiers[j].distributionForInstance(data  
             .instance(i));  
         for (int k = 0; k < newProbs.length; k++) {  
           votes[k] += newProbs[k]; //枚举型则要把所有枚举概率进行累加  
         }  
       }  
     }  
   
     // "vote"  
     if (numeric) {  
       vote = votes[0];  
       if (voteCount > 0) {  
         vote /= voteCount; // 数值型取均值  
       }  
     } else {  
       if (Utils.eq(Utils.sum(votes), 0)) {  
       } else {  
         Utils.normalize(votes);//归一化  
       }  
       vote = Utils.maxIndex(votes); // 选出最大的index  
     }  
     outOfBagCount += data.instance(i).weight();//累加权重  
     if (numeric) {  
       errorSum += StrictMath.abs(vote - data.instance(i).classValue())  
           * data.instance(i).weight();//累加错误偏差  
     } else {  
       if (vote != data.instance(i).classValue())  
         errorSum += data.instance(i).weight();//如果是枚举就对出错进行计数  
     }  
   }  
   
   m_OutOfBagError = errorSum / outOfBagCount;//最后取个平均值  
 } else {  
   m_OutOfBagError = 0;//如果没有那个标就不计算了  
 }  

三、根据权重进行有放回抽样的过程

也就是 data.resampleWithWeights(random, inBag[j]);这个方法，感觉看了一下还挺有意思的，就放上来剖析一下。

重载形式有3个，前两个都会调用第三个：

[java]view plaincopy 
    
 public Instances resampleWithWeights(Random random, double[] weights) {  
   
   return resampleWithWeights(random, weights, null);  
 }  

[java]view plaincopy 
    
 public Instances resampleWithWeights(Random random, boolean[] sampled) {  
   
   double[] weights = new double[numInstances()];  
   for (int i = 0; i < weights.length; i++) {  
     weights[i] = instance(i).weight();  
   }  
   return resampleWithWeights(random, weights, sampled);  
 }  

[java]view plaincopy 
    
 /**
  * Builds a new dataset holding {@code numInstances()} independent random
  * draws from this dataset, so the same instance may appear more than once.
  * Draws are made with Walker's alias method, which is intended to pick
  * index {@code i} with probability proportional to {@code weights[i]}.
  * Every instance added to the result has its weight set to 1.
  *
  * @param random generator used for the draws
  * @param weights one sampling weight per instance; the array itself is not
  *        modified (a copy is normalized)
  * @param sampled if not {@code null}, {@code sampled[i]} is set to
  *        {@code true} for every index {@code i} that is drawn; entries are
  *        never reset to {@code false} here
  * @return the resampled dataset; empty if this dataset is empty
  * @throws IllegalArgumentException if {@code weights.length} differs from
  *         {@code numInstances()} or a normalized weight is negative
  */
 public Instances resampleWithWeights(Random random, double[] weights,
     boolean[] sampled) {

     if (weights.length != numInstances()) {
       throw new IllegalArgumentException("weights.length != numInstances.");
     }

     // Result starts out empty; numInstances() is passed as the second
     // constructor argument (presumably an initial capacity - confirm).
     Instances newData = new Instances(this, numInstances());
     if (numInstances() == 0) {
       return newData;
     }

     // Walker's method, see pp. 232 of "Stochastic Simulation" by B.D. Ripley
     // P: working copy of the weights, normalized by Utils.normalize
     //    (presumably rescaled to sum to 1 - confirm against Utils).
     double[] P = new double[weights.length];
     System.arraycopy(weights, 0, P, 0, weights.length);
     Utils.normalize(P);
     // Q: per-index acceptance threshold, A: per-index alias,
     // W: work list with "small" indices (Q < 1) filled from the front and
     //    the remaining indices filled from the back.
     double[] Q = new double[weights.length];
     int[] A = new int[weights.length];
     int[] W = new int[weights.length];
     int M = weights.length;
     int NN = -1; // position of the last small index stored in W
     int NP = M;  // position of the first non-small index stored in W
     for (int I = 0; I < M; I++) {
       // Only strictly negative values are rejected; zero passes this check.
       if (P[I] < 0) {
         throw new IllegalArgumentException("Weights have to be positive.");
       }
       Q[I] = M * P[I];
       if (Q[I] < 1.0) {
         W[++NN] = I;
       } else {
         W[--NP] = I;
       }
     }
     // Pairing is only needed when both groups are non-empty.
     if (NN > -1 && NP < M) {
       for (int S = 0; S < M - 1; S++) {
         int I = W[S];  // next index in the work list, taken from the front
         int J = W[NP]; // current donor index at the back boundary
         A[I] = J;      // J becomes the alias of I
         // J gives up the amount needed to fill I's column up to 1.
         Q[J] += Q[I] - 1.0;
         if (Q[J] < 1.0) {
           // J has dropped below 1: advance the boundary so that J now sits
           // in the front part of W and is reached by S in a later iteration.
           NP++;
         }
         if (NP >= M) {
           break;
         }
       }
       // A[W[M]] = W[M];
     }

     // Shift each threshold by its index so it can be compared directly with
     // U, which lies in [I, I + 1) when (int) U == I.
     for (int I = 0; I < M; I++) {
       Q[I] += I;
     }

     // One draw per instance of the original dataset.
     for (int i = 0; i < numInstances(); i++) {
       int ALRV; // index selected by this draw
       // U is M times random.nextDouble(); its integer part picks column I.
       double U = M * random.nextDouble();
       int I = (int) U;
       if (U < Q[I]) {
         ALRV = I;    // below the threshold: keep the column's own index
       } else {
         ALRV = A[I]; // otherwise use the column's alias
       }
       newData.add(instance(ALRV));
       if (sampled != null) {
         sampled[ALRV] = true;
       }
       // Set the weight of the last instance in newData (the one just added) to 1.
       newData.instance(newData.numInstances() - 1).setWeight(1);
     }

     return newData;
   }

这个所谓的

[java]view plaincopy 
    
 Walker's method, see pp. 232 of "Stochastic Simulation" by B.D. Ripley  

我找了半天也不知道是个啥算法，代码也没啥注释，大体一看没看懂，等下次有机会再把这个函数的算法补上吧。（补充：这里的Walker's method一般指Walker别名法（alias method），它先根据各样本的权重建立一张别名表，之后每次按权重抽样只需常数时间。）

smilehehe110

关注

2
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录