J48源码学习笔记(四)buildTree(),collapse(),prune()

回到C45PruneableClassifierTree.buildClassifier()中

 // Grow the unpruned tree. The second argument is buildTree's keepData flag:
 // training data is retained on the nodes when subtree raising is enabled
 // or when cleanup is disabled.
 buildTree(data, m_subtreeRaising || !m_cleanup);
   // Replace subtrees by leaves wherever that does not increase training error.
   collapse();
   // Pruning is optional and controlled by m_pruneTheTree.
   if (m_pruneTheTree) {
     prune();
   }
   // NOTE(review): cleanup() is not shown in this excerpt; it is handed
   // new Instances(data, 0), presumably a header-only copy of the data
   // with no instances -- confirm against the Weka source.
   if (m_cleanup) {
     cleanup(new Instances(data, 0));
   }

这个buildTree()在父类 ClassifierTree 中

  /**
   * Grows the tree rooted at this node from the given training data.
   *
   * <p>Resets the node's state, asks the model selector for a local model,
   * and then either builds one child tree per subset of that model or
   * marks this node as a leaf.
   *
   * @param data the training instances reaching this node
   * @param keepData if true, a reference to {@code data} is stored in m_train
   * @throws Exception propagated from model selection, splitting or
   *         construction of the child trees
   */
  public void buildTree(Instances data, boolean keepData) throws Exception {

    if (keepData) {
      m_train = data;
    }

    // Reset the node state before (re)building.
    m_test = null;
    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;

    // m_toSelectModel is the model-selection strategy held by this tree.
    m_localModel = m_toSelectModel.selectModel(data);

    // A model with at most one subset does not split: this node is a leaf,
    // and it is flagged empty when the data carries no weight at all.
    if (m_localModel.numSubsets() <= 1) {
      m_isLeaf = true;
      m_isEmpty = Utils.eq(data.sumOfWeights(), 0);
      return;
    }

    // split() returns a plain array holding one Instances bag per subset.
    Instances[] subsets = m_localModel.split(data);
    data = null; // drop the local reference before recursing into the children
    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int child = 0; child < m_sons.length; child++) {
      // Each subset is used to build its own subtree.
      m_sons[child] = getNewTree(subsets[child]);
      subsets[child] = null; // the subset is no longer needed in this frame
    }
  }

ClassifierSplitModel.split()展开

 /**
  * Distributes the given instances over the subsets of this split model.
  *
  * <p>An instance for which {@code whichSubset} returns an index greater
  * than -1 goes to exactly that subset. Any other instance is added to
  * every subset whose entry in {@code weights(instance)} is greater than
  * zero, with its weight multiplied by that entry.
  *
  * @param data the instances to split
  * @return an array of {@code m_numSubsets} instance sets, one per subset
  * @throws Exception propagated from the weight or subset computation
  */
 public final Instances[] split(Instances data) throws Exception {

    // One initially empty bag per subset, each created with capacity for all of data.
    Instances[] bags = new Instances[m_numSubsets];
    for (int b = 0; b < m_numSubsets; b++) {
      bags[b] = new Instances(data, data.numInstances());
    }

    for (int row = 0; row < data.numInstances(); row++) {
      Instance current = data.instance(row);
      double[] bagWeights = weights(current);
      int target = whichSubset(current);

      if (target > -1) {
        // The instance belongs to a single, known subset.
        bags[target].add(current);
        continue;
      }

      // No single subset: spread the instance over all subsets with a
      // positive weight, scaling its weight accordingly.
      // NOTE(review): setWeight is applied to bags[b].lastInstance();
      // presumably add() stores a copy so `current` is unaffected -- confirm.
      for (int b = 0; b < m_numSubsets; b++) {
        if (Utils.gr(bagWeights[b], 0)) {
          double scaled = bagWeights[b] * current.weight();
          bags[b].add(current);
          bags[b].lastInstance().setWeight(scaled);
        }
      }
    }

    for (Instances bag : bags) {
      bag.compactify();
    }
    return bags;
  }

C45PruneableClassifierTree中重写的getNewTree()展开


    // NOTE(review): the method signature line is missing from this excerpt.
    // Judging from the call site in buildTree (m_sons[i] = getNewTree(...))
    // it presumably takes an Instances parameter named `data` and returns a
    // ClassifierTree -- confirm against the Weka source.

    // The child tree is created with this tree's model selector and the same
    // pruning, confidence, subtree-raising and cleanup settings.
    C45PruneableClassifierTree newTree = 
      new C45PruneableClassifierTree(m_toSelectModel, m_pruneTheTree, m_CF,
                     m_subtreeRaising, m_cleanup);
    // Grow the child recursively; training data is kept on the node when
    // subtree raising is enabled or cleanup is disabled.
    newTree.buildTree((Instances)data, m_subtreeRaising || !m_cleanup);

    return newTree;
  }

collapse()在不会导致训练错误增加的前提下,把一棵子树坍塌成一个叶子结点。不同于剪枝,collapse()无法增加精度。

 /**
  * Collapses this subtree into a single leaf when splitting does not
  * reduce the training error; otherwise recurses into the children.
  */
 public final void collapse(){

    // Leaves have nothing to collapse.
    if (m_isLeaf) {
      return;
    }

    // Training errors summed over the leaves of the current subtree.
    double subtreeErrors = getTrainingErrors();
    // Training errors if this node were a leaf on its own distribution.
    double nodeErrors = localModel().distribution().numIncorrect();

    // Collapse when the subtree makes at least as many errors as the single
    // node would (a tolerance of 1E-3 is applied to the comparison).
    if (subtreeErrors >= nodeErrors - 1E-3) {
      // Release the children and turn this node into a leaf.
      m_sons = null;
      m_isLeaf = true;
      // Replace the split by a NoSplit model over the same distribution.
      m_localModel = new NoSplit(localModel().distribution());
      return;
    }

    for (int k = 0; k < m_sons.length; k++) {
      son(k).collapse();
    }
  }

获取训练错误的方法

/**
 * Computes the training errors of the subtree rooted at this node.
 *
 * @return for a leaf, numIncorrect() of the node's distribution; for an
 *         inner node, the sum of the children's training errors
 */
private double getTrainingErrors(){

  if (m_isLeaf) {
    // NOTE(review): numIncorrect() is presumably the weight of the
    // instances outside the leaf's majority class -- confirm in Distribution.
    return localModel().distribution().numIncorrect();
  }

  double total = 0;
  for (int k = 0; k < m_sons.length; k++) {
    total += son(k).getTrainingErrors();
  }
  return total;
}

prune()使用C4.5剪枝法对树进行剪枝

 /**
  * Prunes the subtree rooted at this node using estimated errors.
  *
  * <p>Children are pruned first. The node is then either replaced by a
  * leaf, replaced by its largest branch (only when subtree raising is
  * enabled), or left unchanged, depending on which alternative has the
  * lowest estimated error; each comparison allows a slack of 0.1.
  *
  * @throws Exception propagated from the error estimation or from
  *         recomputing the distribution
  */
 public void prune() throws Exception {

    // A leaf cannot be pruned any further.
    if (m_isLeaf) {
      return;
    }

    // Work bottom-up: prune every child before deciding about this node.
    for (int k = 0; k < m_sons.length; k++) {
      son(k).prune();
    }

    // Estimated error if the largest branch replaced this node. Without
    // subtree raising this option is disabled via Double.MAX_VALUE.
    int biggest = localModel().distribution().maxBag();
    double raiseErrors = m_subtreeRaising
        ? son(biggest).getEstimatedErrorsForBranch((Instances) m_train)
        : Double.MAX_VALUE;

    // Estimated error if this node became a leaf.
    double leafErrors =
        getEstimatedErrorsForDistribution(localModel().distribution());

    // Estimated error of the subtree as it stands.
    double subtreeErrors = getEstimatedErrors();

    // Option 1: a leaf is at least as good as both alternatives.
    if (Utils.smOrEq(leafErrors, subtreeErrors + 0.1)
        && Utils.smOrEq(leafErrors, raiseErrors + 0.1)) {
      // Release the children and install a NoSplit model for the node.
      m_sons = null;
      m_isLeaf = true;
      m_localModel = new NoSplit(localModel().distribution());
      return;
    }

    // Option 2: raise the largest branch into this node's place, refresh
    // the distributions from the training data and prune the result again.
    if (Utils.smOrEq(raiseErrors, subtreeErrors + 0.1)) {
      C45PruneableClassifierTree promoted = son(biggest);
      m_sons = promoted.m_sons;
      m_localModel = promoted.localModel();
      m_isLeaf = promoted.m_isLeaf;
      newDistribution(m_train);
      prune();
    }
  }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值