J48源码学习笔记(一)

先从J48.java开始。J48继承了抽象类Classifier,先看其中的第一个方法buildClassifier():

  /**
   * Builds the decision tree from the given training instances.
   *
   * <p>A split-selection strategy is chosen first (binary or regular C4.5
   * splits), then a tree implementation (C4.5 pruning or reduced-error
   * pruning). Construction is delegated to the root node, after which the
   * selection strategy's {@code cleanup()} is invoked.
   *
   * @param instances the training data
   * @throws Exception if the classifier cannot be built
   */
  public void buildClassifier(Instances instances) throws Exception {

    // Binary splits get their own selection strategy.
    final ModelSelection modSelection = m_binarySplits
        ? new BinC45ModelSelection(m_minNumObj, instances)
        : new C45ModelSelection(m_minNumObj, instances);

    if (m_reducedErrorPruning) {
      // Reduced-error pruning variant.
      m_root = new PruneableClassifierTree(modSelection, !m_unpruned, m_numFolds,
                       !m_noCleanup, m_Seed);
    } else {
      // C4.5 pruning variant; m_CF is the confidence factor.
      m_root = new C45PruneableClassifierTree(modSelection, !m_unpruned, m_CF,
                        m_subtreeRaising, !m_noCleanup);
    }

    // Build the tree.
    m_root.buildClassifier(instances);

    // cleanup() is called through the concrete strategy type.
    if (m_binarySplits) {
      ((BinC45ModelSelection) modSelection).cleanup();
    } else {
      ((C45ModelSelection) modSelection).cleanup();
    }
  }

怎样进行模型选择放到后面再研究,先研究一下如何建立分类树。
展开C45PruneableClassifierTree.java中的buildClassifier

  /**
   * Builds this tree from the given data.
   *
   * <p>Steps, in order: check the data against this classifier's capabilities,
   * create a new {@code Instances} object from the data and delete the
   * instances with a missing class from it, grow the tree, collapse it, prune
   * it when pruning is enabled, and run cleanup when cleanup is enabled.
   *
   * @param data the training data
   * @throws Exception if the capability check or tree construction fails
   */
  public void buildClassifier(Instances data) throws Exception {

    // Can the classifier tree handle the data? This is a check only; it is
    // presumably expected to throw when the data cannot be handled.
    getCapabilities().testWithFail(data);

    // Remove instances with a missing class from a fresh Instances object.
    Instances withClass = new Instances(data);
    withClass.deleteWithMissingClass();

    // Training data is kept on the nodes when subtree raising is on or
    // cleanup is off.
    final boolean keepData = m_subtreeRaising || !m_cleanup;
    buildTree(withClass, keepData);
    collapse();

    if (m_pruneTheTree) {
      prune();
    }

    if (m_cleanup) {
      cleanup(new Instances(withClass, 0));
    }
  }

发现了buildTree()方法是用来建树的
buildTree()是在基类ClassifierTree.java中,展开buildTree()

 /**
  * Builds the tree rooted at this node.
  *
  * <p>A local split model is selected for the data. If it yields more than
  * one subset, the data is split and one child is created per subset through
  * {@code getNewTree}; otherwise this node becomes a leaf, and is flagged
  * empty when the data's sum of weights equals zero.
  *
  * @param data the training data for this node
  * @param keepData whether the training data is to be kept on the node
  * @throws Exception if model selection, splitting or child creation fails
  */
 public void buildTree(Instances data, boolean keepData) throws Exception {

    // Is training data to be kept?
    if (keepData) {
      m_train = data;
    }

    // Reset the node state.
    m_test = null;      // the pruning instances
    m_isLeaf = false;   // leaf flag
    m_isEmpty = false;  // empty flag
    m_sons = null;      // child nodes

    // Choose the split model for this node.
    m_localModel = m_toSelectModel.selectModel(data);

    // No real split: this node is a leaf.
    if (m_localModel.numSubsets() <= 1) {
      m_isLeaf = true;
      if (Utils.eq(data.sumOfWeights(), 0)) {
        m_isEmpty = true;
      }
      return;
    }

    // Split the data and create one child per subset.
    Instances[] localInstances = m_localModel.split(data);
    // Drop the local reference before creating the children (presumably so
    // the data can be reclaimed while the children are built).
    data = null;
    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int i = 0; i < m_sons.length; i++) {
      m_sons[i] = getNewTree(localInstances[i]);
      localInstances[i] = null;
    }
  }

调用 m_toSelectModel.selectModel(data); // 选一个划分模型
m_localModel.split(data); // 分割数据
m_sons[i] = getNewTree(localInstances[i]); // 构建子树
将 C45ModelSelection.selectModel(data) 继续展开:

 /**
  * Selects the split model for the given data.
  *
  * <p>Returns the no-split model when the distribution's total is smaller
  * than {@code 2*m_minNoObj}, when the total equals the count of the largest
  * class, or when no useful split is found. Otherwise one {@code C45Split} is
  * evaluated per non-class attribute, and the model with the highest gain
  * ratio is chosen among those whose info gain is at least the average info
  * gain of the valid models minus 1E-3.
  *
  * @param data the instances to select a split for
  * @return the best {@code C45Split}, the {@code NoSplit} model, or
  *         {@code null} if an exception occurred (the exception is printed
  *         and swallowed)
  */
 public final ClassifierSplitModel selectModel(Instances data){

    double minResult;
    double currentResult;
    C45Split [] currentModel;
    C45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;

    try{

      // Check if all instances belong to one class or if there are not
      // enough instances to split.

      checkDistribution = new Distribution(data);
      noSplitModel = new NoSplit(checkDistribution);
      if (Utils.sm(checkDistribution.total(),2*m_minNoObj) ||
      Utils.eq(checkDistribution.total(),
           checkDistribution.perClass(checkDistribution.maxClass())))
    return noSplitModel;

      // Check if all attributes are nominal and have a lot of values.
      if (m_allData != null) {
    Enumeration enu = data.enumerateAttributes();
    while (enu.hasMoreElements()) {
      attribute = (Attribute) enu.nextElement();
      if ((attribute.isNumeric()) ||
          (Utils.sm((double)attribute.numValues(),
            (0.3*(double)m_allData.numInstances())))){// numeric attribute, or fewer values than 0.3 * number of instances in m_allData
        multiVal = false; // not every attribute is many-valued
        break;
      }
    }
      } 

      currentModel = new C45Split[data.numAttributes()];// one slot per attribute
      sumOfWeights = data.sumOfWeights();

      // For each attribute.
      for (i = 0; i < data.numAttributes(); i++){

    // Apart from the class attribute.
    if (i != (data).classIndex()){

      // Get the model for the current attribute and build it on the data.
      currentModel[i] = new  C45Split(i,m_minNoObj,sumOfWeights);// i is the attribute index
      currentModel[i].buildClassifier(data);// evaluate a split on attribute i; its infoGain() and gainRatio() are read below

      // Check if a useful split for the current attribute exists. When
      // m_allData is set, a nominal attribute only counts towards the
      // average if multiVal holds or it has fewer values than
      // 0.3 * number of instances in m_allData.
      if (currentModel[i].checkModel())
        if (m_allData != null) {
          if ((data.attribute(i).isNumeric()) ||
          (multiVal || Utils.sm((double)data.attribute(i).numValues(),
                    (0.3*(double)m_allData.numInstances())))){
        averageInfoGain = averageInfoGain+currentModel[i].infoGain();
        validModels++;
          } 
        } else {
          averageInfoGain = averageInfoGain+currentModel[i].infoGain();
          validModels++;
        }
    }else
      currentModel[i] = null;// no model for the class attribute
      }

      // Check if any useful split was found.
      if (validModels == 0)
    return noSplitModel;
      averageInfoGain = averageInfoGain/(double)validModels;

      // Find the best attribute to split on. Despite its name, minResult
      // holds the highest gain ratio seen so far.
      minResult = 0;
      for (i=0;i<data.numAttributes();i++){
    if ((i != (data).classIndex()) &&
        (currentModel[i].checkModel()))

      // Use 1E-3 here to get a closer approximation to the original
      // implementation.
      if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&// models whose info gain is below the average (minus 1E-3) are skipped, however high their gain ratio
          Utils.gr(currentModel[i].gainRatio(),minResult)){ 
        bestModel = currentModel[i];
        minResult = currentModel[i].gainRatio();
      } 
      }

      // Check if a useful split was found.
      if (Utils.eq(minResult,0))
    return noSplitModel;

      // Add all Instances with unknown values for the corresponding
      // attribute to the distribution for the model, so that
      // the complete distribution is stored with the model. 
      bestModel.distribution().
      addInstWithUnknown(data,bestModel.attIndex());

      // Set the split point from the full data set; per the original note
      // this matters for numeric attributes (setSplitPoint is not shown here).
      if (m_allData != null)
    bestModel.setSplitPoint(m_allData);
      return bestModel;
    }catch(Exception e){
      // Any exception is printed and swallowed; the method then returns null.
      e.printStackTrace();
    }
    return null;
  }

另外,Instance.java的构造函数默认将每个实例的权重设为1:

  /**
   * Creates an instance with the given number of attributes.
   *
   * <p>Every attribute value is initialised to {@code MISSING_VALUE}, the
   * weight is set to 1, and the dataset reference is set to {@code null}.
   *
   * @param numAttributes the number of attribute values to allocate
   */
  public Instance(int numAttributes) {

    m_AttValues = new double[numAttributes];
    // Mark every attribute value as missing.
    java.util.Arrays.fill(m_AttValues, MISSING_VALUE);

    // Default weight of an instance is 1.
    m_Weight = 1;
    m_Dataset = null;
  }
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值