A certain blog post (linked) covers some of the harder-to-understand parts of Spark MLlib, but while studying it I found an error: it claims that for a continuous feature with n distinct values, the number of bins is n + 1 and the number of splits is n. When I checked the source code I found a discrepancy, reported it to the blog's author, and the author has since verified it. In the source, DecisionTreeMetadata.scala initializes the numBins array as follows:
val maxPossibleBins = math.min(strategy.maxBins, numExamples).toInt
val numBins = Array.fill[Int](numFeatures)(maxPossibleBins) // every feature's bin count is initialized to maxPossibleBins
From the above we can see that each feature's initial bin count is either maxBins (the parameter we set when configuring the program) or the number of training examples, whichever is smaller. The code never modifies this value for continuous features afterwards; only categorical features are adjusted: in the unordered case the bin count is 2^n - 2, and in the ordered case it is n.
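As a quick standalone check of those two initialization lines (the feature and example counts here are made up for illustration, not taken from the article):

```scala
object NumBinsInitSketch {
  // Mirrors the two lines quoted above: the initial bin count is the same
  // for every feature, capped by the number of training examples.
  def initNumBins(numFeatures: Int, maxBins: Int, numExamples: Long): Array[Int] = {
    val maxPossibleBins = math.min(maxBins.toLong, numExamples).toInt
    Array.fill[Int](numFeatures)(maxPossibleBins)
  }

  def main(args: Array[String]): Unit = {
    // Plenty of examples: maxBins (32) wins, every feature starts at 32 bins.
    println(initNumBins(5, 32, 10000L).mkString(","))
    // Only 20 examples: the cap drops to 20 -- this is exactly the case where
    // the "reducing maxBins" warning in buildMetadata (below) fires.
    println(initNumBins(5, 32, 20L).mkString(","))
  }
}
```

The full buildMetadata method where this initialization lives is quoted next.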
def buildMetadata(
    input: RDD[LabeledPoint],
    strategy: Strategy,
    numTrees: Int,
    featureSubsetStrategy: String): DecisionTreeMetadata = {

  val numFeatures = input.map(_.features.size).take(1).headOption.getOrElse {
    throw new IllegalArgumentException(s"DecisionTree requires size of input RDD > 0, " +
      s"but was given by empty one.")
  }
  val numExamples = input.count()
  val numClasses = strategy.algo match {
    case Classification => strategy.numClasses
    case Regression => 0
  }

  val maxPossibleBins = math.min(strategy.maxBins, numExamples).toInt
  if (maxPossibleBins < strategy.maxBins) { // (blog note: which "number of training instances" does this refer to?)
    logWarning(s"DecisionTree reducing maxBins from ${strategy.maxBins} to $maxPossibleBins" +
      s" (= number of training instances)")
  }

  // We check the number of bins here against maxPossibleBins.
  // This needs to be checked here instead of in Strategy since maxPossibleBins can be modified
  // based on the number of training examples.
  if (strategy.categoricalFeaturesInfo.nonEmpty) {
    val maxCategoriesPerFeature = strategy.categoricalFeaturesInfo.values.max // the largest arity among all categorical features
    val maxCategory =
      strategy.categoricalFeaturesInfo.find(_._2 == maxCategoriesPerFeature).get._1
    require(maxCategoriesPerFeature <= maxPossibleBins,
      s"DecisionTree requires maxBins (= $maxPossibleBins) to be at least as large as the " +
      s"number of values in each categorical feature, but categorical feature $maxCategory " +
      s"has $maxCategoriesPerFeature values. Considering remove this and other categorical " +
      "features with a large number of values, or add more training examples.")
  }

  val unorderedFeatures = new mutable.HashSet[Int]()
  val numBins = Array.fill[Int](numFeatures)(maxPossibleBins) // every feature's bin count starts at maxPossibleBins
  if (numClasses > 2) { // this if/else only adjusts categorical features, and only multiclass does the ordered/unordered check
    // Multiclass classification
    val maxCategoriesForUnorderedFeature =
      ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt
    strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) =>
      // Hack: If a categorical feature has only 1 category, we treat it as continuous.
      // (The map entries matched here have the form featureIndex -> numCategories.)
      // TODO(SPARK-9957): Handle this properly by filtering out those features.
      if (numCategories > 1) {
        // Decide if some categorical features should be treated as unordered features,
        // which require 2 * ((1 << numCategories - 1) - 1) bins.
        // We do this check with log values to prevent overflows in case numCategories is large.
        // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins
        if (numCategories <= maxCategoriesForUnorderedFeature) { // arity within the threshold: treat as unordered
          unorderedFeatures.add(featureIndex) // record this feature's index as unordered
          numBins(featureIndex) = numUnorderedBins(numCategories) // multiclass, unordered: 2^numCategories - 2 bins
        } else {
          numBins(featureIndex) = numCategories // multiclass, ordered: one bin per category value
        }
      }
    }
  } else {
    // Binary classification or regression
    strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) =>
      // If a categorical feature has only 1 category, we treat it as continuous: SPARK-9957
      if (numCategories > 1) {
        numBins(featureIndex) = numCategories // binary classification / regression: bins = number of distinct values
      }
    }
  }

  // Set number of features to use per node (for random forests).
  val _featureSubsetStrategy = featureSubsetStrategy match {
    case "auto" =>
      if (numTrees == 1) {
        "all"
      } else {
        if (strategy.algo == Classification) {
          "sqrt"
        } else {
          "onethird"
        }
      }
    case _ => featureSubsetStrategy
  }
  val numFeaturesPerNode: Int = _featureSubsetStrategy match {
    case "all" => numFeatures
    case "sqrt" => math.sqrt(numFeatures).ceil.toInt
    case "log2" => math.max(1, (math.log(numFeatures) / math.log(2)).ceil.toInt)
    case "onethird" => (numFeatures / 3.0).ceil.toInt
  }

  new DecisionTreeMetadata(numFeatures, numExamples, numClasses, numBins.max,
    strategy.categoricalFeaturesInfo, unorderedFeatures.toSet, numBins,
    strategy.impurity, strategy.quantileCalculationStrategy, strategy.maxDepth,
    strategy.minInstancesPerNode, strategy.minInfoGain, numTrees, numFeaturesPerNode)
}
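The maxCategoriesForUnorderedFeature expression in the multiclass branch can be checked numerically. It comes from solving 2 * (2^(M-1) - 1) <= maxPossibleBins for the arity M, giving M <= floor(log2(maxPossibleBins / 2 + 1) + 1). A standalone sketch (the maxBins value of 32 is just an example):

```scala
object UnorderedThresholdSketch {
  // Same expression as in buildMetadata: the largest arity M for which an
  // unordered split is affordable, i.e. 2 * (2^(M-1) - 1) <= maxPossibleBins.
  def maxCategoriesForUnordered(maxPossibleBins: Int): Int =
    ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt

  // Bin count for an unordered categorical feature with m values: 2^m - 2.
  def numUnorderedBins(m: Int): Int = 2 * ((1 << (m - 1)) - 1)

  def main(args: Array[String]): Unit = {
    val maxBins = 32
    val t = maxCategoriesForUnordered(maxBins) // floor(log2(32/2 + 1) + 1) = 5
    println(t)
    println(numUnorderedBins(t))     // 30 bins, still within maxBins
    println(numUnorderedBins(t + 1)) // 62 bins, too many -- such a feature stays ordered
  }
}
```

So with maxBins = 32, a categorical feature is treated as unordered only up to 5 categories; at 6 categories the 2^6 - 2 = 62 bins would exceed the budget.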
As the code above shows, numBins is only overwritten for categorical features, and the ordered/unordered distinction is only made in the multiclass case.
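To make that point concrete, here is a toy re-run of the multiclass branch on an assumed feature set (maxPossibleBins = 32, four features, with features 1 and 2 categorical at 3 and 10 values and the rest continuous; these numbers are illustrative, not from the article):

```scala
object NumBinsSimulation {
  // Recomputes the numBins array the way the multiclass branch of
  // buildMetadata does, for a given maxPossibleBins and categorical map.
  def computeNumBins(numFeatures: Int,
                     maxPossibleBins: Int,
                     categoricalFeaturesInfo: Map[Int, Int]): Array[Int] = {
    val maxCategoriesForUnordered =
      ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt
    val numBins = Array.fill[Int](numFeatures)(maxPossibleBins)
    categoricalFeaturesInfo.foreach { case (i, m) =>
      if (m > 1) {
        numBins(i) =
          if (m <= maxCategoriesForUnordered) 2 * ((1 << (m - 1)) - 1) // unordered: 2^m - 2
          else m                                                       // ordered: m
      }
    }
    numBins
  }

  def main(args: Array[String]): Unit = {
    val bins = computeNumBins(4, 32, Map(1 -> 3, 2 -> 10))
    // Continuous features 0 and 3 keep maxPossibleBins untouched -- which is
    // exactly the point of this article.
    println(bins.mkString(","))
  }
}
```

Feature 1 (3 values, unordered) gets 2^3 - 2 = 6 bins, feature 2 (10 values, above the threshold of 5, so ordered) gets 10 bins, and the two continuous features stay at 32.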