Spark MLlib 决策树算法

该样例取自《Spark 高级数据分析》第四章的内容,数据集来自 http://t.cn/R2wmIsI ,包含一个 CSV 格式的压缩数据文件 covtype.data.gz,并附带一个描述数据文件的信息文件 covtype.info。
Spark MLlib 将特征向量抽象为 LabeledPoint,它由一个包含多个特征值的 Spark MLlib Vector 和一个称为标号(label)的目标值组成。该目标为 Double 类型,而 Vector 本质上是对多个 Double 类型值的抽象。这说明 LabeledPoint 只适用于数值型特征,但只要经过适当编码,LabeledPoint 也可用于类别型特征。一种常用的做法是对非数值型(类别型)特征采用 one-hot 编码:例如天气分为晴天、阴天、下雨三类,今天天气晴朗则编码为 (1,0,0)。见代码:

package com.demo.rdf

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest, DecisionTree}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, DecisionTreeModel}
import org.apache.spark.rdd.RDD
import org.apache.spark
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by leslie on 16/10/26.
  *
  * Decision-tree, random-forest and gradient-boosted-tree experiments built on the
  * RDD-based Spark MLlib API. Input lines are comma-separated numeric values whose
  * last column is the class label.
  */
object RunRDFs {

  /** Number of target classes passed to every multiclass trainer in this file. */
  private val NumClasses = 7

  /**
    * Arity of the categorical features produced by [[unencodeOneHot]]:
    * feature 10 is built from 4 one-hot columns, feature 11 from 40.
    */
  private val CategoricalFeaturesInfo = Map(10 -> 4, 11 -> 40)

  /** Relative sizes of the train / cross-validation / test splits. */
  private val SplitWeights = Array(0.8, 0.1, 0.1)

  /**
    * Entry point: parses the data file into labeled points (keeping the one-hot
    * columns as plain numeric features), splits it and runs the decision-tree
    * experiments.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DecisionTree")
    val sc = new SparkContext(conf)
    try {
      val rawData = sc.textFile("/zhenfei1/covtype/covtype.data")

      val data = rawData.map { line =>
        val values = line.split(",").map(_.toDouble)
        // The last column is the label; subtracting 1 makes it zero-based.
        LabeledPoint(values.last - 1, Vectors.dense(values.init))
      }

      withCachedSplits(data) { (trainData, cvData, testData) =>
        simpleDecisionTree(trainData, cvData)
        randomClassifier(trainData, cvData)
        evaluate(trainData, cvData, testData)
      }
    } finally {
      // Always release the context, even when one of the experiments fails.
      sc.stop()
    }
  }

  /**
    * Splits `data` into train / cross-validation / test sets, caches the three
    * splits while `body` runs and unpersists them afterwards.
    */
  private def withCachedSplits(data: RDD[LabeledPoint])(
      body: (RDD[LabeledPoint], RDD[LabeledPoint], RDD[LabeledPoint]) => Unit): Unit = {
    val Array(trainData, cvData, testData) = data.randomSplit(SplitWeights)
    val splits = Seq(trainData, cvData, testData)
    splits.foreach(_.cache())
    try body(trainData, cvData, testData)
    finally splits.foreach(_.unpersist())
  }

  /**
    * Trains a single decision tree (entropy impurity, depth 20, 300 bins) and prints
    * the confusion matrix, the overall precision and the per-class
    * (precision, recall) pairs measured on the cross-validation set.
    *
    * @param trainData training set
    * @param cvData    cross-validation set used for the reported metrics
    */
  def simpleDecisionTree(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val model = DecisionTree.trainClassifier(trainData, NumClasses, Map[Int, Int](), "entropy", 20, 300)
    val metrics = getMetrics(model, cvData)
    println("confusionMatrix : "+metrics.confusionMatrix)
    println("precision"+metrics.precision)
    (0 until NumClasses)
      .map(label => (metrics.precision(label), metrics.recall(label)))
      .foreach(println)
  }

  /**
    * Random-guess baseline (no model is trained): prints the expected accuracy of a
    * classifier that picks each class with its prior probability in the training
    * set, i.e. the sum over classes of P_train(class) * P_cv(class).
    *
    * @param trainData training set supplying the guessing distribution
    * @param cvData    cross-validation set supplying the true class distribution
    */
  def randomClassifier(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val trainPriors = classProbabilitiesByLabel(trainData)
    val cvPriors = classProbabilitiesByLabel(cvData)
    // Join on the label rather than zipping two sorted arrays: zipping pairs up the
    // wrong classes as soon as one label is absent from either split.
    val accuracy = trainPriors.toSeq
      .sortBy(_._1)
      .map { case (label, trainProb) => trainProb * cvPriors.getOrElse(label, 0.0) }
      .sum
    println(accuracy)
  }

  /**
    * Sweeps tree depth and bin count for a gradient-boosted-trees classifier
    * (100 iterations) and prints ((depth, bins), training precision), best first.
    *
    * The boosting strategy is configured for 2 classes, so `trainData` must be a
    * binary problem; the 7-class data prepared in `main` is not suitable input.
    *
    * @param trainData binary-labeled training set
    * @param cvData    currently unused; only the training precision is reported
    */
  def GBDTClassifier(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val evaluations =
      for (depth <- Array(1, 20); bins <- Array(10, 300)) yield {
        val boostingStrategy = BoostingStrategy.defaultParams("Classification")
        boostingStrategy.setNumIterations(100) // number of boosting iterations
        boostingStrategy.treeStrategy.setNumClasses(2) // number of classes
        boostingStrategy.treeStrategy.setMaxDepth(depth) // maximum depth of each tree
        // Apply the swept bin count; previously `bins` only appeared in the result
        // key, so every reported configuration actually used the default value.
        boostingStrategy.treeStrategy.setMaxBins(bins)
        boostingStrategy.treeStrategy.setCategoricalFeaturesInfo(Map[Int, Int]())

        val model = GradientBoostedTrees.train(trainData, boostingStrategy)
        ((depth, bins), getMetrics(model, trainData).precision)
      }
    // Report the sweep the same way as the decision-tree sweeps instead of
    // silently discarding the results.
    evaluations.sortBy(_._2).reverse.foreach(println)
  }

  /**
    * Decision-tree tuning: trains one tree per (impurity, depth, bins) combination,
    * prints the cross-validation precision of each (best first), then trains a final
    * tree on train + cv and prints its precision on the test set followed by its
    * precision on the data it was trained on.
    *
    * @param trainData training set
    * @param cvData    cross-validation set
    * @param testData  held-out test set
    */
  def evaluate(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint], testData: RDD[LabeledPoint]): Unit = {
    val evaluations =
      for {
        impurity <- Array("gini", "entropy")
        depth <- Array(1, 20)
        bins <- Array(10, 300)
      } yield {
        val model = DecisionTree.trainClassifier(trainData, NumClasses, Map[Int, Int](), impurity, depth, bins)
        ((impurity, depth, bins), getMetrics(model, cvData).precision)
      }
    evaluations.sortBy(_._2).reverse.foreach(println)

    val trainAndCv = trainData.union(cvData)
    val model = DecisionTree.trainClassifier(trainAndCv, NumClasses, Map[Int, Int](), "entropy", 20, 300)
    println(getMetrics(model, testData).precision)
    println(getMetrics(model, trainAndCv).precision)
  }

  /**
    * Undoes the one-hot encoding of the two categorical features. One-hot columns
    * force the tree to consider every category value as a separate feature, which
    * costs memory and slows training; here columns 10-13 and 14-53 are each
    * collapsed into a single category index.
    *
    * The resulting vector has 12 features: the first 10 columns unchanged, then the
    * wilderness index (feature 10) and the soil index (feature 11). An index of -1
    * is produced when none of the corresponding one-hot columns equals 1.0.
    *
    * @param rawdata comma-separated input lines
    * @return labeled points with zero-based labels and 12 features
    */
  def unencodeOneHot(rawdata: RDD[String]): RDD[LabeledPoint] = {
    rawdata.map { line =>
      val values = line.split(',').map(_.toDouble)
      val wilderness = values.slice(10, 14).indexOf(1.0).toDouble
      val soil = values.slice(14, 54).indexOf(1.0).toDouble
      val features = Vectors.dense(values.slice(0, 10) :+ wilderness :+ soil)
      LabeledPoint(values.last - 1, features)
    }
  }

  /**
    * Tunes a decision tree on the categorical (un-one-hot-encoded) features and
    * prints ((impurity, depth, bins), (train precision, cv precision)) for every
    * combination, sorted by cv precision, best first.
    *
    * @param rawdata comma-separated input lines
    */
  def evaluateCategorical(rawdata: RDD[String]): Unit =
    withCachedSplits(unencodeOneHot(rawdata)) { (trainData, cvData, _) =>
      val evaluations =
        for {
          impurity <- Array("gini", "entropy")
          depth <- Array(10, 20, 30)
          bins <- Array(40, 300)
        } yield {
          val model = DecisionTree.trainClassifier(
            trainData, NumClasses, CategoricalFeaturesInfo, impurity, depth, bins)
          val trainAccuracy = getMetrics(model, trainData).precision
          val cvAccuracy = getMetrics(model, cvData).precision
          ((impurity, depth, bins), (trainAccuracy, cvAccuracy))
        }
      evaluations.sortBy(_._2._2).reverse.foreach(println)
    }

  /**
    * Trains a decision tree (entropy, depth 30, 300 bins) on train + cv of the
    * categorical data and prints its precision on the test set.
    *
    * @param rawData comma-separated input lines
    */
  def testCategorical(rawData: RDD[String]): Unit =
    withCachedSplits(unencodeOneHot(rawData)) { (trainData, cvData, testData) =>
      // Features 10 and 11 are category indices, so they must be declared as
      // categorical; with an empty map the tree would treat them as ordered numbers.
      val model = DecisionTree.trainClassifier(
        trainData.union(cvData), NumClasses, CategoricalFeaturesInfo, "entropy", 30, 300)
      println(getMetrics(model, testData).precision)
    }

  /**
    * Trains a 20-tree random forest on the categorical data, prints its precision on
    * the cross-validation set and then its prediction for one hard-coded example.
    *
    * @param rawData comma-separated input lines
    */
  def evaluateForest(rawData: RDD[String]): Unit =
    withCachedSplits(unencodeOneHot(rawData)) { (trainData, cvData, _) =>
      val forest = RandomForest.trainClassifier(
        trainData, NumClasses, CategoricalFeaturesInfo, 20, "auto", "entropy", 30, 300)
      val predictionsAndLabels = cvData.map { example =>
        (forest.predict(example.features), example.label)
      }
      println(new MulticlassMetrics(predictionsAndLabels).precision)

      // A single sample already in the 12-feature layout of unencodeOneHot.
      val input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
      val vector = Vectors.dense(input.split(",").map(_.toDouble))
      println(forest.predict(vector))
    }

  /**
    * Fraction of examples per class, ordered by ascending label value. Labels that do
    * not occur in `data` have no entry, so the array can be shorter than the number
    * of classes.
    *
    * @param data labeled examples
    * @return class proportions, one per label present in `data`
    */
  def classProbabilities(data: RDD[LabeledPoint]): Array[Double] =
    classProbabilitiesByLabel(data).toArray.sortBy(_._1).map(_._2)

  /** Fraction of examples per class, keyed by label; the total is computed once. */
  private def classProbabilitiesByLabel(data: RDD[LabeledPoint]): Map[Double, Double] = {
    val countsByLabel = data.map(_.label).countByValue()
    val total = countsByLabel.values.sum.toDouble
    countsByLabel.map { case (label, count) => label -> count / total }.toMap
  }

  /**
    * Binary-classification metrics for a gradient-boosted-trees model
    * (originally written to evaluate a personalised-push model).
    *
    * @param model trained GBT model
    * @param data  labeled examples to score
    * @return metrics built from (prediction, label) pairs
    */
  def getMetric(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): BinaryClassificationMetrics = {
    val predictionsAndLabels = data.map { point =>
      (model.predict(point.features), point.label)
    }
    new BinaryClassificationMetrics(predictionsAndLabels)
  }

  /**
    * Multiclass metrics for a gradient-boosted-trees model.
    *
    * @param model trained GBT model
    * @param data  labeled examples to score
    * @return metrics built from (prediction, label) pairs
    */
  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { point =>
      (model.predict(point.features), point.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }

  /**
    * Multiclass metrics for a decision-tree model.
    *
    * @param model trained decision tree
    * @param data  labeled examples to score
    * @return metrics built from (prediction, label) pairs
    */
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { point =>
      (model.predict(point.features), point.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值