该样例取自《Spark高级数据分析》第四章的内容,数据集来自 http://t.cn/R2wmIsI,包含一个 CSV 格式的压缩数据文件 covtype.data.gz,并附带一个描述数据文件的信息文件 covtype.info。
Spark MLlib 将特征向量抽象为 LabeledPoint,它由一个包含多个特征值的 Spark MLlib Vector 和一个称为标号(label)的目标值组成。该目标为 Double 类型,而 Vector 本质上是对多个 Double 类型值的抽象。这说明 LabeledPoint 只适用于数值型特征;但只要经过适当编码,LabeledPoint 也可用于类别型特征。常用的一种编码是对非数值型特征取 one-hot 编码:例如天气分为晴天、阴天、下雨三类,今天天气晴朗则编码为 100。见代码:
package com.demo.rdf
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest, DecisionTree}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, DecisionTreeModel}
import org.apache.spark.rdd.RDD
import org.apache.spark
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by leslie on 16/10/26.
*/
object RunRDFs {
def main(args: Array[String]) {
  val sparkConf = new SparkConf().setAppName("DecisionTree")
  val sc = new SparkContext(sparkConf)

  // Each CSV row holds the numeric feature columns followed by the target class.
  val rawData = sc.textFile("/zhenfei1/covtype/covtype.data")
  val data = rawData.map { line =>
    val columns = line.split(",").map(_.toDouble)
    // Everything except the last column is a feature; labels in the file
    // are 1-based while MLlib expects 0-based labels.
    LabeledPoint(columns.last - 1, Vectors.dense(columns.init))
  }

  // 80/10/10 split into training / cross-validation / test sets.
  val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
  trainData.cache()
  cvData.cache()
  testData.cache()

  simpleDecisionTree(trainData, cvData)
  randomClassifier(trainData, cvData)
  evaluate(trainData, cvData, testData)
}
/**
 * Trains a single decision tree on the training set and reports its quality
 * on the cross-validation set: confusion matrix, overall precision, and
 * per-class (precision, recall) pairs.
 *
 * @param trainData training examples
 * @param cvData cross-validation examples used for evaluation
 */
def simpleDecisionTree(trainData:RDD[LabeledPoint],cvData:RDD[LabeledPoint])={
  // 7 target classes, no categorical features, entropy impurity,
  // max depth 20, 300 bins.
  val model = DecisionTree.trainClassifier(trainData,7,Map[Int,Int](),"entropy",20,300)
  val metrics = getMetrics(model,cvData)
  println("confusionMatrix : "+metrics.confusionMatrix)
  // Fix: the original label had no separator ("precision0.7..." ran together).
  println("precision : "+metrics.precision)
  // Fix: the original used map(...).foreach(println), building a throwaway
  // collection purely for a side effect; foreach alone is sufficient.
  (0 until 7).foreach(category =>
    println((metrics.precision(category),metrics.recall(category)))
  )
}
/**
 * Baseline "random guessing" classifier: predicts each class according to
 * its prior probability. The expected accuracy of such guessing is the sum
 * over classes of P_train(class) * P_cv(class); any real model must beat
 * this number to be useful.
 *
 * (Fix: the original scaladoc mislabelled this method as the random-forest
 * model; it is the probabilistic baseline, not a forest.)
 *
 * @param trainData training examples (source of the guessing priors)
 * @param cvData cross-validation examples the guesser is scored against
 */
def randomClassifier(trainData:RDD[LabeledPoint],cvData:RDD[LabeledPoint])={
  val trainPriorProbabilities = classProbabilities(trainData)
  val cvPriorProbabilities = classProbabilities(cvData)
  // Probability of a correct random guess: pair up per-class priors and sum
  // the products. (Fix: renamed the misspelled local `accurry`.)
  val accuracy = trainPriorProbabilities.zip(cvPriorProbabilities).map{
    case (trainProb,cvProb)=> trainProb*cvProb
  }.sum
  println(accuracy)
}
/**
 * Trains gradient-boosted trees over a small grid of (depth, bins) settings
 * and prints training accuracy for each configuration, best first.
 *
 * Fixes over the original version:
 *  - removed an unused `boostingStrategy` defined outside the loop that was
 *    immediately shadowed inside it;
 *  - `bins` was reported in the results but never applied to training; it is
 *    now passed via `setMaxBins`;
 *  - the `evaluations` results were computed and silently discarded; they
 *    are now printed.
 *
 * NOTE(review): MLlib GBT classification is binary-only, hence numClasses = 2,
 * but the covtype label has 7 classes — this method is only meaningful on a
 * binarized label; confirm with the caller.
 *
 * @param trainData training examples
 * @param cvData cross-validation examples (currently unused; kept for
 *               interface compatibility)
 */
def GBDTClassifier(trainData:RDD[LabeledPoint],cvData:RDD[LabeledPoint])={
  val evaluations=
    for(depth<-Array(1,20);bins<-Array(10,300)) yield {
      val boostingStrategy = BoostingStrategy.defaultParams("Classification")
      boostingStrategy.setNumIterations(100) // number of boosting iterations
      boostingStrategy.treeStrategy.setNumClasses(2) // GBT supports binary classification only
      boostingStrategy.treeStrategy.setMaxDepth(depth) // max depth of each tree
      boostingStrategy.treeStrategy.setMaxBins(bins) // fix: bins is now actually applied
      boostingStrategy.treeStrategy.setCategoricalFeaturesInfo(Map[Int,Int]())
      val model = GradientBoostedTrees.train(trainData,boostingStrategy)
      val trainAccuracy = getMetrics(model,trainData).precision
      ((depth,bins),trainAccuracy)
    }
  // Fix: results were previously computed and thrown away.
  evaluations.sortBy(_._2).reverse.foreach(println)
}
/**
 * Hyperparameter sweep for the decision tree: trains one model for every
 * combination of impurity, depth and bin count, prints CV precision for each
 * configuration (best first), then retrains the strongest configuration on
 * train+CV and reports its precision on the held-out test set and on the
 * combined training data (to expose overfitting).
 *
 * @param trainData training examples
 * @param cvData cross-validation examples used to rank configurations
 * @param testData held-out examples scored only by the final model
 */
def evaluate(trainData:RDD[LabeledPoint],cvData:RDD[LabeledPoint],testData:RDD[LabeledPoint])={
  val impurities = Array("gini","entropy")
  val depths = Array(1,20)
  val binCounts = Array(10,300)
  val evaluations =
    for {
      impurity <- impurities
      depth <- depths
      bins <- binCounts
    } yield {
      val candidate = DecisionTree.trainClassifier(trainData,7,Map[Int,Int](),impurity,depth,bins)
      val cvPrecision = getMetrics(candidate,cvData).precision
      ((impurity,depth,bins),cvPrecision)
    }
  evaluations.sortBy(_._2).reverse.foreach(println)
  // Final model: best-known settings, trained on train + CV combined.
  val model = DecisionTree.trainClassifier(trainData.union(cvData),7,Map[Int,Int](),"entropy",20,300)
  println(getMetrics(model,testData).precision)
  println(getMetrics(model,trainData.union(cvData)).precision)
}
/**
 * Reverses the one-hot encoding of the two categorical features in the
 * covtype data. One-hot encoding forces the decision tree to treat each
 * indicator column as a separate feature, increasing memory use and slowing
 * decisions; here each indicator group is collapsed back into a single
 * categorical value — the index of the column holding 1.0.
 *
 * @param rawdata raw CSV lines
 * @return labeled points with 12 features: the first 10 numeric columns plus
 *         the decoded wilderness-area and soil-type categories
 */
def unencodeOneHot(rawdata:RDD[String]):RDD[LabeledPoint]={
  rawdata.map{line=>
    val columns = line.split(',').map(_.toDouble)
    // Columns 10..13 one-hot encode the wilderness area (4 categories).
    val wildernessArea = columns.slice(10,14).indexOf(1.0).toDouble
    // Columns 14..53 one-hot encode the soil type (40 categories).
    val soilType = columns.slice(14,54).indexOf(1.0).toDouble
    val features = Vectors.dense(columns.slice(0,10):+wildernessArea:+soilType)
    // File labels are 1-based; MLlib expects 0-based.
    LabeledPoint(columns.last-1,features)
  }
}
/**
 * Hyperparameter sweep over the un-one-hot-encoded (categorical) features.
 * Features 10 and 11 are declared categorical with 4 and 40 distinct values
 * respectively, which is why the bin counts start at 40 (bins must be at
 * least as large as the largest category count). Prints (train, CV)
 * precision per configuration, best CV precision first.
 *
 * @param rawdata raw CSV lines
 */
def evaluateCategorical(rawdata:RDD[String])={
  val data = unencodeOneHot(rawdata)
  val Array(trainData,cvData,testData) = data.randomSplit(Array(0.8,0.1,0.1))
  trainData.cache()
  cvData.cache()
  testData.cache()
  val evaluations =
    for {
      impurity <- Array("gini","entropy")
      depth <- Array(10,20,30)
      bins <- Array(40,300)
    } yield {
      // Feature 10: wilderness area (4 categories); feature 11: soil type (40).
      val model = DecisionTree.trainClassifier(trainData,7,Map(10->4,11->40),impurity,depth,bins)
      val trainAccuracy = getMetrics(model,trainData).precision
      val cvAccuracy = getMetrics(model,cvData).precision
      ((impurity,depth,bins),(trainAccuracy,cvAccuracy))
    }
  // Rank by CV accuracy, descending.
  evaluations.sortBy(_._2._2).reverse.foreach(println)
}
/**
 * Trains the decision tree on train+CV with the best-known settings
 * (entropy, depth 30, 300 bins) and prints its precision on the held-out
 * test set.
 *
 * NOTE(review): despite operating on the un-one-hot-encoded data this call
 * passes an empty categoricalFeaturesInfo map, so features 10/11 are treated
 * as numeric — confirm whether Map(10 -> 4, 11 -> 40) was intended.
 *
 * @param rawData raw CSV lines
 */
def testCategorical(rawData:RDD[String])={
  val splits = unencodeOneHot(rawData).randomSplit(Array(0.8,0.1,0.1))
  val Array(trainData,cvData,testData) = splits
  trainData.cache()
  cvData.cache()
  testData.cache()
  val model = DecisionTree.trainClassifier(trainData.union(cvData),7,Map[Int,Int](),"entropy",30,300)
  println(getMetrics(model,testData).precision)
}
/**
 * Trains a 20-tree random forest on the categorical-feature data, prints its
 * precision on the CV set, then classifies one hand-crafted example vector.
 *
 * @param rawData raw CSV lines
 */
def evaluateForest(rawData:RDD[String])={
  val data = unencodeOneHot(rawData)
  val Array(trainData,cvData,testData) = data.randomSplit(Array(0.8,0.1,0.1))
  trainData.cache()
  cvData.cache()
  testData.cache()
  // 7 classes; features 10/11 categorical (4 and 40 values); 20 trees;
  // "auto" feature-subset strategy; entropy impurity; depth 30; 300 bins.
  val forest = RandomForest.trainClassifier(
    trainData, 7, Map(10 -> 4, 11 -> 40), 20, "auto", "entropy", 30, 300)
  val predictionsAndLabels = cvData.map { example =>
    (forest.predict(example.features), example.label)
  }
  println(new MulticlassMetrics(predictionsAndLabels).precision)
  // Classify a single hand-written 12-feature example.
  val input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
  val vector = Vectors.dense(input.split(",").map(_.toDouble))
  println(forest.predict(vector))
}
/**
 * Computes the prior probability of each class in the data set.
 *
 * @param data labeled examples
 * @return per-class fraction of examples, ordered by ascending class label
 */
def classProbabilities(data:RDD[LabeledPoint]):Array[Double]={
  val countByCategory = data.map(_.label).countByValue()
  // Sort by label so index i corresponds to class i.
  val counts = countByCategory.toArray.sortBy(_._1).map(_._2)
  // Fix: the original recomputed counts.sum inside the map for every
  // element (accidental O(n^2)); hoist the total once.
  val total = counts.sum.toDouble
  counts.map(_/total)
}
/**
 * Builds binary-classification metrics for a gradient-boosted-trees model
 * scored against the given data.
 *
 * @param model trained GBT model
 * @param data labeled examples to score
 * @return binary metrics over (prediction, label) pairs
 */
def getMetric(model:GradientBoostedTreesModel,data:RDD[LabeledPoint]):BinaryClassificationMetrics={
  val scored = data.map(point => (model.predict(point.features), point.label))
  new BinaryClassificationMetrics(scored)
}
/**
 * Builds multiclass metrics for a gradient-boosted-trees model scored
 * against the given data.
 *
 * Fix: the original also constructed a BinaryClassificationMetrics and
 * immediately discarded it — a useless (and potentially costly) extra pass.
 *
 * @param model trained GBT model
 * @param data labeled examples to score
 * @return multiclass metrics over (prediction, label) pairs
 */
def getMetrics(model:GradientBoostedTreesModel,data:RDD[LabeledPoint]):MulticlassMetrics={
  val predictionAndLabels = data.map{point=>
    (model.predict(point.features),point.label)
  }
  new MulticlassMetrics(predictionAndLabels)
}
/**
 * Builds multiclass metrics for a decision-tree model scored against the
 * given data.
 *
 * @param model trained decision tree
 * @param data labeled examples to score
 * @return multiclass metrics over (prediction, label) pairs
 */
def getMetrics(model:DecisionTreeModel,data:RDD[LabeledPoint]):MulticlassMetrics={
  val predictionAndLabels = data.map(point => (model.predict(point.features), point.label))
  new MulticlassMetrics(predictionAndLabels)
}
}