Spark大数据-ml的逻辑斯蒂回归和决策树解决分类问题

最新推荐文章于 2021-02-15 14:19:16 发布

chenbengang

最新推荐文章于 2021-02-15 14:19:16 发布

阅读量503

点赞数

分类专栏：Spark大数据　文章标签：大数据、决策树、机器学习、spark

本文链接：https://blog.csdn.net/chenbengang/article/details/103783540

版权

Spark大数据专栏收录该内容

20 篇文章 0 订阅

订阅专栏

ml的逻辑斯蒂回归和决策树解决分类问题

1.逻辑斯蒂回归解决分类问题

使用ml库的逻辑斯蒂回归解决鸢尾花的二分类问题：

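// Binary logistic regression on the iris data: the Iris-setosa rows are filtered out below, and all four measurements are used as features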
import org.apache.spark.sql.Row

import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer,HashingTF,Tokenizer}

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary,LogisticRegression}

import org.apache.spark.sql.functions
import org.apache.spark.sql.SparkSession

// Obtain the SparkSession for the logistic-regression example, reusing an existing one if present.
val spark = SparkSession.
  builder().
  appName("LogisticRegressionClass").
  getOrCreate()
// Bring the session's implicit encoders and conversions (needed by toDF and Dataset.map) into scope.
import spark.implicits._
// One iris sample: its feature vector and the species name used as the label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)
// Load the iris text file from the local file system. Each non-blank line is split on commas;
// the first four fields are parsed as doubles (the features) and the fifth is kept as the label.
// Blank lines (for example a trailing newline at the end of the file) are skipped, because
// parsing them with toDouble would throw a NumberFormatException and fail the whole job.
val data = spark.
  sparkContext.
  textFile("file:///home/chenbengang/ziyu_bigdata/quick_learn_spark/iris.txt").
  filter(_.trim.nonEmpty).
  map(_.split(",")).
  map(p => Iris(Vectors.dense(p(0).toDouble, p(1).toDouble, p(2).toDouble, p(3).toDouble), p(4))).
  toDF()
data.show()
// Register the DataFrame as a temporary view so it can be queried with SQL.
data.createOrReplaceTempView("iris")
// Drop the Iris-setosa rows so the remaining data can be used for binary classification.
val df = spark.sql("select * from iris where label!='Iris-setosa'")
// Print every remaining sample as "label:features". String interpolation is used instead of
// `Any + String`, which depends on the deprecated any2stringadd conversion.
df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)

// Fit an indexer that converts the string "label" column into the numeric "indexedLabel" column.
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(df)
// Fit the indexer that converts the "features" column into "indexedFeatures".
// It is defined exactly once: a second `val featureIndexer` would be a duplicate-definition
// compile error outside the line-by-line REPL and would only repeat the same fit inside it.
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  fit(df)

// Randomly split the filtered data using weights 0.7 (training) and 0.3 (test).
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3))
// Configure the logistic-regression estimator: it reads the indexed label and feature columns,
// runs at most 10 iterations, and uses regParam 0.3 with elasticNetParam 0.8.
val lr = new LogisticRegression().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures").
  setMaxIter(10).
  setRegParam(0.3).
  setElasticNetParam(0.8)
// Convert the numeric "prediction" column back into the original label strings,
// using the labels learned by labelIndexer.
val labelConvert = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictionLabel").
  setLabels(labelIndexer.labels)
// Assemble the stages in execution order: label indexing, feature indexing, model, label conversion.
val lrPipeline = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, lr, labelConvert))
// Fit the whole pipeline on the training split.
val lrPipelineModel = lrPipeline.fit(trainingData)
// Apply the fitted pipeline to the held-out test split.
val lrPredictions = lrPipelineModel.transform(testData)
// Print one line per test sample: true label, features, class probabilities and predicted label.
for (row <- lrPredictions.select("predictionLabel", "label", "features", "probability").collect()) {
  row match {
    case Row(predictionLabel: String, label: String, features: Vector, prob: Vector) =>
      println(s"($label,$features)--> prob=$prob,predictedLabel=$predictionLabel")
  }
}
// Evaluate the predictions against the indexed labels.
// The metric is set to "accuracy" explicitly: without setMetricName the evaluator falls back to
// its default metric (f1), so the value named lrAccuracy and the printed "TestError"
// (1 - accuracy) would not actually be based on accuracy.
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").
  setPredictionCol("prediction").
  setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println(s"TestError=${1.0 - lrAccuracy}")
// Pull the fitted logistic-regression model out of the pipeline; it is the third stage (index 2),
// matching the order in which the stages were passed to the Pipeline.
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
// Report the learned coefficients, intercepts, and the class and feature counts.
println(
  Seq(
    s"Multinomial coefficients: ${lrModel.coefficientMatrix}",
    s"Multinomial intercepts: ${lrModel.interceptVector}",
    s"numClasses: ${lrModel.numClasses}",
    s"numFeatures: ${lrModel.numFeatures}"
  ).mkString(" \n")
)

2.决策树解决分类问题

使用ml库的决策树解决鸢尾花的分类问题：

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

// Obtain a SparkSession with master "local" for the decision-tree example.
val spark = SparkSession.
  builder().
  master("local").
  appName("decsion-tree").
  getOrCreate()
// Bring the session's implicit encoders and conversions (needed by toDF and Dataset.map) into scope.
import spark.implicits._
// 2. Read the data and take a quick look at it.
// One iris sample: its feature vector and the species name used as the label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)
// Load the iris text file. The explicit file:// scheme pins the path to the local file system
// (as the logistic-regression example does) instead of whatever default file system is configured.
// Blank lines (for example a trailing newline) are skipped because parsing them with toDouble
// would throw a NumberFormatException. Each remaining line is split on commas: four numeric
// features followed by the label.
val data = spark.
  sparkContext.
  textFile("file:///home/chenbengang/ziyu_bigdata/quick_learn_spark/iris.txt").
  filter(_.trim.nonEmpty).
  map(_.split(",")).
  map(p => Iris(Vectors.dense(p(0).toDouble, p(1).toDouble, p(2).toDouble, p(3).toDouble), p(4))).
  toDF()
data.show()
// Register the DataFrame as a temporary view so it can be queried with SQL.
data.createOrReplaceTempView("iris")
// Select every row; all classes are kept for the decision tree.
val df = spark.sql("select * from iris")
df.show()
// Print every sample as "label:features". String interpolation is used instead of
// `Any + String`, which depends on the deprecated any2stringadd conversion.
df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)
// 3. Prepare the label and feature columns and split the data.
// Fit a StringIndexer that converts the string "label" column into "indexedLabel".
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(df)
// Fit a VectorIndexer (maxCategories = 4) that converts "features" into "indexedFeatures".
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(4).
  fit(df)
// Convert the numeric "prediction" column back into label strings, using labelIndexer's labels.
val labelConverter = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictedLabel").
  setLabels(labelIndexer.labels)
// Randomly split the data using weights 0.7 (training) and 0.3 (test).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// 构建决策树分类模型
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// Decision-tree estimator that reads the indexed label and feature columns.
val dtClassifier = new DecisionTreeClassifier().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures")
// Pipeline stages in execution order: label indexing, feature indexing, classifier, label conversion.
val pipelinedClassifier = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, dtClassifier, labelConverter))
// Fit the whole pipeline on the training split.
val modelClassifier = pipelinedClassifier.fit(trainingData)
// Apply the fitted pipeline to the test split and show up to 20 predictions.
val predictionClassifier = modelClassifier.transform(testData)
predictionClassifier.
  select("predictedLabel", "label", "features").
  show(20)
// 5. Evaluate the decision-tree classifier.
// Multiclass evaluator comparing "prediction" with "indexedLabel", with accuracy as the metric.
val evaluatorClassifier = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").
  setPredictionCol("prediction").
  setMetricName("accuracy")
// Compute the accuracy on the test predictions and report the complementary error rate.
val acurracy = evaluatorClassifier.evaluate(predictionClassifier)
println(s"test error= ${1.0 - acurracy}")
// Pull the fitted decision-tree model out of the pipeline; it is the third stage (index 2),
// matching the order in which the stages were passed to the Pipeline. Then print its debug string.
val treeModelClassifier = modelClassifier.
  stages(2).
  asInstanceOf[DecisionTreeClassificationModel]
println(s"Learned classification tree model:\n${treeModelClassifier.toDebugString}")