ml的逻辑斯蒂回归和决策树解决分类问题
1.逻辑斯蒂回归解决分类问题
使用ml库的逻辑斯蒂回归解决鸢尾花的二分类问题:
// 二分类逻辑斯蒂回归:使用全部四个数值特征,只保留两个类别的数据
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.ml.feature.{IndexToString,StringIndexer,VectorIndexer,HashingTF,Tokenizer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary,LogisticRegression}
import org.apache.spark.sql.functions
import org.apache.spark.sql.SparkSession
// Obtain (or reuse) the SparkSession for this example.
val spark = SparkSession.builder.appName("LogisticRegressionClass").getOrCreate()
// Brings the encoders and the `toDF` conversion used below into scope.
import spark.implicits._

// One input record: a dense feature vector plus its class label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// Load the data. Every non-blank line is parsed as four numeric fields followed
// by the class label, separated by commas.
// Blank lines (for example a trailing newline at the end of the file) are
// skipped; without the filter they would reach `toDouble` and abort the job.
val data = spark.sparkContext.
  textFile("file:///home/chenbengang/ziyu_bigdata/quick_learn_spark/iris.txt").
  filter(_.trim.nonEmpty).
  map(_.split(",")).
  map(p => Iris(Vectors.dense(p(0).toDouble, p(1).toDouble, p(2).toDouble, p(3).toDouble), p(4))).
  toDF()
data.show()

// Register the DataFrame as a temporary view so it can be queried with SQL.
data.createOrReplaceTempView("iris")

// Drop the 'Iris-setosa' rows. Assuming the usual three-class Iris data, this
// leaves two classes, i.e. a binary classification problem.
val df = spark.sql("select * from iris where label!='Iris-setosa'")

// Print every remaining row as "label:features".
df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)
// Map the string labels to numeric indices, written to column "indexedLabel".
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(df)
// Index the feature vectors, written to column "indexedFeatures".
// Defined exactly once: a second `val featureIndexer` does not compile outside
// the REPL and, inside the REPL, only repeats the same fit.
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  fit(df)
// Randomly split the filtered data with weights 0.7 (training) and 0.3 (test).
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3))

// Logistic-regression estimator reading the indexed label and feature columns:
// at most 10 iterations, regularization 0.3, elastic-net mixing 0.8.
val lr = new LogisticRegression().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures").
  setMaxIter(10).
  setRegParam(0.3).
  setElasticNetParam(0.8)

// Converts the numeric "prediction" column back into the original label
// strings, using the labels learned by labelIndexer.
val labelConvert = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictionLabel").
  setLabels(labelIndexer.labels)

// Pipeline stages, in order: index labels, index features, classify,
// then convert predictions back to label strings.
val lrPipeline = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, lr, labelConvert))
// Fit the whole pipeline on the training split.
val lrPipelineModel = lrPipeline.fit(trainingData)

// Apply the fitted pipeline to the held-out test split.
val lrPredictions = lrPipelineModel.transform(testData)

// Print one line per test row: true label, features, class probabilities
// and the predicted label.
lrPredictions.
  select("predictionLabel", "label", "features", "probability").
  collect().
  foreach {
    case Row(predicted: String, actual: String, featureVec: Vector, classProbs: Vector) =>
      println(s"($actual,$featureVec)--> prob=$classProbs,predictedLabel=$predicted")
  }
// Evaluate the predictions against the indexed labels.
// The metric is set to "accuracy" explicitly: the evaluator's default metric
// is F1, which would make `1.0 - lrAccuracy` below something other than the
// test error it is printed as.
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").
  setPredictionCol("prediction").
  setMetricName("accuracy")
val lrAccuracy = evaluator.evaluate(lrPredictions)
// Test error = 1 - accuracy.
println(s"TestError=${1.0 - lrAccuracy}")
// Pull the fitted logistic-regression model out of the pipeline. It is stage
// index 2 because the stages are: labelIndexer, featureIndexer, lr, labelConvert.
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]

// Report the learned coefficients, intercepts and model dimensions.
println(
  Seq(
    "Multinomial coefficients: " + lrModel.coefficientMatrix,
    " \nMultinomial intercepts: " + lrModel.interceptVector,
    " \nnumClasses: " + lrModel.numClasses,
    " \nnumFeatures: " + lrModel.numFeatures
  ).mkString
)
2.决策树解决分类问题
使用ml库的决策树解决鸢尾花的分类问题:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
// Obtain (or reuse) a SparkSession with master "local".
val spark = SparkSession.builder().master("local").appName("decsion-tree").getOrCreate()
// Brings the encoders and the `toDF` conversion used below into scope.
import spark.implicits._

// 2. Read the data and take a first look at it.
// One input record: a dense feature vector plus its class label.
case class Iris(features: org.apache.spark.ml.linalg.Vector, label: String)

// Every non-blank line is parsed as four numeric fields followed by the class
// label, separated by commas.
// Blank lines (for example a trailing newline at the end of the file) are
// skipped; without the filter they would reach `toDouble` and abort the job.
// NOTE(review): this path has no "file://" scheme, so it is resolved against
// the configured default filesystem — confirm it points at the intended file.
val data = spark.sparkContext.
  textFile("/home/chenbengang/ziyu_bigdata/quick_learn_spark/iris.txt").
  filter(_.trim.nonEmpty).
  map(_.split(",")).
  map(p => Iris(Vectors.dense(p(0).toDouble, p(1).toDouble, p(2).toDouble, p(3).toDouble), p(4))).
  toDF
data.show()

// Register the DataFrame as a temporary view and read every row back via SQL.
data.createOrReplaceTempView("iris")
val df = spark.sql("select * from iris")
df.show()

// Print every row as "label:features".
df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)
// 3. Prepare the label and feature columns, then split the data.
// The label and feature columns are each indexed and written to new columns.

// StringIndexer: string "label" -> numeric "indexedLabel".
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(df)

// VectorIndexer: "features" -> "indexedFeatures", with maxCategories set to 4.
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(4).
  fit(df)

// IndexToString: numeric "prediction" -> "predictedLabel", using the label
// strings learned by labelIndexer.
val labelConverter = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictedLabel").
  setLabels(labelIndexer.labels)

// Randomly split the data with weights 0.7 (training) and 0.3 (test).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Build the decision-tree classification model.
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Decision-tree estimator reading the indexed label and feature columns.
val dtClassifier = new DecisionTreeClassifier().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures")

// Pipeline stages, in order: index labels, index features, classify,
// then convert predictions back to label strings.
val pipelinedClassifier = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, dtClassifier, labelConverter))

// Fit the pipeline on the training split.
val modelClassifier = pipelinedClassifier.fit(trainingData)

// Predict on the test split and show up to 20 rows of the result.
val predictionClassifier = modelClassifier.transform(testData)
predictionClassifier.
  select("predictedLabel", "label", "features").
  show(20)
// 5. Evaluate the decision-tree classifier.
// Multiclass evaluator comparing "prediction" with "indexedLabel", using the
// "accuracy" metric.
val evaluatorClassifier = new MulticlassClassificationEvaluator().
  setLabelCol("indexedLabel").
  setPredictionCol("prediction").
  setMetricName("accuracy")

// Compute the accuracy on the test predictions and report the test error
// as 1 - accuracy.
val acurracy = evaluatorClassifier.evaluate(predictionClassifier)
println(s"test error= ${1.0 - acurracy}")
// Inspect the learned tree. The fitted classifier is stage index 2 because the
// stages are: labelIndexer, featureIndexer, dtClassifier, labelConverter.
val treeModelClassifier =
  modelClassifier.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println(s"Learned classification tree model:\n${treeModelClassifier.toDebugString}")