本文主要使用Scala计算二分类模型的评价指标,包括以下内容:
- precision、recall、F1Score
- accuracy
- AUC
- KS
对上述指标计算方法进行封装,方便调用。传入参数为预测的数据框
构造数据
简单的构造数据,得到预测的DataFrame,其包含预测的概率、label和真实的label。
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Configure a SparkSession for this demo: app name "LR", Kryo serialization,
// a handful of timeout / SQL settings, and a local master using all available cores.
val builder = SparkSession
.builder()
.appName("LR")
.config("spark.executor.heartbeatInterval","60s")
.config("spark.network.timeout","120s")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.kryoserializer.buffer.max","512m")
.config("spark.dynamicAllocation.enabled", false)
.config("spark.sql.inMemoryColumnarStorage.compressed", true)
.config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
.config("spark.sql.broadcastTimeout", 600)
.config("spark.sql.autoBroadcastJoinThreshold", -1)
.config("spark.sql.crossJoin.enabled", true)
.master("local[*]")
// Reuse an already running session if there is one, otherwise start a new one.
val spark = builder.getOrCreate()
// Only log errors so the notebook output stays readable.
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@20fd3d0a
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@58767892
import spark.implicits._
// Ten hand-written rows (id, four numeric features, binary label) used as toy training data.
// Bound with `val`: the DataFrames are never reassigned.
val dfTrain = Seq(
  (1, 5.1, 3.5, 1.4, 0.2, 0),
  (2, 4.9, 3.0, 1.4, 0.2, 1),
  (3, 4.7, 3.2, 1.3, 0.2, 0),
  (4, 4.6, 3.1, 1.5, 0.2, 1),
  (5, 5.0, 3.6, 1.4, 0.2, 0),
  (56, 5.7, 2.8, 4.5, 1.3, 1),
  (57, 5.3, 3.3, 4.7, 1.6, 0),
  (58, 4.9, 2.4, 3.3, 1.0, 1),
  (59, 6.6, 3.9, 4.6, 1.3, 1),
  (60, 5.2, 2.7, 3.9, 1.4, 0)
).toDF("id", "x1", "x2", "x3", "x4", "label")
// The test set is simply the training set again; it only exists to exercise the metric code.
val dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
| 1|5.1|3.5|1.4|0.2| 0|
| 2|4.9|3.0|1.4|0.2| 1|
| 3|4.7|3.2|1.3|0.2| 0|
| 4|4.6|3.1|1.5|0.2| 1|
| 5|5.0|3.6|1.4|0.2| 0|
| 56|5.7|2.8|4.5|1.3| 1|
| 57|5.3|3.3|4.7|1.6| 0|
| 58|4.9|2.4|3.3|1.0| 1|
| 59|6.6|3.9|4.6|1.3| 1|
| 60|5.2|2.7|3.9|1.4| 0|
+---+---+---+---+---+-----+
dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
// Feature transformation: pack the four numeric columns into one "features" vector column
val assemble = new VectorAssembler()
.setInputCols(Array("x1","x2","x3","x4"))
.setOutputCol("features")
// Classifier
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.01)
.setLabelCol("label")// name of the true-label column in the training data; "label" is also the default
.setFeaturesCol("features")// name of the feature column in the training data; default "features"
.setPredictionCol("preLabel")// name of the predicted-label column in the output; default "prediction"
.setProbabilityCol("prob")// name of the probability column in the output; default "probability"
// Pipeline: assemble the features, then fit the logistic regression on the training data
val pipeline = new Pipeline().setStages(Array(assemble, lr))
val Model = pipeline.fit(dfTrain)
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2bfbc5f2ff24
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_6fd6be208198
pipeline: org.apache.spark.ml.Pipeline = pipeline_5b2f95daec89
Model: org.apache.spark.ml.PipelineModel = pipeline_5b2f95daec89
// Score the test set with the fitted pipeline; the result keeps the input columns and adds
// the features, rawPrediction, prob and preLabel columns shown below.
val preResult = Model.transform(dfTest)
preResult.show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label| features| rawPrediction| prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| 1|5.1|3.5|1.4|0.2| 0|[5.1,3.5,1.4,0.2]|[0.51973594549227...|[0.62708601946512...| 0.0|
| 2|4.9|3.0|1.4|0.2| 1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...| 1.0|
| 3|4.7|3.2|1.3|0.2| 0|[4.7,3.2,1.3,0.2]|[0.41832944562126...|[0.60308343184906...| 0.0|
| 4|4.6|3.1|1.5|0.2| 1|[4.6,3.1,1.5,0.2]|[0.24687940631850...|[0.56140826798745...| 0.0|
| 5|5.0|3.6|1.4|0.2| 0|[5.0,3.6,1.4,0.2]|[1.26603211145541...|[0.78006275423495...| 0.0|
| 56|5.7|2.8|4.5|1.3| 1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...| 1.0|
| 57|5.3|3.3|4.7|1.6| 0|[5.3,3.3,4.7,1.6]|[2.17258007146063...|[0.89776002662622...| 0.0|
| 58|4.9|2.4|3.3|1.0| 1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...| 1.0|
| 59|6.6|3.9|4.6|1.3| 1|[6.6,3.9,4.6,1.3]|[0.12192431314750...|[0.53044337453190...| 0.0|
| 60|5.2|2.7|3.9|1.4| 0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...| 1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
preResult: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
可以看到预测结果是一个DataFrame,包含预测概率列和预测label列(本例中列名分别设为prob、preLabel,默认为probability、prediction),以及真实的label列。
混淆矩阵
import spark.implicits._
// Column names of the prediction DataFrame that the metric code refers to
val preLabel ="preLabel"
val preProb = "prob"
val trueLabel = "label"
val PredictDf = preResult
// Preview the rows that were predicted as positive
preResult.filter(s"$preLabel==1").show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label| features| rawPrediction| prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| 2|4.9|3.0|1.4|0.2| 1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...| 1.0|
| 56|5.7|2.8|4.5|1.3| 1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...| 1.0|
| 58|4.9|2.4|3.3|1.0| 1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...| 1.0|
| 60|5.2|2.7|3.9|1.4| 0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...| 1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
import spark.implicits._
preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
// import spark.implicits._
val preLabel ="preLabel"
val preProb = "prob"
val trueLabel = "label"
val PredictDf = preResult
// --- Count TP, FP, FN, TN (one filter + count per confusion-matrix cell)
// actually positive, predicted positive
val TP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 1").count().toDouble
// actually negative, predicted positive
val FP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 0").count().toDouble
// actually negative, predicted negative
val TN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 0").count().toDouble
// actually positive, predicted negative
val FN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 1").count().toDouble
// Hand-assembled confusion matrix; headers are in English because the author hit a
// bug with Chinese text in Jupyter.
println("\t " + "Pre Neg " + "Pre Pos " + "\n" +
"True Neg " + TN + " " + FP + "\n"+
"True Pos " + FN + " " + TP + "\n"
)
// The same matrix straight from groupBy + pivot: rows are true labels, columns are predicted labels
PredictDf.groupBy(s"$trueLabel")
.pivot(s"$preLabel", (0 to 1)).count().na.fill(0.0)
.orderBy(asc(s"$trueLabel"))
.withColumnRenamed(s"$trueLabel", "True-Predict")
.show(truncate = true)
Pre Neg Pre Pos
True Neg 4.0 1.0
True Pos 2.0 3.0
+------------+---+---+
|True-Predict| 0| 1|
+------------+---+---+
| 0| 4| 1|
| 1| 2| 3|
+------------+---+---+
preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
TP: Double = 3.0
FP: Double = 1.0
TN: Double = 4.0
FN: Double = 2.0
accuracy&precision&recall&f1
混淆矩阵都有了,precision、recall、f1Score都很好计算
// Precision / recall / F1 derived directly from the TP, FP, TN, FN counts.
// Positive class
println("正样本precision: " + TP / (TP + FP))
println("正样本recall: " + TP/(TP+FN))
println("正样本F1Score: " + 2*TP/(2*TP+FP+FN))
// Negative class
println("负样本precision: " + TN / (TN + FN))
println("负样本recall: " + TN/(FP+TN))
println("负样本F1Score: " + 2*TN/(2*TN+FP+FN))
// Accuracy: share of all rows that were classified correctly
println("Accuaracy: "+(TP+TN)/(TP+TN+FP+FN))
// Negative class
println("Neg precision: " + TN / (TN + FN))
println("Neg recall: " + TN/(FP+TN))
println("Neg F1Score: " + 2*TN/(2*TN+FP+FN))
// Positive class
println("Pos precision: " + TP / (TP + FP))
println("Pos recall: " + TP/(TP+FN))
println("Pos F1Score: " + 2*TP/(2*TP+FP+FN))
Accuaracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666
也可以直接调包计算
import org.apache.spark.mllib.evaluation.MulticlassMetrics
// MulticlassMetrics works on an RDD of pairs; the pairs are built as (predicted label, true label).
val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println("Accuaracy: "+multiclassMetrics.accuracy)
// Per-label precision / recall / F1 for every label value found in the data
val labels = multiclassMetrics.labels
labels.foreach { l =>
println(s"Precision($l) = " + multiclassMetrics.precision(l))
println(s"Recall($l) = " + multiclassMetrics.recall(l))
println(s"F1Score($l) = " + multiclassMetrics.fMeasure(l))
}
Accuaracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665
import org.apache.spark.mllib.evaluation.MulticlassMetrics
predictionRDD: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1160] at rdd at <console>:89
multiclassMetrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@28b99672
labels: Array[Double] = Array(0.0, 1.0)
AUC和PRC
Spark MLlib中提供了计算AUC和PRC的方法,但是需要传入RDD格式的数据。另外需要对概率字段进行切分,因为原始字段是Vector类型。先看下数据切分的代码:
// Unpack the probability Vector into plain columns: the whole vector as an array, its
// element 0 ("prob0"), its element 1 ("prob1"), and the true label converted to Double.
val aucDf = PredictDf.select(preProb, trueLabel).map(x => (
x(0).asInstanceOf[Vector].toArray, x(0).asInstanceOf[Vector].toArray(0), x(0).asInstanceOf[Vector].toArray(1)
, x(1).toString.toDouble))
.toDF("probVector","prob0","prob1",trueLabel)
aucDf.show()
+--------------------+-------------------+-------------------+-----+
| probVector| prob0| prob1|label|
+--------------------+-------------------+-------------------+-----+
|[0.62708601946512...| 0.6270860194651235| 0.3729139805348764| 0.0|
|[0.25162713725554...| 0.2516271372555436| 0.7483728627444565| 1.0|
|[0.60308343184906...| 0.6030834318490638| 0.3969165681509361| 0.0|
|[0.56140826798745...| 0.561408267987451| 0.4385917320125489| 1.0|
|[0.78006275423495...| 0.780062754234951|0.21993724576504908| 0.0|
|[0.11822151224039...|0.11822151224039172| 0.8817784877596083| 1.0|
|[0.89776002662622...| 0.8977600266262256|0.10223997337377445| 0.0|
|[0.12412868907566...|0.12412868907566986| 0.8758713109243301| 1.0|
|[0.53044337453190...| 0.5304433745319013|0.46955662546809873| 1.0|
|[0.35865652594949...| 0.3586565259494905| 0.6413434740505095| 0.0|
+--------------------+-------------------+-------------------+-----+
aucDf: org.apache.spark.sql.DataFrame = [probVector: array<double>, prob0: double ... 2 more fields]
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// BinaryClassificationMetrics takes an RDD of pairs; here each pair is (prob1, true label).
val predictionRDD1 = aucDf.select("prob1", trueLabel).as[(Double, Double)].rdd
val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
// Area under the ROC curve and area under the precision-recall curve
println("AUC: " + BinaryMetrics.areaUnderROC)
println("PRC: " + BinaryMetrics.areaUnderPR)
AUC: 0.92
PRC: 0.9183333333333333
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
predictionRDD1: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1127] at rdd at <console>:86
BinaryMetrics: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@12535044
KS
简单介绍KS计算的逻辑:
- 阈值以0.1为单位,从0递增至1
- 根据不同的阈值分别计算TPR、FPR
- KS = max(TPR - FPR)
// Threshold grid from 0 to 1 in steps of 0.1; note the floating-point drift in the printed
// result (e.g. 0.30000000000000004).
(0.0 to 1.0 by 0.1).toArray
res102: Array[Double] = Array(0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1.0)
// Sanity check for a single threshold: number of actual positives with prob1 >= 0.1
aucDf.filter(s"prob1 >= 0.1 and $trueLabel == 1").count().toDouble
res104: Double = 5.0
import scala.collection.mutable.ArrayBuffer
// KS on a fixed threshold grid: for each threshold t, rows with prob1 >= t are called
// positive and the gap TPR - FPR is recorded; KS is the largest gap.
// The grid is derived from integers because stepping a Double range accumulates rounding
// error (0.30000000000000004 instead of 0.3) and Double ranges do not compile on Scala 2.13.
val ksThresholds = (0 to 10).map(_ / 10.0)
// Class totals (TP + FN and FP + TN) do not depend on the threshold, so count them once.
val actualPos = aucDf.filter(s"$trueLabel == 1").count().toDouble
val actualNeg = aucDf.filter(s"$trueLabel == 0").count().toDouble
val Tpr_Fpr = ksThresholds.map { threshold =>
  // predicted positive, actually positive
  val tp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 1").count().toDouble
  // predicted positive, actually negative
  val fp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 0").count().toDouble
  // TPR - FPR at this threshold
  tp / actualPos - fp / actualNeg
}
println("KS Value: "+Tpr_Fpr.max)
KS Value: 0.8
import scala.collection.mutable.ArrayBuffer
Tpr_Fpr: scala.collection.mutable.ArrayBuffer[Double] = ArrayBuffer(0.0, 0.0, 0.19999999999999996, 0.4, 0.8, 0.39999999999999997, 0.39999999999999997, 0.6, 0.4, 0.0, 0.0)
tp: Double = 0.0
fp: Double = 0.0
tn: Double = 5.0
fn: Double = 5.0
封装
把上面所有的指标封装起来,方便调用。指标不是很多,调用该方法会直接print所有的指标。
/**
  * Prints evaluation metrics for the predictions of a binary classifier: the confusion
  * matrix, accuracy / precision / recall / F1 (once through `MulticlassMetrics`, once
  * computed by hand from the confusion counts), AUC, area under the PR curve, and KS.
  *
  * The confusion counts and KS filter on label values 0 (negative) and 1 (positive).
  * KS is evaluated on the threshold grid 0.0, 0.1, ..., 1.0 as max(TPR - FPR).
  *
  * @param spark     session whose implicits provide the encoders used below
  * @param PredictDf raw prediction DataFrame; the probability column must still be the ML
  *                  `Vector` (do not split it beforehand)
  * @param probName  name of the probability vector column, default "probability"
  * @param preLabel  name of the predicted-label column, default "prediction"
  * @param trueLabel name of the true-label column, default "label"
  */
def BinaryClassificationModelMectrics3(spark: SparkSession, PredictDf: DataFrame,probName:String="probability"
, preLabel: String = "prediction", trueLabel: String = "label"): Unit = {
  import spark.implicits._

  println("--------------------------------------- Confusion Matrix ------------------------------------------------")
  // One cell of the confusion matrix: number of rows with the given predicted / actual label.
  def confusionCell(predicted: Int, actual: Int): Double =
    PredictDf.filter(s"$preLabel == $predicted and $trueLabel == $actual").count().toDouble

  val TP = confusionCell(predicted = 1, actual = 1)
  val FP = confusionCell(predicted = 1, actual = 0)
  val TN = confusionCell(predicted = 0, actual = 0)
  val FN = confusionCell(predicted = 0, actual = 1)
  // Hand-assembled confusion matrix (rows: true label, columns: predicted label).
  println("\t " + "Pre Neg " + "Pre Pos " + "\n" +
    "True Neg " + TN + " " + FP + "\n" +
    "True Pos " + FN + " " + TP + "\n")
  // The same matrix straight from groupBy + pivot.
  PredictDf.groupBy(s"$trueLabel")
    .pivot(s"$preLabel", (0 to 1)).count().na.fill(0.0)
    .orderBy(asc(s"$trueLabel"))
    .withColumnRenamed(s"$trueLabel", "True-Predict")
    .show(truncate = true)

  //------ accuracy, recall, precision, F1
  println("---------------------------------- Accuarcy&Precision&Recall&F1Score ------------------------------------")
  println("---------------------- Use Package")
  // MulticlassMetrics takes (prediction, label) pairs.
  val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
  val multiclassMetrics = new MulticlassMetrics(predictionRDD)
  println("Accuaracy: " + multiclassMetrics.accuracy)
  multiclassMetrics.labels.foreach { l =>
    println(s"Precision($l) = " + multiclassMetrics.precision(l))
    println(s"Recall($l) = " + multiclassMetrics.recall(l))
    println(s"F1Score($l) = " + multiclassMetrics.fMeasure(l))
  }
  println("---------------------- Not Use Package")
  println("Accuaracy: " + (TP + TN) / (TP + TN + FP + FN))
  // Negative class
  println("Neg precision: " + TN / (TN + FN))
  println("Neg recall: " + TN / (FP + TN))
  println("Neg F1Score: " + 2 * TN / (2 * TN + FP + FN))
  // Positive class
  println("Pos precision: " + TP / (TP + FP))
  println("Pos recall: " + TP / (TP + FN))
  println("Pos F1Score: " + 2 * TP / (2 * TP + FP + FN))

  println("-----------------------------*----------- Auc&Prc&Ks ----------------------------------------------------")
  // Reduce every row to (element 1 of the probability vector, label as Double). This small
  // dataset feeds the AUC/PR job and every KS count below, so it is cached instead of
  // re-running the upstream plan for each job.
  val scored = PredictDf.select(probName, trueLabel)
    .map { row =>
      val probability = row.getAs[Vector](0)
      (probability(1), row.get(1).toString.toDouble)
    }
    .toDF("prob1", trueLabel)
    .cache()
  try {
    //---- AUC and area under the PR curve; BinaryClassificationMetrics takes (score, label) pairs
    val predictionRDD1 = scored.select("prob1", trueLabel).as[(Double, Double)].rdd
    val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
    println("AUC: " + BinaryMetrics.areaUnderROC)
    println("PRC: " + BinaryMetrics.areaUnderPR)

    // --- KS
    // The grid is derived from integers: stepping a Double range accumulates rounding error
    // (0.30000000000000004 instead of 0.3) and Double ranges do not compile on Scala 2.13.
    val ksThresholds = (0 to 10).map(_ / 10.0)
    // Class totals (TP + FN and FP + TN) do not depend on the threshold, so count them once.
    val actualPos = scored.filter(s"$trueLabel == 1").count().toDouble
    val actualNeg = scored.filter(s"$trueLabel == 0").count().toDouble
    val tprMinusFpr = ksThresholds.map { threshold =>
      // predicted positive, actually positive
      val tp = scored.filter(s"prob1 >= $threshold and $trueLabel == 1").count().toDouble
      // predicted positive, actually negative
      val fp = scored.filter(s"prob1 >= $threshold and $trueLabel == 0").count().toDouble
      tp / actualPos - fp / actualNeg
    }
    println("KS Value: " + tprMinusFpr.max)
  } finally {
    // Release the cached (score, label) data even if one of the jobs above failed.
    scored.unpersist()
  }
}
BinaryClassificationModelMectrics3: (spark: org.apache.spark.sql.SparkSession, PredictDf: org.apache.spark.sql.DataFrame, probName: String, preLabel: String, trueLabel: String)Unit
// Run every metric on the prediction DataFrame, passing the custom column names set on the model.
BinaryClassificationModelMectrics3(spark=spark, PredictDf=PredictDf,probName="prob"
, preLabel = "preLabel", trueLabel= "label")
--------------------------------------- Confusion Matrix ------------------------------------------------
Pre Neg Pre Pos
True Neg 4.0 1.0
True Pos 2.0 3.0
+------------+---+---+
|True-Predict| 0| 1|
+------------+---+---+
| 0| 4| 1|
| 1| 2| 3|
+------------+---+---+
---------------------------------- Accuarcy&Precision&Recall&F1Score ------------------------------------
---------------------- Use Package
Accuaracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665
---------------------- Not Use Package
Accuaracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666
-----------------------------*----------- Auc&Prc&Ks ----------------------------------------------------
AUC: 0.92
PRC: 0.9183333333333332
KS Value: 0.8
2020-03-25 于南京市江宁区九龙湖