>For the theory behind Naive Bayes, see this blog post (highly recommended):
>https://mp.weixin.qq.com/s?src=11&timestamp=1584688625&ver=2227&signature=O754zhc6apcSqOgNLOcewFs6K3RMvj9Tuz1nB4I*-IfaZLh5wlbpKA8iJxFtQ*xLy3FoYyW*pB2t7puAhrS7WS8uZLuH2XBdcv8u1Cp2u-Elufc7IvQ67zGNA6uFwLGC&new=1
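Before diving into the code, it helps to state the rule the implementation follows. This is the standard Naive Bayes decision rule with Laplace smoothing, and it matches the formulas used in `train` below:

$$\hat{y} = \arg\max_{y} P(y)\prod_{i=1}^{n} P(x_i \mid y),\qquad P(y) = \frac{\mathrm{count}(y) + \lambda}{N + K\lambda},\qquad P(x_i = v \mid y) = \frac{\mathrm{count}(v, y) + \lambda}{\mathrm{count}(y) + S_i\lambda}$$

where $N$ is the number of training records, $K$ is the number of labels, $S_i$ is the number of distinct values of feature $i$, and $\lambda = 1$.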
/**
* Created by GangTian on 2020/3/22 in SIBAT
* Improves on the part of the Bayes implementation where prediction was too slow; in my view the
* main cause was that the generated model was too simple. When NavieBayes2 builds the model, an
* unknown value X is added for every feature, which greatly speeds up prediction. However, when
* the feature dimensionality is high (tens or hundreds of features), building the model takes a
* long time, so this approach suits low-dimensional features. Of course, Spark ML also offers a
* Bayes model that can be called directly.
*/
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.udf
import scala.collection.mutable.ArrayBuffer
object NavieBayes {
/**
* Model training: from the historical data, compute the conditional probabilities and the prior
* probabilities.
*
* @param trainData format: f1,f2,f3...fn,label (f1..fn may be of any data type, with any column names)
* @return the model as a DataFrame
*/
def train(trainData: DataFrame): DataFrame = {
import trainData.sparkSession.implicits._
val columns = trainData.columns
val arrColumn = columns.map(column => col(column).cast("String"))
val transData = trainData.select(arrColumn: _*)
val featuresCol = columns.filter(colName => !colName.equals("label"))
val lab_df = transData.groupBy("label").count()
//1. Compute the prior probabilities
val lamda = 1 // Laplace smoothing (lamda = 1) avoids zero conditional probabilities when an unseen value appears
val k = lab_df.count()
val totalRecord = trainData.count()
val pri_pro = lab_df.withColumn("pri_pro", ($"count" + lamda) / (totalRecord + k * lamda * 1.0))
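// Worked example (made-up numbers): with 10 training records, 2 labels, and label "A" occurring
// 6 times, pri_pro("A") = (6 + 1) / (10 + 2 * 1) = 7/12 ≈ 0.583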
//2. Count the records for each (feature value, label) combination
var condition_record = transData.groupByKey(row => row.getAs[String](featuresCol.head) + "," + row.getAs[String]("label"))
.flatMapGroups((str, it) => {
val arr = str.split(",")
val category = arr.head
val label = arr.last
val num = it.toList.length
Array((category, label, num, featuresCol.head))
}).toDF("optional", "label", "num", "featureName")
for (i <- 1 until (featuresCol.length)) {
val featureName = featuresCol(i)
val df = transData.groupByKey(row => row.getAs[String](featureName) + "," + row.getAs[String]("label"))
.flatMapGroups((str, it) => {
val arr = str.split(",")
val category = arr.head
val label = arr.last
val num = it.toList.length
Array((category, label, num, featureName))
}).toDF("optional", "label", "num", "featureName")
condition_record = condition_record.union(df)
}
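// condition_record now holds one row per observed (feature value, label) pair:
// optional = the feature's value, label = the class, num = the co-occurrence count,
// featureName = the column the value came from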
//3. Count the number of distinct values of each feature
val featuresOptional = new ArrayBuffer[(String, Long)]()
for (featureName <- featuresCol) {
val num = transData.select(featureName).distinct().count()
featuresOptional.append((featureName, num))
}
val addFeaturesOptional = udf((featureName: String) => {
featuresOptional.filter(_._1 == featureName).head._2
})
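// The UDF closes over the driver-side featuresOptional buffer; Spark serializes this small
// lookup table into the task closure shipped to the executors.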
//4. Compute the historical conditional probabilities. This is not always enough by itself: if a
// predicted input value exists in the history, its conditional probability can be read off the
// model directly, but a value absent from the history has to be recomputed.
var history_pro = condition_record.withColumn("historyFeaturesNum", addFeaturesOptional($"featureName"))
.join(lab_df.withColumnRenamed("label", "key"), $"key" === $"label").drop("key")
.withColumn("condition_pro", ($"num" + lamda) * 1.0 / ($"count" + $"historyFeaturesNum" * lamda))
.join(pri_pro.withColumnRenamed("label", "key").withColumnRenamed("count", "ct"), $"key" === $"label")
.drop("key", "ct")
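// Worked example (made-up numbers and names): if feature f1 has 3 distinct values, and value "a"
// co-occurs 2 times with label "A" whose total count is 6, then
// condition_pro = (2 + 1) / (6 + 3 * 1) = 1/3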
//5. Collect the prior probability associated with each label
val cdf = history_pro.map(row => (row.getAs[String]("label"), row.getAs[Double]("pri_pro"))).distinct().collect()
//6. Fill history_pro with conditional and prior probabilities for (value, label) pairs that never
// co-occurred in the history; only feature values already seen in the history are filled in
for (colName <- featuresCol) {
val df = history_pro.filter($"featureName" === colName)
val historyFeaturesNum = df.map(_.getAs[Long]("historyFeaturesNum")).distinct().collect().head
val optionalType = df.map(_.getAs[String]("optional")).distinct().collect()
val k = optionalType.length
val m = lab_df.count().toInt
val labs = lab_df.map(_.getAs[String]("label")).collect()
val record = df.map(row => (row.getAs[String]("optional") + "," + row.getAs[String]("label"))).collect()
if (record.length < k * m) {
for (i <- 0 until (k)) {
for (j <- 0 until (m)) {
val tp = optionalType(i) + "," + labs(j)
if (!record.contains(tp)) {
val count = df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).distinct().collect().head
//compute condition_pro by hand; pri_pro is simply filled in according to the label
val ct = lab_df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).head()
val prip = cdf.filter(_._1 == labs(j)).head._2
val conp = lamda * 1.0 / (k + ct)
val xdf = Seq((optionalType(i), labs(j), 0L, colName, historyFeaturesNum, count, conp, prip))
.toDF("optional", "label", "num", "featureName", "historyFeaturesNum", "count", "condition_pro", "pri_pro")
history_pro = history_pro.union(xdf)
}
}
}
}
//7. Add the probability for a feature value never seen at all, with optional denoted by X
for (j <- 0 until (m)) {
//compute condition_pro by hand; pri_pro is simply filled in according to the label
val count = df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).distinct().collect().head
val ct = lab_df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).head()
val conp = lamda * 1.0 / ((k + 1) + ct)
val prip = cdf.filter(_._1 == labs(j)).head._2
val xdf = Seq(("X", labs(j), 0L, colName, historyFeaturesNum, count, conp, prip))
.toDF("optional", "label", "num", "featureName", "historyFeaturesNum", "count", "condition_pro", "pri_pro")
history_pro = history_pro.union(xdf)
}
}
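// The finished model has one row per (featureName, optional, label) triple, including the
// zero-count pairs filled in at step 6, plus one "X" row per (featureName, label) pair from
// step 7, so predict never needs to recompute a probability.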
history_pro
}
/**
* Predict labels for the input data
*
* @param testData same format as trainData, without the label column
* @param model the model produced by train
* @return predictions
*/
def predict(testData: DataFrame, model: DataFrame): DataFrame = {
import model.sparkSession.implicits._
val columns = testData.columns
val arrColumn = columns.map(column => col(column).cast("String"))
val transData = testData.select(arrColumn: _*) //cast every column to String
val testRecord = transData.map(row => {
var str = ""
for (column <- columns) str += (row.getAs[String](column) + ",")
str.dropRight(1)
}).collect().map(_.split(","))
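// testRecord is an Array of per-row String arrays, where the i-th entry is the value of columns(i).
// Like the training-side grouping keys, this assumes feature values contain no commas.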
//final prediction results
var predictLabel = Seq[String]()
for (record <- testRecord) { //for each record in the test set
//stores, for each feature, the probability under every label; the label with the largest
//combined probability becomes the final prediction for the current record
var features_pro = Seq[(String, Double)]()
//compute the conditional probability of each feature value
for (i <- 0 until (columns.length)) {
val option = record(i) //the value of the i-th feature (e.g. the arrivalDate value)
val featureName = columns(i) //the name of the feature this value belongs to (e.g. arrivalDate)
//1. First check whether this value has ever been observed for the current feature
val tf = model.filter($"featureName" === featureName)
val options = tf.map(row => row.getAs[String]("optional")).collect()
if (options.contains(option)) {
//the value exists in this feature's history, so pick condition_pro (the conditional probability) and pri_pro (the prior probability) directly from the model
val res = tf.filter($"optional" === option).map(row => (row.getAs[String]("label"), row.getAs[Double]("condition_pro"))).collect()
for (re <- res) features_pro = features_pro :+ re
} else {
//an unseen value: fall back to the Laplace-smoothed probabilities stored under X
val res = tf.filter($"optional" === "X").map(row => (row.getAs[String]("label"), row.getAs[Double]("condition_pro"))).collect()
for (re <- res) features_pro = features_pro :+ re
}
}
val result = features_pro.toDF("key", "probability_condition").groupByKey(row => row.getAs[String]("key"))
.flatMapGroups((str, it) => {
var gv = 1.0
val its = it.toList.map(_.getAs[Double]("probability_condition"))
for (s <- its) gv = gv * s
Array((str, gv))
}).toDF("key", "probability_condition").join(model.select("label", "pri_pro").distinct(), $"key" === $"label")
.withColumn("probability", $"probability_condition" * $"pri_pro").drop("key")
.sort($"probability".desc).head().getAs[String]("label")
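// Note: the per-feature probabilities are multiplied directly, so with many features the product
// can underflow; summing log-probabilities would be the usual safeguard.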
predictLabel = predictLabel :+ result
}
predictLabel.toDF("prediction")
}
}
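A minimal usage sketch (the SparkSession setup, the NavieBayesDemo object, and the f1/f2/label column names and values below are illustrative assumptions; any feature columns work as long as the class column is named label):

import org.apache.spark.sql.SparkSession

object NavieBayesDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("NavieBayesDemo").getOrCreate()
    import spark.implicits._
    // hypothetical training data: two feature columns plus the mandatory "label" column
    val trainData = Seq(
      ("sunny", "hot", "no"),
      ("sunny", "cool", "yes"),
      ("rainy", "cool", "yes"),
      ("rainy", "hot", "no")
    ).toDF("f1", "f2", "label")
    // test data: the same feature columns, without "label";
    // "overcast" is unseen, so predict falls back to the model's "X" rows
    val testData = Seq(("sunny", "cool"), ("overcast", "hot")).toDF("f1", "f2")
    val model = NavieBayes.train(trainData) // one row per (featureName, optional, label)
    val predictions = NavieBayes.predict(testData, model)
    predictions.show() // a single "prediction" column, one row per test record
  }
}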
Test dataset: (screenshot omitted)
Generated Bayes model: (screenshot omitted)