>For the theory behind Naive Bayes, see this blog post (highly recommended):
>https://mp.weixin.qq.com/s?src=11&timestamp=1584688625&ver=2227&signature=O754zhc6apcSqOgNLOcewFs6K3RMvj9Tuz1nB4I*-IfaZLh5wlbpKA8iJxFtQ*xLy3FoYyW*pB2t7puAhrS7WS8uZLuH2XBdcv8u1Cp2u-Elufc7IvQ67zGNA6uFwLGC&new=1
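Before diving into the code, it helps to state the rule the implementation follows. This is the standard Naive Bayes decision rule with Laplace smoothing, and it matches the formulas used in `train` below:

$$\hat{y} = \arg\max_{y} P(y)\prod_{i=1}^{n} P(x_i \mid y),\qquad P(y) = \frac{\mathrm{count}(y) + \lambda}{N + K\lambda},\qquad P(x_i = v \mid y) = \frac{\mathrm{count}(v, y) + \lambda}{\mathrm{count}(y) + S_i\lambda}$$

where $N$ is the number of training records, $K$ is the number of labels, $S_i$ is the number of distinct values of feature $i$, and $\lambda = 1$.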
/**
* Created by GangTian on 2020/3/22 in SIBAT
* Improves on the part of the Bayes implementation where prediction was too slow; in my view the
* main cause was that the generated model was too simple. When NavieBayes2 builds the model, an
* unknown value X is added for every feature, which greatly speeds up prediction. However, when
* the feature dimensionality is high (tens or hundreds of features), building the model takes a
* long time, so this approach suits low-dimensional features. Of course, Spark ML also offers a
* Bayes model that can be called directly.
*/
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.udf
import scala.collection.mutable.ArrayBuffer
object NavieBayes {
/**
* Model training: from the historical data, compute the conditional probabilities and the prior
* probabilities.
*
* @param trainData format: f1,f2,f3...fn,label (f1..fn may be of any data type, with any column names)
* @return the model as a DataFrame
*/
def train(trainData: DataFrame): DataFrame = {
import trainData.sparkSession.implicits._
val columns = trainData.columns
val arrColumn = columns.map(column => col(column).cast("String"))
val transData = trainData.select(arrColumn: _*)
val featuresCol = columns.filter(colName => !colName.equals("label"))
val lab_df = transData.groupBy("label").count()
//1. Compute the prior probabilities
val lamda = 1 // Laplace smoothing (lamda = 1) avoids zero conditional probabilities when an unseen value appears
val k = lab_df.count()
val totalRecord = trainData.count()
val pri_pro = lab_df.withColumn("pri_pro", ($"count" + lamda) / (totalRecord + k * lamda * 1.0))
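// Worked example (made-up numbers): with 10 training records, 2 labels, and label "A" occurring
// 6 times, pri_pro("A") = (6 + 1) / (10 + 2 * 1) = 7/12 ≈ 0.583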
//2. Count the records for each (feature value, label) combination
var condition_record = transData.groupByKey(row => row.getAs[String](featuresCol.head) + "," + row.getAs[String]("label"))
.flatMapGroups((str, it) => {
val arr = str.split(",")
val category = arr.head
val label = arr.last
val num = it.toList.length
Array((category, label, num, featuresCol.head))
}).toDF("optional", "label", "num", "featureName")
for (i <- 1 until (featuresCol.length)) {
val featureName = featuresCol(i)
val df = transData.groupByKey(row => row.getAs[String](featureName) + "," + row.getAs[String]("label"))
.flatMapGroups((str, it) => {
val arr = str.split(",")
val category = arr.head
val label = arr.last
val num = it.toList.length
Array((category, label, num, featureName))
}).toDF("optional", "label", "num", "featureName")
condition_record = condition_record.union(df)
}
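// condition_record now holds one row per observed (feature value, label) pair:
// optional = the feature's value, label = the class, num = the co-occurrence count,
// featureName = the column the value came from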
//3. Count the number of distinct values of each feature
val featuresOptional = new ArrayBuffer[(String, Long)]()
for (featureName <- featuresCol) {
val num = transData.select(featureName).distinct().count()
featuresOptional.append((featureName, num))
}
val addFeaturesOptional = udf((featureName: String) => {
featuresOptional.filter(_._1 == featureName).head._2
})
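// The UDF closes over the driver-side featuresOptional buffer; Spark serializes this small
// lookup table into the task closure shipped to the executors.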
//4. Compute the historical conditional probabilities. This is not always enough by itself: if a
// predicted input value exists in the history, its conditional probability can be read off the
// model directly, but a value absent from the history has to be recomputed.
var history_pro = condition_record.withColumn("historyFeaturesNum", addFeaturesOptional($"featureName"))
.join(lab_df.withColumnRenamed("label", "key"), $"key" === $"label").drop("key")
.withColumn("condition_pro", ($"num" + lamda) * 1.0 / ($"count" + $"historyFeaturesNum" * lamda))
.join(pri_pro.withColumnRenamed("label", "key").withColumnRenamed("count", "ct"), $"key" === $"label")
.drop("key", "ct")
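// Worked example (made-up numbers and names): if feature f1 has 3 distinct values, and value "a"
// co-occurs 2 times with label "A" whose total count is 6, then
// condition_pro = (2 + 1) / (6 + 3 * 1) = 1/3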
//5. Collect the prior probability associated with each label
val cdf = history_pro.map(row => (row.getAs[String]("label"), row.getAs[Double]("pri_pro"))).distinct().collect()
//6. Fill history_pro with conditional and prior probabilities for (value, label) pairs that never
// co-occurred in the history; only feature values already seen in the history are filled in
for (colName <- featuresCol) {
val df = history_pro.filter($"featureName" === colName)
val historyFeaturesNum = df.map(_.getAs[Long]("historyFeaturesNum")).distinct().collect().head
val optionalType = df.map(_.getAs[String]("optional")).distinct().collect()
val k = optionalType.length
val m = lab_df.count().toInt
val labs = lab_df.map(_.getAs[String]("label")).collect()
val record = df.map(row => (row.getAs[String]("optional") + "," + row.getAs[String]("label"))).collect()
if (record.length < k * m) {
for (i <- 0 until (k)) {
for (j <- 0 until (m)) {
val tp = optionalType(i) + "," + labs(j)
if (!record.contains(tp)) {
val count = df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).distinct().collect().head
//compute condition_pro by hand; pri_pro is simply filled in according to the label
val ct = lab_df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).head()
val prip = cdf.filter(_._1 == labs(j)).head._2
val conp = lamda * 1.0 / (k + ct)
val xdf = Seq((optionalType(i), labs(j), 0L, colName, historyFeaturesNum, count, conp, prip))
.toDF("optional", "label", "num", "featureName", "historyFeaturesNum", "count", "condition_pro", "pri_pro")
history_pro = history_pro.union(xdf)
}
}
}
}
//7. Add the probability for a feature value never seen at all, with optional denoted by X
for (j <- 0 until (m)) {
//compute condition_pro by hand; pri_pro is simply filled in according to the label
val count = df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).distinct().collect().head
val ct = lab_df.filter($"label" === labs(j)).map(_.getAs[Long]("count")).head()
val conp = lamda * 1.0 / ((k + 1) + ct)
val prip = cdf.filter(_._1 == labs(j)).head._2
val xdf = Seq(("X", labs(j), 0L, colName, historyFeaturesNum, count, conp, prip))
.toDF("optional", "label", "num", "featureName", "historyFeaturesNum", "count", "condition_pro", "pri_pro")
history_pro = history_pro.union(xdf)
}
}
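// The finished model has one row per (featureName, optional, label) triple, including the
// zero-count pairs filled in at step 6, plus one "X" row per (featureName, label) pair from
// step 7, so predict never needs to recompute a probability.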
history_pro
}
/**
* Predict labels for the input data
*
* @param testData same format as trainData, without the label column
* @param model the model produced by train
* @return predictions
*/
def predict(testData: DataFrame, model: DataFrame): DataFrame = {
import model.sparkSession.implicits._
val columns = testData.columns
val arrColumn = columns.map(column => col(column).cast("String"))
val transData = testData.select(arrColumn: _*) //cast every column to String
val testRecord = transData.map(row => {
var str = ""
for (column <- columns) str += (row.getAs[String](column) + ",")
str.dropRight(1)
}).collect().map(_.split(","))
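// testRecord is an Array of per-row String arrays, where the i-th entry is the value of columns(i).
// Like the training-side grouping keys, this assumes feature values contain no commas.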
//final prediction results
var predictLabel = Seq[String]()
for (record <- testRecord) { //for each record in the test set
//stores, for each feature, the probability under every label; the label with the largest
//combined probability becomes the final prediction for the current record
var features_pro = Seq[(String, Double)]()
//compute the conditional probability of each feature value
for (i <- 0 until (columns.length)) {
val option = record(i) //the value of the i-th feature (e.g. the arrivalDate value)
val featureName = columns(i) //the name of the feature this value belongs to (e.g. arrivalDate)
//1. First check whether this value has ever been observed for the current feature
val tf = model.filter($"featureName" === featureName)
val options = tf.map(row => row.getAs[String]("optional")).collect()
if (options.contains(option)) {
//the value exists in this feature's history, so pick condition_pro (the conditional probability) and pri_pro (the prior probability) directly from the model
val res = tf.filter($"optional" === option).map(row => (row.getAs[String]("label"), row.getAs[Double]("condition_pro"))).collect()
for (re <- res) features_pro = features_pro :+ re
} else {
//an unseen value: fall back to the Laplace-smoothed probabilities stored under X
val res = tf.filter($"optional" === "X").map(row => (row.getAs[String]("label"), row.getAs[Double]("condition_pro"))).collect()
for (re <- res) features_pro = features_pro :+ re
}
}
val result = features_pro.toDF("key", "probability_condition").groupByKey(row => row.getAs[String]("key"))
.flatMapGroups((str, it) => {
var gv = 1.0
val its = it.toList.map(_.getAs[Double]("probability_condition"))
for (s <- its) gv = gv * s
Array((str, gv))
}).toDF("key", "probability_condition").join(model.select("label", "pri_pro").distinct(), $"key" === $"label")
.withColumn("probability", $"probability_condition" * $"pri_pro").drop("key")
.sort($"probability".desc).head().getAs[String]("label")
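// Note: the per-feature probabilities are multiplied directly, so with many features the product
// can underflow; summing log-probabilities would be the usual safeguard.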
predictLabel = predictLabel :+ result
}
predictLabel.toDF("prediction")
}
}
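A minimal usage sketch (the SparkSession setup, the NavieBayesDemo object, and the f1/f2/label column names and values below are illustrative assumptions; any feature columns work as long as the class column is named label):

import org.apache.spark.sql.SparkSession

object NavieBayesDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("NavieBayesDemo").getOrCreate()
    import spark.implicits._
    // hypothetical training data: two feature columns plus the mandatory "label" column
    val trainData = Seq(
      ("sunny", "hot", "no"),
      ("sunny", "cool", "yes"),
      ("rainy", "cool", "yes"),
      ("rainy", "hot", "no")
    ).toDF("f1", "f2", "label")
    // test data: the same feature columns, without "label";
    // "overcast" is unseen, so predict falls back to the model's "X" rows
    val testData = Seq(("sunny", "cool"), ("overcast", "hot")).toDF("f1", "f2")
    val model = NavieBayes.train(trainData) // one row per (featureName, optional, label)
    val predictions = NavieBayes.predict(testData, model)
    predictions.show() // a single "prediction" column, one row per test record
  }
}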
Test dataset: (screenshot omitted)
Generated Bayes model: (screenshot omitted)