贝叶斯平滑

常用指标

uCTR – 点击用户数 / 曝光用户数
ctr – 点击次数 / 曝光次数
uCVR – 购买用户数 / 点击用户数
cvr – 购买次数 / 点击次数
ctcvr – 购买用户数 / 曝光用户数
arpu – 成功支付的金额 / 曝光用户数
ecpm – 流水 * 1000 / 曝光次数

package sparkSQL

import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object BayesParam {

  /**
    * Loads space-separated theme statistics, derives the raw `ctr`, `cvr`, `ctcvr` and `arpu`
    * ratios per row, estimates per-group prior parameters (alpha, beta) for each metric, and
    * prints the Bayesian-smoothed metrics `b_ctr`, `b_cvr`, `b_ctcvr` and `b_arpu`.
    *
    * Rows are grouped by the key `groupId-theme_ver`. Each input line must hold eight
    * space-separated fields: groupId, themeId, theme_ver, price_level, exposure_users,
    * click_users, payment_users, payment_amt.
    *
    * @param args optional; when present, `args(0)` is the input file path. Without arguments
    *             the original hard-coded sample path is used.
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[2]")
      .getOrCreate()

    try {
      val inputPath = args.headOption.getOrElse(
        "F:\\ideaProjects\\spark-version2\\src\\main\\resources\\bayesparam\\themeInfo")
      val lineRDD = spark.sparkContext.textFile(inputPath)

      import spark.implicits._
      import org.apache.spark.sql.functions._

      // All fields stay strings; Spark casts them implicitly in the arithmetic below.
      val themeInfoDF = lineRDD
        .filter(_.trim.nonEmpty) // a blank line would otherwise fail on arr(1)
        .map { line =>
          val arr = line.split(" ")
          (arr(0), arr(1), arr(2), arr(3), arr(4), arr(5), arr(6), arr(7))
        }
        .toDF("groupId", "themeId", "theme_ver", "price_level", "exposure_users", "click_users", "payment_users", "payment_amt")

      val key = "key"
      val values = Array("ctr", "cvr", "ctcvr", "arpu")

      val themeInfoCtrDF = themeInfoDF
        .withColumn(key, concat_ws("-", col("groupId"), col("theme_ver")))
        .withColumn("ctr", col("click_users") / col("exposure_users"))
        .withColumn("cvr", col("payment_users") / col("click_users"))
        .withColumn("ctcvr", col("payment_users") / col("exposure_users"))
        .withColumn("arpu", col("payment_amt") / col("exposure_users"))
        .selectExpr(key, "groupId", "themeId", "theme_ver", "price_level", "exposure_users", "click_users", "payment_users", "payment_amt", "ctr", "cvr", "ctcvr", "arpu")

      // For every group key, attach the alpha/beta parameters of each metric
      // (plus mean and variance for ctr only, which are shown in the final output).
      val perKeyParamsDF = values.foldLeft(themeInfoCtrDF.select(key).distinct()) { (acc, value) =>
        val meanAndVarianceMap: Map[String, Array[Double]] = evaluateMeanAndVariance(key, value, themeInfoCtrDF)
        val alphaAndBetaMap: Map[String, Array[Double]] = evaluateAlphaAndBeta(meanAndVarianceMap)

        // Lookup UDFs; a key missing from the driver-side maps yields 0.0.
        val getMeanUDF = udf((k: String) => meanAndVarianceMap.getOrElse(k, Array(0.0, 0.0))(0))
        val getVarianceUDF = udf((k: String) => meanAndVarianceMap.getOrElse(k, Array(0.0, 0.0))(1))
        val getAlphaUDF = udf((k: String) => alphaAndBetaMap.getOrElse(k, Array(0.0, 0.0))(0))
        val getBetaUDF = udf((k: String) => alphaAndBetaMap.getOrElse(k, Array(0.0, 0.0))(1))

        val withStats =
          if (value == "ctr") {
            acc
              .withColumn(value + "_mean", getMeanUDF(col(key)))
              .withColumn(value + "_var", getVarianceUDF(col(key)))
          } else {
            acc
          }

        withStats
          .withColumn(value + "_a", getAlphaUDF(col(key)))
          .withColumn(value + "_b", getBetaUDF(col(key)))
      }.select(key, "ctr_mean", "ctr_var", "ctr_a", "ctr_b", "cvr_a", "cvr_b", "ctcvr_a", "ctcvr_b", "arpu_a", "arpu_b")

      // Join the parameters back onto the rows and compute the smoothed metrics:
      //   smoothed = (numerator + alpha) / (denominator + alpha + beta)
      val resultDF = themeInfoCtrDF.join(perKeyParamsDF, Seq(key))
        .select(key, "groupId", "themeId", "theme_ver", "price_level", "exposure_users", "click_users", "payment_users", "payment_amt",
          "ctr", "cvr", "ctcvr", "arpu",
          "ctr_mean", "ctr_var",
          "ctr_a", "ctr_b", "cvr_a", "cvr_b", "ctcvr_a", "ctcvr_b", "arpu_a", "arpu_b")
        .withColumn("b_ctr", (col("click_users") + col("ctr_a")) / (col("exposure_users") + col("ctr_a") + col("ctr_b")))
        .withColumn("b_cvr", (col("payment_users") + col("cvr_a")) / (col("click_users") + col("cvr_a") + col("cvr_b")))
        .withColumn("b_ctcvr", (col("payment_users") + col("ctcvr_a")) / (col("exposure_users") + col("ctcvr_a") + col("ctcvr_b")))
        // Uses the arpu parameters; the previous version mistakenly reused ctcvr_a / ctcvr_b here.
        // NOTE(review): arpu is not bounded to [0, 1], so a Beta-style prior is a heuristic for
        // this metric - confirm that this is the intended smoothing.
        .withColumn("b_arpu", (col("payment_amt") + col("arpu_a")) / (col("exposure_users") + col("arpu_a") + col("arpu_b")))
        .select(key, "groupId", "themeId", "theme_ver", "price_level", "exposure_users", "click_users", "payment_users", "payment_amt",
          "ctr", "cvr", "ctcvr", "arpu",
          "ctr_mean", "ctr_var",
          "ctr_a", "ctr_b",
          "b_ctr", "b_cvr", "b_ctcvr", "b_arpu")

      resultDF.show()
    } finally {
      // Release the local Spark resources even if the job fails.
      spark.stop()
    }
  }


  /**
    * Derives Beta-distribution parameters from per-key mean and variance by the method of
    * moments:
    *
    * {{{
    *   strength = mean * (1 - mean) / variance - 1
    *   alpha    = mean * strength
    *   beta     = (1 - mean) * strength
    * }}}
    *
    * When the estimate is undefined or not a valid Beta prior, the result for that key is
    * `Array(0.0, 0.0)`, so smoothing with it leaves the raw ratio unchanged. That covers a
    * zero, negative or NaN variance, a zero or NaN mean, and a non-positive or infinite
    * `strength`. Previously a zero variance produced `Infinity` parameters and therefore
    * `NaN` smoothed metrics.
    *
    * @param meanAndVarianceMap key -> `Array(mean, variance)`
    * @return key -> `Array(alpha, beta)`
    */
  def evaluateAlphaAndBeta(meanAndVarianceMap: Map[String, Array[Double]]): Map[String, Array[Double]] = {
    meanAndVarianceMap.map { case (key, meanAndVariance) =>
      val mean = meanAndVariance(0)
      val variance = meanAndVariance(1)

      // `variance > 0` is false for NaN as well, so undefined variances take the fallback.
      val strength = if (variance > 0) mean * (1 - mean) / variance - 1 else 0.0

      // `strength > 0` is false for NaN; infinite values are rejected explicitly.
      val alphaAndBeta =
        if (strength > 0 && !strength.isInfinity) Array(mean * strength, (1 - mean) * strength)
        else Array(0.0, 0.0)

      key -> alphaAndBeta
    }
  }

  /**
    * Computes the average and variance of one metric column per group key and collects the
    * result to the driver.
    *
    * The key column is (re)built as `groupId-theme_ver`, so `themeInfoCtrDF` must contain the
    * `groupId` and `theme_ver` columns in addition to `value`.
    *
    * A NULL aggregate is returned as `Double.NaN` instead of failing the pattern match with a
    * `scala.MatchError`. NOTE(review): per the Spark 3.1 migration notes, sample variance of a
    * single-row group is NULL from 3.1 on and was NaN before - confirm against the Spark
    * version in use.
    *
    * @param key            name of the grouping column to create and group by
    * @param value          name of the numeric metric column to aggregate
    * @param themeInfoCtrDF input rows
    * @return key value -> `Array(mean, variance)`
    */
  def evaluateMeanAndVariance(key: String, value: String, themeInfoCtrDF: DataFrame): Map[String, Array[Double]] = {
    import org.apache.spark.sql.functions._

    val statsDF = themeInfoCtrDF
      .withColumn(key, concat_ws("-", col("groupId"), col("theme_ver")))
      .groupBy(key)
      .agg(avg(value) as "mean", variance(value) as "variance")
      .select(key, "mean", "variance")

    // NULL-safe read of a double column; NULL means the statistic is undefined.
    def doubleOrNaN(row: Row, idx: Int): Double =
      if (row.isNullAt(idx)) Double.NaN else row.getDouble(idx)

    statsDF.collect().map { row =>
      row.getString(0) -> Array(doubleOrNaN(row, 1), doubleOrNaN(row, 2))
    }.toMap
  }
}

测试数据:
P3105 10008611 10 0 100 10 1 2
P3105 10008612 10 1 3000 150 10 30
P3105 10008613 10 0 4000 400 20 40
P3105 10008614 10 1 5000 1000 100 300
P3105 10008615 10 0 10000 1000 200 600
P3302 10008811 10 0 100 10 1 3
P3302 10008812 10 1 3000 150 10 20
P3302 10008813 10 0 4000 400 20 40
P3302 10008814 10 1 5000 1000 100 300
P3302 10008815 10 0 10000 1000 200 400
P6105 10009911 10 0 100 10 1 4
P6105 10009912 10 1 3000 150 10 30
P6105 10009913 10 0 4000 400 20 50
P6105 10009914 10 1 5000 1000 100 300
P6105 10009915 10 0 10000 1000 200 400
P6106 10006611 10 0 1000 500 50 600
P6106 10006612 10 0 10000 1000 200 800
P6107 10006611 10 0 10000 1000 200 800
P6107 10006612 10 0 10000 1000 200 800

测试结果:
+--------+-------+--------+---------+-----------+--------------+-----------+-------------+-----------+----+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+
|     key|groupId| themeId|theme_ver|price_level|exposure_users|click_users|payment_users|payment_amt| ctr|                cvr|               ctcvr|                arpu|           ctr_mean|            ctr_var|             ctr_a|             ctr_b|              b_ctr|               b_cvr|             b_ctcvr|              b_arpu|
+--------+-------+--------+---------+-----------+--------------+-----------+-------------+-----------+----+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+
|P3105-10|  P3105|10008611|       10|          0|           100|         10|            1|          2| 0.1|                0.1|                0.01|                0.02|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|  0.102403140035452|  0.1024160952724648|0.011070317361543248|0.014648413192283759|
|P3105-10|  P3105|10008612|       10|          1|          3000|        150|           10|         30|0.05|0.06666666666666667|0.003333333333333...|                0.01|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.05062606515739591| 0.07214376022608955|0.003803741384968...|0.010094081610327097|
|P3105-10|  P3105|10008613|       10|          0|          4000|        400|           20|         40| 0.1|               0.05|               0.005|                0.01|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10007846282317341|0.053295138194626016|0.005286284953395472|0.010071571238348868|
|P3105-10|  P3105|10008614|       10|          1|          5000|       1000|          100|        300| 0.2|                0.1|                0.02|                0.06|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.19943417975607652| 0.10008554981248043| 0.01971123497414546| 0.05832516285004365|
|P3105-10|  P3105|10008615|       10|          0|         10000|       1000|          200|        600| 0.1|                0.2|                0.02|                0.06|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10003153358210194|  0.1975190554380674|0.019853071823959543| 0.05914781657896534|
|P6105-10|  P6105|10009911|       10|          0|           100|         10|            1|          4| 0.1|                0.1|                0.01|                0.04|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|  0.102403140035452|  0.1024160952724648|0.011070317361543248| 0.02180460485376478|
|P6105-10|  P6105|10009912|       10|          1|          3000|        150|           10|         30|0.05|0.06666666666666667|0.003333333333333...|                0.01|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.05062606515739591| 0.07214376022608955|0.003803741384968...|0.010094081610327097|
|P6105-10|  P6105|10009913|       10|          0|          4000|        400|           20|         50| 0.1|               0.05|               0.005|              0.0125|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10007846282317341|0.053295138194626016|0.005286284953395472|0.012464214380825567|
|P6105-10|  P6105|10009914|       10|          1|          5000|       1000|          100|        300| 0.2|                0.1|                0.02|                0.06|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.19943417975607652| 0.10008554981248043| 0.01971123497414546| 0.05832516285004365|
|P6105-10|  P6105|10009915|       10|          0|         10000|       1000|          200|        400| 0.1|                0.2|                0.02|                0.04|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10003153358210194|  0.1975190554380674|0.019853071823959543| 0.03950044420146245|
|P6106-10|  P6106|10006611|       10|          0|          1000|        500|           50|        600| 0.5|                0.1|                0.05|                 0.6|                0.3|0.08000000000000002|0.4874999999999998|1.1374999999999995| 0.4996755272681892|  0.1023355576739752| 0.04896575803031086|  0.5610435524750427|
|P6106-10|  P6106|10006612|       10|          0|         10000|       1000|          200|        800| 0.1|                0.2|                0.02|                0.08|                0.3|0.08000000000000002|0.4874999999999998|1.1374999999999995|0.10003249471960805| 0.19880429477794048|0.020110266746813873|  0.0796691997595584|
|P6107-10|  P6107|10006611|       10|          0|         10000|       1000|          200|        800| 0.1|                0.2|                0.02|                0.08|                0.1|                0.0|          Infinity|          Infinity|                NaN|                 NaN|                 NaN|                 NaN|
|P6107-10|  P6107|10006612|       10|          0|         10000|       1000|          200|        800| 0.1|                0.2|                0.02|                0.08|                0.1|                0.0|          Infinity|          Infinity|                NaN|                 NaN|                 NaN|                 NaN|
|P3302-10|  P3302|10008811|       10|          0|           100|         10|            1|          3| 0.1|                0.1|                0.01|                0.03|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|  0.102403140035452|  0.1024160952724648|0.011070317361543248| 0.01822650902302427|
|P3302-10|  P3302|10008812|       10|          1|          3000|        150|           10|         20|0.05|0.06666666666666667|0.003333333333333...|0.006666666666666667|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.05062606515739591| 0.07214376022608955|0.003803741384968...| 0.00694891149764796|
|P3302-10|  P3302|10008813|       10|          0|          4000|        400|           20|         40| 0.1|               0.05|               0.005|                0.01|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10007846282317341|0.053295138194626016|0.005286284953395472|0.010071571238348868|
|P3302-10|  P3302|10008814|       10|          1|          5000|       1000|          100|        300| 0.2|                0.1|                0.02|                0.06|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.19943417975607652| 0.10008554981248043| 0.01971123497414546| 0.05832516285004365|
|P3302-10|  P3302|10008815|       10|          0|         10000|       1000|          200|        400| 0.1|                0.2|                0.02|                0.04|0.11000000000000001|              0.003| 3.479666666666668|28.153666666666673|0.10003153358210194|  0.1975190554380674|0.019853071823959543| 0.03950044420146245|
+--------+-------+--------+---------+-----------+--------------+-----------+-------------+-----------+----+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值