RFE用户活跃度模型

package org.example

import org.apache.spark.ml.clustering.{
   KMeans, KMeansModel}
import org.apache.spark.ml.feature.{
   MinMaxScaler, VectorAssembler}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{
   Column, DataFrame, Dataset, Row, SaveMode, SparkSession}

import java.text.{
   ParseException, SimpleDateFormat}
import java.util.{
   Calendar, Date}
import scala.collection.immutable

/**
 * Desc 用户活跃度模型-RFE
 * Recency:最近一次访问时间,即用户最后一次访问距今的间隔天数
 * Frequency:访问频率,即用户一段时间内的访问次数
 * Engagements:页面互动度,即用户一段时间内访问的页面浏览量,也可以是独立页面数、下载量、视频播放数量等
 */

object QlRFE_Hive {
   

  def main(args: Array[String]): Unit = {
   
    val sparkSession = SparkSession.builder()
      .appName("QlRFE_Active")
      .enableHiveSupport()
      .getOrCreate()
    sparkSession.sparkContext.setLogLevel("WARN")

    //获取主程序传递参数
    if (args.length != 1) {
   
      println(
        """
          |Usage:qlRFE_Hive
          |Param:
          |      in_day:  计算日期
          |format: yyyyMMdd
          |
          |""".stripMargin)
    }
    val Array(in_day) = args

    //获取前一个月日期
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val cal_day: Calendar = Calendar.getInstance()
    try {
   
      cal_day.setTime(dateFormat.parse(in_day))
    } catch {
   
      case e: ParseException => println(e.printStackTrace())
        println(
          """
            |format: yyyyMMdd
            |""".stripMargin)
        System.exit(1)
    }
    cal_day.add(Calendar.MONTH, -1)
    val last_m_day: String = dateFormat.format(cal_day.getTime) //上月同一天
    val this_month_part: String = in_day.substring(0, 6) //本月
    val this_day_part: String = in_day.substring(6) //本月
    val last_month_part: String = last_m_day.substring(0, 6) //上月月份字段
    val last_day_part: String = last_m_day.substring(6) //上月同一天日期字段

    //读取hive表数据(近一个月数据:上月同期至当天)
    val qlRfe_lastMonth: DataFrame = sparkSession.sql(
      s"""
         |SELECT day_id,
         |       user_name,
         |       last_visittime,
         |       visitor_cnt,
         |       visitor_pagecnt
         |  from edc.dwd_qlsys_user_d
         | where month_part= '${last_month_part}'
         |   and day_part>= '${last_day_part}'
         |""".stripMargin)
    val qlRfe_ThisMonth: DataFrame = sparkSession.sql(
      s"""
         |SELECT day_id,
         |       user_name,
         |       last_visittime,
         |       visitor_cnt,
         |       visitor_pagecnt
         |  from edc.dwd_qlsys_user_d
         | where month_part= '${this_month_part}'
         |   and day_part<= '${this_day_part}'
         |""".stripMargin)
    val qlRfe_table: Dataset[Row] = qlRfe_lastMonth.union(qlRfe_ThisMonth)

    /*    qlRfe_table.printSchema()
        qlRfe_table.createOrReplaceTempView("qlrfe")
        sparkSession.sql("select max(day_id),min(day_id) from qlrfe").show()*/

    import sparkSession.implicits._
    //    import scala.collection.JavaConversions._
    import org.apache.spark.sql.functions._

    /**
     * Recency:最近一次访问时间,用户最后一次访问距今间隔天数
     * Frequency:访问频率,用户一段时间内的访问次数,
     * Engagements:页面互动度,用户一段时间内访问的页面浏览量
     */
    val recencyAggStr: Column = datediff(to_date(max(col("day_id").cast("String")), "yyyyMMdd"),
      to_date(max("last_visittime"), "yyyy-MM-dd")).as("recencyStr")
    val frequencyAggStr: Column = sum("visitor_cnt").as("frequencyStr")
    val engagementsAggStr: Column = sum("visitor_pagecnt").as("engagementsStr")

    val rfe_result: DataFrame = qlRfe_table.groupBy("user_name")
      .agg(max("day_id").as("day_id"), recencyAggStr, frequencyAggStr, engagementsAggStr)
      .select("day_id", "user_name", "recencyStr", "frequencyStr", "engagementsStr")

    /**
     * +--------+------------+----------+------------+--------------+
     * |  day_id|   user_name|recencyStr|frequencyStr|engagementsStr|
     * +--------+------------+----------+------------+--------------+
     * |20210812|   cao_hq601|      null|        null|          null|
     * |20210812|      chen_j|      null|         5.0|         152.0|
     * |20210812|      gao_pf|         0|        87.0|        6410.0|
     * |20210812|      guo_xw|      null|        10.0|         131.0|
     * |20210812|    li_dq601|         0|        13.0|        1069.0|
     * |20210812|    liu_l603|      null|        54.0|        4605.0|
     * |20210812|     wang_c5|         0|        32.0|         768.0|
     * |20210812|    wang_jm3|      null|        null|          null|
     * |20210812|    zhang_jk|         0|        26.0|         745.0|
     */


    /**
     * rfe规则打分
     */
    val recencyStr = "recencyStr"
    val frequencyStr = "frequencyStr"
    val engagementsStr = "engagementsStr"

    /*val maxRow: util.List[Row] = rfe_result.agg(max(recencyStr).as(recencyStr), max(frequencyStr).as(frequencyStr), max(engagementsStr).as(engagementsStr)).collectAsList()
    val maxRecency: Int = maxRow.get(0).getAs[Int](recencyStr)
    val maxFrequency: Int = maxRow.get(0).getAs[Double](frequencyStr).asInstanceOf[Int]
    val maxEngagements: Int = maxRow.get(0).getAs[Double](engagementsStr).asInstanceOf[Int]*/

    val recencyScore: Column = when(col(recencyStr).between(0, 7), 5)
      .when(col(recencyStr).between(8, 14), 4)
      .when(col(recencyStr).between(
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值