package org.example
import org.apache.spark.ml.clustering.{
KMeans, KMeansModel}
import org.apache.spark.ml.feature.{
MinMaxScaler, VectorAssembler}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{
Column, DataFrame, Dataset, Row, SaveMode, SparkSession}
import java.text.{
ParseException, SimpleDateFormat}
import java.util.{
Calendar, Date}
import scala.collection.immutable
/**
* Desc 用户活跃度模型-RFE
* Recency:最近一次访问时间,用户最后一次访问距今时间
* Frequency:访问频率,用户一段时间内的访问次数,
* Engagements:页面互动度,用户一段时间内访问的页面浏览量,或者独立页面数、下载量、 视频播放数量等
*/
object QlRFE_Hive {
def main(args: Array[String]): Unit = {
val sparkSession = SparkSession.builder()
.appName("QlRFE_Active")
.enableHiveSupport()
.getOrCreate()
sparkSession.sparkContext.setLogLevel("WARN")
//获取主程序传递参数
if (args.length != 1) {
println(
"""
|Usage:qlRFE_Hive
|Param:
| in_day: 计算日期
|format: yyyyMMdd
|
|""".stripMargin)
}
val Array(in_day) = args
//获取前一个月日期
val dateFormat = new SimpleDateFormat("yyyyMMdd")
val cal_day: Calendar = Calendar.getInstance()
try {
cal_day.setTime(dateFormat.parse(in_day))
} catch {
case e: ParseException => println(e.printStackTrace())
println(
"""
|format: yyyyMMdd
|""".stripMargin)
System.exit(1)
}
cal_day.add(Calendar.MONTH, -1)
val last_m_day: String = dateFormat.format(cal_day.getTime) //上月同一天
val this_month_part: String = in_day.substring(0, 6) //本月
val this_day_part: String = in_day.substring(6) //本月
val last_month_part: String = last_m_day.substring(0, 6) //上月月份字段
val last_day_part: String = last_m_day.substring(6) //上月同一天日期字段
//读取hive表数据(近一个月数据:上月同期至当天)
val qlRfe_lastMonth: DataFrame = sparkSession.sql(
s"""
|SELECT day_id,
| user_name,
| last_visittime,
| visitor_cnt,
| visitor_pagecnt
| from edc.dwd_qlsys_user_d
| where month_part= '${last_month_part}'
| and day_part>= '${last_day_part}'
|""".stripMargin)
val qlRfe_ThisMonth: DataFrame = sparkSession.sql(
s"""
|SELECT day_id,
| user_name,
| last_visittime,
| visitor_cnt,
| visitor_pagecnt
| from edc.dwd_qlsys_user_d
| where month_part= '${this_month_part}'
| and day_part<= '${this_day_part}'
|""".stripMargin)
val qlRfe_table: Dataset[Row] = qlRfe_lastMonth.union(qlRfe_ThisMonth)
/* qlRfe_table.printSchema()
qlRfe_table.createOrReplaceTempView("qlrfe")
sparkSession.sql("select max(day_id),min(day_id) from qlrfe").show()*/
import sparkSession.implicits._
// import scala.collection.JavaConversions._
import org.apache.spark.sql.functions._
/**
* Recency:最近一次访问时间,用户最后一次访问距今间隔天数
* Frequency:访问频率,用户一段时间内的访问次数,
* Engagements:页面互动度,用户一段时间内访问的页面浏览量
*/
val recencyAggStr: Column = datediff(to_date(max(col("day_id").cast("String")), "yyyyMMdd"),
to_date(max("last_visittime"), "yyyy-MM-dd")).as("recencyStr")
val frequencyAggStr: Column = sum("visitor_cnt").as("frequencyStr")
val engagementsAggStr: Column = sum("visitor_pagecnt").as("engagementsStr")
val rfe_result: DataFrame = qlRfe_table.groupBy("user_name")
.agg(max("day_id").as("day_id"), recencyAggStr, frequencyAggStr, engagementsAggStr)
.select("day_id", "user_name", "recencyStr", "frequencyStr", "engagementsStr")
/**
* +--------+------------+----------+------------+--------------+
* | day_id| user_name|recencyStr|frequencyStr|engagementsStr|
* +--------+------------+----------+------------+--------------+
* |20210812| cao_hq601| null| null| null|
* |20210812| chen_j| null| 5.0| 152.0|
* |20210812| gao_pf| 0| 87.0| 6410.0|
* |20210812| guo_xw| null| 10.0| 131.0|
* |20210812| li_dq601| 0| 13.0| 1069.0|
* |20210812| liu_l603| null| 54.0| 4605.0|
* |20210812| wang_c5| 0| 32.0| 768.0|
* |20210812| wang_jm3| null| null| null|
* |20210812| zhang_jk| 0| 26.0| 745.0|
*/
/**
* rfe规则打分
*/
val recencyStr = "recencyStr"
val frequencyStr = "frequencyStr"
val engagementsStr = "engagementsStr"
/*val maxRow: util.List[Row] = rfe_result.agg(max(recencyStr).as(recencyStr), max(frequencyStr).as(frequencyStr), max(engagementsStr).as(engagementsStr)).collectAsList()
val maxRecency: Int = maxRow.get(0).getAs[Int](recencyStr)
val maxFrequency: Int = maxRow.get(0).getAs[Double](frequencyStr).asInstanceOf[Int]
val maxEngagements: Int = maxRow.get(0).getAs[Double](engagementsStr).asInstanceOf[Int]*/
val recencyScore: Column = when(col(recencyStr).between(0, 7), 5)
.when(col(recencyStr).between(8, 14), 4)
.when(col(recencyStr).between(
// --- Non-code residue from the original web capture (preserved as comments) ---
// RFE用户活跃度模型
// 最新推荐文章于 2024-06-28 07:30:00 发布
// NOTE(review): source is truncated here — the `.when(col(recencyStr).between(` chain
// on the line above is cut mid-expression, and the KMeans/MinMaxScaler/VectorAssembler
// pipeline implied by the imports is missing. Recover the full original before building.