1.0 背景
基于上一篇博客的背景, 要求,在一个dataframe中按照不同的时间跨度对item_id进行groupby来统计,最后生成不同时间跨度的df
示例: 从图一转为图二
图一:
图二
2.0 实现方式
2.1 实现基础和一些小原则
在使用Scala完成Spark作业时,应尽量注意一些原则,保证具有Scala风格:
- 尽量不要定义可变的变量var;
- 尽量不要写for循环,基本都可以通过map、reduce来完成;
- 各个方法之间无需通过传递spark参数来传递sparkContext,只需要在同一个spark作业中调用
SparkSession.builder().getOrCreate()
获得;如果没有会创建,如果有直接get;
- 不管是option还是transform,Scala代码尽量将一个操作控制在一行完成,以便于debug定位问题;
- 在定义各种方法时,如果不能确定参数的个数,可以定义一个统一管理参数的方法;
2.2 实现原理
- 获取最大时间间隔过去前180天以内的所有记录;
- 按照itemId进行groupby;
- 将基于日期的方法剥离出来,单独写一个方法, 构建struct结构;
- 独立pv和uv的计算,便于step3调用;
- 将统计结果进行保存,格式为parquet;
2.3 代码
// Command-line configuration for the behavior-statistics job.
//   behaviorBasePath: root path of the user-behavior parquet table, partitioned by date=yyyyMMdd.
//   baseDate: reference date of the statistics window; stays null until --base_date is parsed.
//             NOTE(review): Option[DateTime] would be more idiomatic than null — kept as-is
//             because process() and getStats() read this field directly.
//   dateSpan: maximum look-back window in days (default 180).
//   outputBasePath: root path where the resulting parquet is written.
case class Config(behaviorBasePath: String = "xxxxxx",
baseDate: DateTime = null,
dateSpan: Int = 180,
outputBasePath: String = "xxxx") { }
// Entry point: parse the command-line arguments into a Config and run the job.
// Fails fast with IllegalArgumentException when the arguments do not parse.
def main(args: Array[String]): Unit = {
  // Build the option parser once, then apply it — identical behavior, just named.
  val parser = new scopt.OptionParser[Config](
    "Command for free novel user behavior statistic base time duration") {
    // Root path of the user-behavior table.
    opt[String]("behavior_base_path").text("用户行为表路径")
      .action((x, c) => c.copy(behaviorBasePath = x))
    // Reference date for the statistics window (converted Calendar -> joda DateTime).
    opt[Calendar]('d', "base_date")
      .action((x, c) => c.copy(baseDate = new DateTime(x.getTimeInMillis)))
    // Maximum look-back span in days.
    opt[Int]("date_span").text("设置最大的统计范围/天, 默认180天").action((x, c) => c.copy(dateSpan = x))
    // Root path for the parquet output.
    opt[String]("output_base_path").action((x, c) => c.copy(outputBasePath = x))
  }
  parser.parse(args, Config()) match {
    case Some(config) => process(config)
    case None         => throw new IllegalArgumentException("命令参数不匹配!")
  }
}
/**
 * Runs the statistics job: reads the per-day behavior partitions covering the
 * configured look-back window, unions them into one DataFrame, aggregates per
 * item via getStats and writes the result as parquet.
 *
 * Fixes vs. original: the undefined names N / date / filePath / getDateFormat
 * are replaced by the corresponding Config fields (dateSpan, baseDate,
 * behaviorBasePath), and the output path field name is corrected from the
 * non-existent outputPath to outputBasePath.
 */
def process(config: Config): Unit = {
  require(config.dateSpan > 0, "date_span must be positive")
  // Idempotent: creates the session if absent, otherwise returns the existing one.
  SparkSession.builder().appName("xxx").getOrCreate()
  // One DataFrame per day in the window [baseDate - dateSpan + 1, baseDate], then union them.
  val info = (0 until config.dateSpan)
    .map(d => config.baseDate.minusDays(d))
    .map(beforeDay => {
      val fullPath = s"${config.behaviorBasePath}/date=${beforeDay.toString("yyyyMMdd")}"
      getAllBehaviorPerDay(fullPath, beforeDay)
    })
    .reduce(_ union _)
  val res = getStats(info, config.baseDate)
  res.write.mode("overwrite").parquet(config.outputBasePath)
}
/**
 * Loads one day's behavior partition and normalizes the columns the
 * aggregation step needs: userId, itemId, userType, action, read_time, date.
 *
 * Fixes vs. original:
 *  - the userId filter contained a dangling `&& col("userId")` and an
 *    unbalanced parenthesis (a syntax error); it is now a plain isNotNull check;
 *  - the date pattern uses lower-case "yyyy" (calendar year) instead of "YYYY"
 *    to avoid year-of-era/week-year surprises in the formatter.
 */
def getAllBehaviorPerDay(path: String, date: DateTime): DataFrame = {
  val spark = SparkSession.builder().getOrCreate()
  spark.read.parquet(path)
    .where(col("userId").isNotNull)
    .where(col("action").isNotNull)
    .where(col("itemId").isNotNull)
    // read_time is nested inside the JSON "extra_data" column.
    .withColumn("read_time", get_json_object(col("extra_data"), "$.read_time"))
    // Collapse the boolean new-user flag into a two-valued "new"/"old" label.
    .withColumn("userType", when(col("is_new_user"), lit("new")).otherwise("old"))
    // Stamp each row with its partition date so getStats can filter by window.
    .withColumn("date", lit(date.toString("yyyy-MM-dd")))
    .select("userId", "itemId", "userType", "action", "read_time", "date")
}
/**
 * Aggregates the unioned behavior rows per itemId into one struct column per
 * time span (1/2/3/7/14/21/30/60/90/180 days before baseDate).
 *
 * Fixes vs. original:
 *  - selects/passes the "action" column produced by getAllBehaviorPerDay
 *    instead of the non-existent "behavior_type";
 *  - the aggregated columns are consistently aliased "dayN" so the final
 *    select can resolve them (they were aliased "stats_dayN" before);
 *  - dropped the redundant col("itemId") inside agg — groupBy already keeps
 *    the grouping key in the output;
 *  - date pattern uses "yyyy" instead of "YYYY" for a calendar year.
 */
def getStats(info: DataFrame, baseDate: DateTime): DataFrame = {
  val spans = Array(1, 2, 3, 7, 14, 21, 30, 60, 90, 180)
  // One struct-valued aggregate per span; each holds the span's PV/UV/read-time metrics.
  val aggCols = spans.map { x =>
    getUnitBaseDate(x.toString, baseDate.minusDays(x).toString("yyyy-MM-dd"),
      col("userType"), col("action"), col("date"), col("read_time"), col("userId"))
      .as("day" + x)
  }
  val stats = info.select("userId", "itemId", "userType", "action", "read_time", "date")
    .groupBy("itemId")
    .agg(aggCols.head, aggCols.tail: _*)
  stats.select("itemId", spans.map("day" + _): _*)
}
// Builds the per-span statistics struct ("dayN"): exposure PV for new users,
// total read time split by user type, read PV for new users, and
// shelf UV for old users, all restricted to rows on or after baseDate.
def getUnitBaseDate(n: String, baseDate: String, userType: Column, action: Column, date: Column,
readTime: Column, userId: Column): Column = {
  // Read-time totals per user type; rows of the other type contribute 0.
  val newReadTime = sum(when(userType === "new", readTime).otherwise(0)).cast("long")
  val oldReadTime = sum(when(userType === "old", readTime).otherwise(0)).cast("long")
  struct(
    getPV(baseDate, "new", "expose", "expose", userType, action, date, userId).as("new_expose_pv"),
    newReadTime.as("new_read_time"),
    oldReadTime.as("old_read_time"),
    getPV(baseDate, "new", "read", "read", userType, action, date, userId).as("new_read_pv"),
    getUV(baseDate, "old", "shelf_after_read", "shelf", userType, action, date, userId).as("old_shelf_uv")
  ).as("day" + n)
}
// Page views: counts every matching row (non-distinct userId) where the row's
// date is on/after baseDate, the user type matches, and the action is one of
// the two given labels. Non-matching rows become null and are skipped by count.
def getPV(baseDate: String, user: String, action1: String, action2: String, userType: Column,
action: Column, date: Column, userId: Column): Column = {
  val matches = date >= baseDate && userType === user && (action === action1 || action === action2)
  count(when(matches, userId).otherwise(lit(null))).cast("int")
}
// Unique visitors: same predicate as getPV but counts distinct userIds, so a
// user appearing in several matching rows is counted once.
def getUV(baseDate: String, user: String, action1: String, action2: String, userType: Column,
action: Column, date: Column, userId: Column): Column = {
  val matches = date >= baseDate && userType === user && (action === action1 || action === action2)
  countDistinct(when(matches, userId).otherwise(lit(null))).cast("int")
}
2.4 代码亮点
- agg()里面可以用Array,展开用 : _*
- dataframe可以用where进行过滤,直接是对列的操作,也可以解析json;
- 可以用Try解决异常问题;