1. Running scripts with spark-shell
spark-shell effectively defines an Object with a main() for you and executes your code inside it, so there is no need to define an Object yourself.
test.scala
//import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession

// spark-shell already provides a session; getOrCreate() returns it
val spark = SparkSession.builder().getOrCreate()
spark.sparkContext.setLogLevel("WARN")

// build an RDD of split records: id, name, gender, age
val rdd = spark.sparkContext.parallelize(Array("1 xiaoMing M 26", "2 XiongDa M 27")).map(_.split(" "))
rdd.take(5)

// convert each record into a Row with typed fields
val rowRDD = rdd.map(p => Row(p(0).toInt, p(1).trim, p(2).trim, p(3).toInt))
rowRDD.take(5).foreach(println)
Invocation:
spark-shell -i test.scala
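Note that with -i the shell stays in interactive mode after the script finishes; to exit automatically, either pipe the file instead (spark-shell < test.scala) or end the script with System.exit(0).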
2. Process overview
Data for each module is written to its own group under the parquet path; the groups are then merged via union to produce a consolidated parquet file, which is finally loaded into Hive.
2.1 Overall
run.scala
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, sum, countDistinct}
import org.apache.spark.sql.types._

val spark = SparkSession.builder().getOrCreate()
val endDate = LocalDate.parse("2021-08-31", DateTimeFormatter.ofPattern("yyyy-MM-dd"))
val df = read_parquet(spark, endDate)
spark.stop()
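run.scala above only exercises read_parquet. As a rough sketch (not the author's exact driver), the functions defined in sections 2.2–2.6 and 3 below could be chained like this, assuming the signatures used in those sections:

// hypothetical driver wiring for the full flow of section 2
val detail   = read_parquet(spark, endDate)    // 2.2: read the 7-day detail data
val joined   = table_join(spark, endDate)      // 2.3: join detail rows with SQL aggregates
val features = dealDF(spark, detail, endDate)  // 2.4: aggregate into struct columns
write_parquet(features, endDate)               // 2.5: write one group's parquet output
generatePivotTable(endDate)                    // 2.6: merge all groups into the pivot table
statistics(endDate).show()                     // 3:   sanity-check the result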
2.2 Reading the parquet file
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame

def read_parquet(spark:SparkSession, endDate:LocalDate):DataFrame = {
  import spark.implicits._              // enables the 'pt column syntax
  val pre7Day = endDate.minusDays(7-1)  // 7-day window including endDate
  spark.read.parquet("/user/db_dmp/db_dmp.db/chat_detail/")
    .filter('pt >= f"$pre7Day" && 'pt <= f"$endDate")
    .select("uid", "fuid", "sourcetype")
    .dropDuplicates()
}
2.3 Table join
def table_join(spark:SparkSession, endDate:LocalDate):DataFrame = {
  val pre3Day = endDate.minusDays(3-1)
  val pre7Day = endDate.minusDays(7-1)
  val sql = f"""
    select uid
          ,count(distinct if(sourcetype='0' and pt='$endDate', fuid, null)) as agree_search_1d_cnt
          ,count(distinct if(sourcetype='0' and pt>='$pre3Day' and pt<='$endDate', fuid, null)) as agree_search_3d_cnt
          ,count(distinct if(sourcetype='0' and pt>='$pre7Day' and pt<='$endDate', fuid, null)) as agree_search_7d_cnt
    from zz.zx_user_action_agree_d_incr
    where pt>='$pre7Day' and pt<='$endDate'
    group by uid
  """
  val df1 = spark.sql(sql)
  df1.show()
  // df2 holds the (uid, fuid, sourcetype) detail rows, e.g. from read_parquet (section 2.2)
  val df2 = read_parquet(spark, endDate)
  // left join the per-uid aggregates onto the detail rows; uid is the only shared key
  val df = df2.join(df1, df2("uid") === df1("uid"), "left")
    .select(df2("uid"), df2("fuid"), df2("sourcetype"), df1("agree_search_7d_cnt"))
  df
}
2.4 DataFrame processing
def dealDF(spark:SparkSession, df:DataFrame, endDate:LocalDate):DataFrame = {
  import spark.implicits._
  import org.apache.spark.sql.functions._
  val pre3Day = endDate.minusDays(3-1)
  val pre7Day = endDate.minusDays(7-1)
  df.groupBy("uid")
    .agg(
      // when(...) without otherwise yields null for unmatched rows, which countDistinct ignores
      countDistinct(when($"sourcetype" === "0" && $"pt" === f"$endDate", $"fuid")) as "search_1",
      countDistinct(when($"sourcetype" === "0" && $"pt" >= f"$pre3Day" && $"pt" <= f"$endDate", $"fuid")) as "search_3",
      countDistinct(when($"sourcetype" === "0" && $"pt" >= f"$pre7Day" && $"pt" <= f"$endDate", $"fuid")) as "search_7",
      countDistinct($"fuid") as "frd_cnt"
    ).withColumn(
      "chat_with_friends",
      struct(
        struct(
          col("search_1").as("1d"),
          col("search_3").as("3d"),
          col("search_7").as("7d")
        ).as("search"),
        (col("frd_cnt") > 0).as("is_active")  // derive the is_active flag from frd_cnt
      )
    ).select("uid", "chat_with_friends")
}
Alternatively:
df.withColumn("chatTime", to_timestamp('request_time, "yyyy-MM-dd HH:mm:ss"))
.withColumn("chatHour", hour('chatTime))
.withColumn("chatDayOfWeek", date_format('chatTime, "u"))
.select("uid", "fuid", "chatHour", "chatDayOfWeek")
.dropDuplicates()
df.groupBy("uid")
.agg(collect_list("searchCnt"), collect_list("totalCnt"))
.as[(String, Seq[Map[String, Integer]], Seq[Map[String, Integer]])]
.map{
case (uid, list1, list2) => (
uid,
if(list1.size==0) null else list1.reduce(_ ++ _),
if(list2.size==0) null else list2.reduce(_ ++ _)
)
}.toDF("uid", "search", "nearby", "kdy", "total")
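In the second snippet, collect_list gathers each uid's per-row Map values into a Seq, and reduce(_ ++ _) folds them into a single Map; on duplicate keys the right-hand (later) value wins. A minimal illustration:

// Map concatenation keeps the right-hand value on duplicate keys
Seq(Map("0" -> 1), Map("1" -> 2), Map("1" -> 3)).reduce(_ ++ _)
// => Map(0 -> 1, 1 -> 3)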
2.5 Writing the parquet file
def write_parquet(df:DataFrame, pt:LocalDate):Unit = {
  df.repartition(2).write.mode("overwrite")
    .parquet(f"/user/db_dmp/db_dmp.db/lx_features/pt=${pt}/group=add_friends")
}
2.6 Merging groups to generate the pivot table
def generatePivotTable(pt:LocalDate):Unit = {
  // merge the groups: mergeSchema reconciles the differing schemas across group= dirs
  val df = spark.read.option("mergeSchema", "true")
    .parquet(f"/user/db_dmp/db_dmp.db/lx_features/pt=$pt")
  // fields whose nulls should be filled with 0
  val zeroPatchList = Array(
    "user_active_1d",
    "user_active_3d",
    "chat_friends_by_hour_total_23")
  val df_result = df.groupBy("uid")
    .agg(
      max($"user_active"("1d")) as "user_active_1d",
      max($"user_active"("3d")) as "user_active_3d",
      max($"add_friends"("scan")("1d")) as "add_friends_scan_1d",
      max($"add_friends"("scan")("3d")) as "add_friends_scan_3d",
      max($"add_friends"("scan")("7d")) as "add_friends_scan_7d"
    ).na.fill(0, zeroPatchList)
  df_result.repartition(20).write.mode("overwrite")
    .parquet(f"/user/db_dmp/db_dmp.db/features_pivot/pt=$pt")
}
This mirrors the pyspark workflow; for reference, see: pyspark实现数据生产到parquet及hive表.
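The final Hive landing step is not shown above. A minimal sketch, assuming a Hive-enabled SparkSession and a hypothetical external table db_dmp.features_pivot partitioned by pt, would register the parquet output as a new partition:

// minimal sketch: attach the parquet output as a Hive partition
// (db_dmp.features_pivot is a hypothetical table name)
spark.sql(f"""
  ALTER TABLE db_dmp.features_pivot ADD IF NOT EXISTS PARTITION (pt='$pt')
  LOCATION '/user/db_dmp/db_dmp.db/features_pivot/pt=$pt'
""")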
3. Data statistics
def statistics(pt:LocalDate):DataFrame = {
  val df = spark.read.option("mergeSchema", "true")
    .parquet(f"/user/db_dmp/db_dmp.db/features_pivot/pt=${pt}")
  val cols = List("agree_frd_1d_cnt", "agree_kdy_7d_cnt")
  // build one long-format (tag, key, cnt) frame per column, then union them
  cols.map(colName => {
    df.filter(col(colName) > 0)
      .groupBy(colName)
      .count()
      .withColumnRenamed(colName, "key")
      .withColumnRenamed("count", "cnt")
      .withColumn("tag", lit(colName))
      .select("tag", "key", "cnt")
  }).reduce((x, y) => x union y)
}
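A quick way to eyeball the distributions, assuming the pivot table for the given date already exists:

// print per-value counts for each statistics column
statistics(LocalDate.parse("2021-08-31")).show(20, false)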