Producing data to Parquet and Hive tables with Spark

1. Running a script with spark-shell

spark-shell effectively defines an Object with a main() for you and runs your code inside it, so there is no need to define an extra Object yourself.

test.scala

//import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession

// spark-shell already provides a SparkSession named `spark`; getOrCreate() simply reuses it
val spark = SparkSession.builder().getOrCreate()
spark.sparkContext.setLogLevel("WARN")

// Build an RDD of whitespace-separated records: id, name, gender, age
val rdd = spark.sparkContext.parallelize(Array("1 xiaoMing M 26", "2 XiongDa M 27")).map(_.split(" "))
rdd.take(5)

// Convert each Array[String] into a typed Row
val rowRDD = rdd.map(p => Row(p(0).toInt, p(1).trim, p(2).trim, p(3).toInt))
rowRDD.take(5).foreach(println)

Invoke it with:

spark-shell -i test.scala
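
If a spark-shell session is already open, the same script can be loaded interactively with the REPL's :load command:

scala> :load test.scala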

2. Workflow overview

Data for each module is written to its own group under a Parquet output path; the groups are then unioned into a new pivot Parquet file, which is finally loaded into Hive.

2.1 Overall structure

run.scala

import java.util.Date
import java.time.LocalDate
import java.time.format.DateTimeFormatter

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._  // col, sum, countDistinct, plus when/max/lit etc. used below
import org.apache.spark.sql.types._

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._  // enables the $"col" and 'col column syntax used below
val endDate = LocalDate.parse("2021-08-31", DateTimeFormatter.ofPattern("yyyy-MM-dd"))

val df = read_parquet(spark, endDate)

spark.stop()
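
The driver above only wires in read_parquet; a rough sketch of chaining the remaining steps follows (the defs from sections 2.2–3 are assumed to be pasted above this point, and the exact columns each step expects are those shown in the corresponding subsection):

val detailDF  = read_parquet(spark, endDate)          // 2.2 read the raw chat detail
val joinedDF  = table_join(spark, endDate, detailDF)  // 2.3 join the aggregated counts back on
val featureDF = dealDF(spark, endDate, joinedDF)      // 2.4 build the nested feature columns
write_parquet(featureDF, endDate)                     // 2.5 write this group's parquet output
generatePivotTable(spark, endDate)                    // 2.6 merge all groups into the pivot table
statistics(spark, endDate).show()                     // 3   per-feature value distribution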

2.2 Reading Parquet files

import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame

def read_parquet(spark: SparkSession, endDate: LocalDate): DataFrame = {
  import spark.implicits._  // needed for the 'pt symbol-column syntax
  val pre7Day = endDate.minusDays(7 - 1)  // 7-day window ending on endDate
  spark.read.parquet("/user/db_dmp/db_dmp.db/chat_detail/")
    .filter('pt >= f"$pre7Day" && 'pt <= f"$endDate")  // prune by the pt partition column
    .select("uid", "fuid", "sourcetype")
    .dropDuplicates()
}
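
A quick sanity check of the returned DataFrame (assuming the path and partition layout above):

val detailDF = read_parquet(spark, LocalDate.parse("2021-08-31"))
detailDF.printSchema()
detailDF.show(5, truncate = false)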

2.3 Table joins

def table_join(spark: SparkSession, endDate: LocalDate, df2: DataFrame): DataFrame = {
    val pre3Day = endDate.minusDays(3 - 1)
    val pre7Day = endDate.minusDays(7 - 1)
    // Count distinct friends per uid over 1/3/7-day windows via Spark SQL
    val sql = f"""
        select uid
        ,count(distinct if(sourcetype='0' and pt='$endDate', fuid, null)) as agree_search_1d_cnt
        ,count(distinct if(sourcetype='0' and pt>='$pre3Day' and pt<='$endDate', fuid, null)) as agree_search_3d_cnt
        ,count(distinct if(sourcetype='0' and pt>='$pre7Day' and pt<='$endDate', fuid, null)) as agree_search_7d_cnt
        from zz.zx_user_action_agree_d_incr
        where pt>='$pre7Day' and pt<='$endDate'
        group by uid
    """
    val df1 = spark.sql(sql)
    df1.show()

    // Left-join the per-uid counts onto the detail DataFrame df2, which is expected
    // to carry uid, fuid and sourcetype (e.g. the output of read_parquet); after the
    // group by, df1 only has uid, so the join key is uid alone
    val df = df1.join(df2, df1("uid") === df2("uid"), "left")
      .select(df1("uid"), df2("fuid"), df2("sourcetype"))
    df
}
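
When one side of the join is much smaller than the other (here the per-uid aggregate df1 versus the full detail df2), a broadcast hint can avoid a shuffle. A minimal sketch, reusing df1 and df2 from the function above (not part of the original script):

import org.apache.spark.sql.functions.broadcast

// Ship the smaller, aggregated DataFrame to every executor instead of shuffling both sides
val joined = df2.join(broadcast(df1), Seq("uid"), "left")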

2.4 DataFrame processing

def dealDF(spark: SparkSession, endDate: LocalDate, df: DataFrame): DataFrame = {
    import spark.implicits._
    val pre3Day = endDate.minusDays(3 - 1)
    val pre7Day = endDate.minusDays(7 - 1)
    val result = df.groupBy("uid")
      .agg(
        countDistinct(when($"sourcetype" === "0" && $"pt" === f"$endDate", $"fuid")) as "search_1",
        countDistinct(when($"sourcetype" === "0" && $"pt" >= f"$pre3Day" && $"pt" <= f"$endDate", $"fuid")) as "search_3",
        countDistinct(when($"sourcetype" === "0" && $"pt" >= f"$pre7Day" && $"pt" <= f"$endDate", $"fuid")) as "search_7",
        countDistinct($"fuid") as "frd_cnt",
        max($"is_active") as "is_active"  // is_active is assumed to exist on the input DataFrame
      ).withColumn(
        // Pack the flat counters into one nested struct column
        "chat_with_friends",
        struct(
          struct(
            col("search_1").as("1d"),
            col("search_3").as("3d"),
            col("search_7").as("7d")
          ).as("search"),
          col("is_active")
        )
      ).select("uid", "chat_with_friends")
    result
}

// Derive hour-of-day and day-of-week features from the raw request timestamp
df.withColumn("chatTime", to_timestamp('request_time, "yyyy-MM-dd HH:mm:ss"))
  .withColumn("chatHour", hour('chatTime))
  .withColumn("chatDayOfWeek", date_format('chatTime, "u"))  // "u" = day-of-week number
  .select("uid", "fuid", "chatHour", "chatDayOfWeek")
  .dropDuplicates()

// Collect the per-row Map columns for each uid and merge them into a single Map
// (the maps are assumed to have disjoint keys, e.g. hour-of-day buckets)
df.groupBy("uid")
  .agg(collect_list("searchCnt"), collect_list("totalCnt"))
  .as[(String, Seq[Map[String, Int]], Seq[Map[String, Int]])]
  .map {
    case (uid, list1, list2) => (
      uid,
      if (list1.isEmpty) null else list1.reduce(_ ++ _),
      if (list2.isEmpty) null else list2.reduce(_ ++ _)
    )
  }.toDF("uid", "search", "total")  // one output name per tuple field
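
The searchCnt / totalCnt map columns above are assumed to exist already. If they still need to be built, the map function from org.apache.spark.sql.functions can pack one key/value pair per row; a hypothetical sketch based on the chatHour column derived earlier (column names are illustrative):

import org.apache.spark.sql.functions.{col, count, map}

// One Map(hour -> count) entry per (uid, chatHour) pair, ready for collect_list above
val hourCntDF = df.groupBy("uid", "chatHour")
  .agg(count("*").as("cnt"))
  .withColumn("totalCnt", map(col("chatHour").cast("string"), col("cnt").cast("int")))
  .select("uid", "totalCnt")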

2.5 Writing Parquet files

def write_parquet(df: DataFrame, pt: LocalDate): Unit = {
  // A small repartition keeps the number of output files per group low
  df.repartition(2).write.mode("overwrite")
    .parquet(f"/user/db_dmp/db_dmp.db/lx_features/pt=$pt/group=add_friends")
}

2.6 Merging groups into a pivot table

def generatePivotTable(spark: SparkSession, pt: LocalDate): Unit = {
  import spark.implicits._
  // Merge all groups under the same pt partition; mergeSchema reconciles the
  // different schemas written by each group
  val df = spark.read.option("mergeSchema", "true")
    .parquet(f"/user/db_dmp/db_dmp.db/lx_features/pt=$pt")
  // Columns whose nulls should be filled with 0
  val zeroPatchList = Array(
    "user_active_1d",
    "user_active_3d",
    "chat_friends_by_hour_total_23")

  val df_result = df.groupBy("uid")
    .agg(
      max($"user_active"("1d")) as "user_active_1d",
      max($"user_active"("3d")) as "user_active_3d",
      max($"add_friends"("scan")("1d")) as "add_friends_scan_1d",
      max($"add_friends"("scan")("3d")) as "add_friends_scan_3d",
      max($"add_friends"("scan")("7d")) as "add_friends_scan_7d"
    ).na.fill(0, zeroPatchList)
  df_result.repartition(20).write.mode("overwrite")
    .parquet(f"/user/db_dmp/db_dmp.db/features_pivot/pt=$pt")
}
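
The overview mentions that the pivot output finally lands in Hive. One common approach is to register the day's partition of the Parquet path in a Hive external table; a sketch assuming a hypothetical table db_dmp.features_pivot created over the path above and partitioned by pt (the table name and DDL are not part of the original script):

// Hypothetical: make the freshly written partition visible to Hive queries
spark.sql(f"""
  ALTER TABLE db_dmp.features_pivot
  ADD IF NOT EXISTS PARTITION (pt='$endDate')
  LOCATION '/user/db_dmp/db_dmp.db/features_pivot/pt=$endDate'
""")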

The usage is similar to the PySpark case; see the companion post: pyspark实现数据生产到parquet及hive表 (producing data to Parquet and Hive tables with PySpark).

3. Data statistics

def statistics(spark: SparkSession, pt: LocalDate): DataFrame = {
  val df = spark.read.option("mergeSchema", "true")
    .parquet(f"/user/db_dmp/db_dmp.db/features_pivot/pt=$pt")
  // For each feature column, count how many users fall into each positive value bucket
  val cols = List("agree_frd_1d_cnt", "agree_kdy_7d_cnt")
  cols.map { colName =>
    df.filter(col(colName) > 0)
      .groupBy(colName)
      .count()
      .withColumnRenamed(colName, "key")
      .withColumnRenamed("count", "cnt")
      .withColumn("tag", lit(colName))
      .select("tag", "key", "cnt")
  }.reduce((x, y) => x.union(y))  // union replaces the deprecated unionAll
}
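
A quick way to inspect the per-feature distribution (assuming the pivot output for 2021-08-31 exists):

val statDF = statistics(spark, LocalDate.parse("2021-08-31"))
statDF.orderBy("tag", "key").show(100, truncate = false)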
