1. Process Overview
For the data of the different modules (which are mutually independent), parallel workers each write to a different group under the parquet path; the groups are then unioned into one new parquet output, which is finally loaded into Hive.
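On HDFS the intermediate data therefore ends up as one parquet directory per day, with one sub-directory per module group (the paths follow the examples below):

/user/db_dmp/db_dmp.db/lx_features/pt=2021-08-09/group=agree_frd/
/user/db_dmp/db_dmp.db/lx_features/pt=2021-08-09/group=chat_with_friend/
...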
test.py
# coding=utf-8
import datetime
import multiprocessing

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

if __name__ == '__main__':
    # Optional builder settings, commented out in the original:
    # .appName("Python Spark SQL basic example")
    # .config("user active", "some-value")
    spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .getOrCreate()
    date = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
    # date = '2021-08-09'
    # user_click / user_life_cycle are the per-module write functions (defined elsewhere).
    # Pass the callable and its args separately; target=user_click(spark, date) would
    # call the function right away instead of running it in the child process.
    p1 = multiprocessing.Process(target=user_click, args=(spark, date))
    p2 = multiprocessing.Process(target=user_life_cycle, args=(spark, date))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    spark.stop()
Run: python test.py
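The overview talks about multi-threading, and submitting jobs from several threads of the same SparkSession is a common alternative to multiprocessing, since the heavy lifting runs in the JVM anyway. A minimal threading sketch (user_click / user_life_cycle are the per-module functions assumed to be defined elsewhere):

import threading

t1 = threading.Thread(target=user_click, args=(spark, date))
t2 = threading.Thread(target=user_life_cycle, args=(spark, date))
t1.start()
t2.start()
t1.join()   # wait for both modules before stopping the session
t2.join()
spark.stop()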
2. Module Breakdown
2.1 Reading parquet files
Parquet is used for intermediate storage; its advantage is that the schema travels inside the file, which avoids having to modify an intermediate table over and over.
Note the difference between single and double quotes!
Inside a triple-quoted string, escape with three backslashes (\\\'): \\ yields a literal backslash and \' yields the quote, so the generated SQL contains \'.
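A minimal quoting sketch (db_dmp.some_table and nickname are placeholder names, the values are made up):

pt, last_7dt = '2021-08-09', '2021-08-03'   # example values

# Double quotes delimit the Python string; the single quotes inside are passed through as SQL literals.
cond = "pt>='{0}' and pt<='{1}'".format(last_7dt, pt)

# Inside a triple-quoted string, \\\' produces \' in the generated SQL
# (\\ -> a literal backslash, \' -> the quote itself), which Hive/Spark SQL
# accept as an escaped quote inside a string literal.
sql = """
    select uid
    from db_dmp.some_table
    where nickname = 'it\\\'s me' and pt = '{0}'
""".format(pt)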
def read_parquet(spark, pt):
    # Window starts: 6 days before pt for the 7-day window, 2 days before pt for the 3-day window (both inclusive of pt)
    last_7dt = (datetime.datetime.strptime(str(pt), "%Y-%m-%d") + datetime.timedelta(days=-(7 - 1))).strftime("%Y-%m-%d")
    last_3dt = (datetime.datetime.strptime(str(pt), "%Y-%m-%d") + datetime.timedelta(days=-(3 - 1))).strftime("%Y-%m-%d")
    df = spark.read.parquet("/user/db_dmp/db_dmp.db/lx_chat_detail/") \
        .filter("pt>='{0}' and pt<='{1}'".format(last_7dt, pt)) \
        .select("uid", "fuid", "sourcetype", "chatHour", "pt") \
        .dropDuplicates()
    df.show(2)
    # Register a temp view so the data can also be queried with SQL
    df.createOrReplaceTempView("friend_source")
    df_final = df.groupBy("uid", "chatHour").agg(
        countDistinct(when((col("sourcetype") == 0) & (col("pt") == pt), col("fuid")).otherwise(None))
        .alias("search_1"),
        countDistinct(when((col("sourcetype") == 0) & (col("pt") >= last_3dt) & (col("pt") <= pt), col("fuid")).otherwise(None))
        .alias("search_3")
    ).select('uid', 'search_1', 'search_3', 'chatHour')
    # countDistinct(when(col("sourcetype") == "0", col("fuid")).otherwise(None)).alias("search")
Run: python read.py
2.2 Writing parquet files
def write_parquet(spark, pt):
    sql = """
        select uid
              ,count(distinct if(sourcetype='14' and pt='{0}', fuid, null)) as agree_nearby_1d_cnt
              ,count(distinct if(sourcetype='14' and pt>=date_sub('{0}', 2) and pt<='{0}', fuid, null)) as agree_nearby_3d_cnt
              ,count(distinct if(sourcetype='14' and pt>=date_sub('{0}', 6) and pt<='{0}', fuid, null)) as agree_nearby_7d_cnt
              ,count(distinct if(sourcetype='28' and pt='{0}', fuid, null)) as agree_kdy_1d_cnt
              ,count(distinct if(sourcetype='28' and pt>=date_sub('{0}', 2) and pt<='{0}', fuid, null)) as agree_kdy_3d_cnt
              ,count(distinct if(sourcetype='28' and pt>=date_sub('{0}', 6) and pt<='{0}', fuid, null)) as agree_kdy_7d_cnt
        from zz.zx_user_action_agree_d_incr
        where pt>=date_sub('{0}', 6) and pt<='{0}'
        group by uid
    """.format(pt)
    # Nothing executes here; Spark is lazy and only runs the job when an action is hit.
    df = spark.sql(sql)
    # Pack tags of the same type into one struct to avoid column explosion
    # (note: avoid field names that start with a digit).
    df = df.withColumn(
        "agree_frd",
        struct(
            struct(
                col("agree_nearby_1d_cnt").alias("nb_1d"),
                col("agree_nearby_3d_cnt").alias("nb_3d"),
                col("agree_nearby_7d_cnt").alias("nb_7d")
            ).alias("nearby"),
            struct(
                col("agree_kdy_1d_cnt").alias("kdy_1d"),
                col("agree_kdy_3d_cnt").alias("kdy_3d"),
                col("agree_kdy_7d_cnt").alias("kdy_7d")
            ).alias("kdy")
        )
    ).select("uid", "agree_frd")
    # Adjust the number of partitions to the data volume; as a rule of thumb keep each file under about 200 MB.
    df.repartition(35).write.mode("overwrite").parquet(
        '/user/db_dmp/db_dmp.db/lx_features/pt={}/group=agree_frd'.format(pt))
Field derivation with withColumn:
df = df.withColumn("chatTime", to_timestamp('request_time', "yyyy-MM-dd HH:mm:ss")) \
    .withColumn("chatHour", hour('chatTime')) \
    .withColumn("chatWeekDay", weekofyear('chatTime')) \
    .withColumn("chatDayOfWeek", date_format('chatTime', "u")) \
    .select("uid", "fuid", "chatHour", "chatDayOfWeek") \
    .dropDuplicates()
2.3 Writing into a Hive table
def parquet_to_hive(spark, pt):
    df = spark.read.parquet("hdfs://wxlx/user/zz/zz.db/bd_features_pivot_data/pt={}/*".format(pt))
    df.createOrReplaceTempView("features_temp")
    sql = """
        insert overwrite table db_dmp.lx_features_pivot partition(pt='{0}')
        select uid
              ,user_register_days
        from features_temp
    """.format(pt)
    spark.sql(sql)
Or write the DataFrame to a Hive table directly:
    df.write.format("hive").mode("overwrite").saveAsTable("db_dmp.features_01")
Note the difference: insert overwrite ... partition(pt='...') only replaces that day's partition, while saveAsTable with mode("overwrite") rebuilds the whole db_dmp.features_01 table.
2.4 Joining tables with join
def table_join(spark, pt):
    sql_a = """
        select uid
              ,datediff('{0}', registered_date) as user_register_days
        from zz.zx_user_information
        where pt = '{0}'
    """.format(pt)
    df_a = spark.sql(sql_a)
    df_a.createOrReplaceTempView("a_temp")
    sql_b = """
        select uid
              ,user_active_7d
        from features_temp
    """
    df_b = spark.sql(sql_b)
    df_b.createOrReplaceTempView("b_temp")
    sql = """
        select a.*
              ,b.user_active_7d
        from a_temp a
        left join b_temp b
          on a.uid = b.uid
    """
    df = spark.sql(sql)
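The same left join can also be written with the DataFrame API, without registering temp views (a sketch reusing df_a and df_b from above):

df = df_a.join(df_b, on="uid", how="left") \
    .select("uid", "user_register_days", "user_active_7d")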
2.5 Merging parquet groups
One advantage of reading parquet with Spark is schema merging: columns coming from different files can easily be combined into a single DataFrame. Note, however, that rows with the same key are not merged. In our table uid should be the unique key, but mergeSchema only merges the schema, not the content, so the rows still have to be combined by hand.
A classic trick is max aggregation, provided that uid is unique within each sub-partition and that column names do not repeat across sub-partitions, so the merged data forms a block-diagonal layout, as sketched below.
The snippet then merges the columns of the different groups under one parquet partition, de-duplicates by uid, and fills nulls with 0 (or 2 for categorical flags); the group and field names in the example are partly illustrative.
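For intuition, a schematic of what mergeSchema returns and what the max aggregation keeps (values made up):

Before groupBy: one row per uid per group, with nulls in the other groups' columns
uid     user_active.av_1d    agree_frd.nearby.nb_1d
1001            3                    null             <- row from group=user_active
1001          null                     5              <- row from group=agree_frd

After df.groupBy("uid").agg(max(...)): one row per uid, the non-null value survives
uid     user_active_1d       agree_nearby_1d_cnt
1001            3                      5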
from pyspark.sql.functions import max
from pyspark.sql.types import IntegerType

def group_union(spark, pt):
    # Columns whose nulls are filled with 0
    zeroPatchList = [
        "user_active_1d",
        "user_active_3d",
        "chat_friends_by_hour_total_23"]
    # Columns whose nulls are filled with 2
    twoPatchList = [
        "gender",
        "is_active"]
    # mergeSchema combines the columns coming from the different group files
    df = spark.read.option("mergeSchema", "true").parquet(
        "/user/db_dmp/db_dmp.db/lx_features/pt={}".format(pt))
    df_result = df.groupBy("uid").agg(
        max("user_active.av_1d").alias("user_active_1d"),
        max("user_active.av_3d").alias("user_active_3d"),
        max("user_active.av_7d").alias("user_active_7d"),
        # gender / is_active come from another group's struct (illustrative field paths)
        max("user_info.gender").alias("gender"),
        max("user_info.is_active").alias("is_active"),
        max("last_active_days").alias("last_active_days"),
        max("chat_hour.total.22").cast(IntegerType()).alias("chat_hour_total_22"),
        max("chat_friends_by_hour.kdy.23").alias("chat_friends_by_hour_kdy_23"),
        max("user_register_days").alias("user_register_days")
    ).na.fill(0, zeroPatchList).na.fill(2, twoPatchList)
Or:
def merge_parquet(spark, pt):
    parquet_path = "/user/db_dmp/db_dmp.db/lx_features/pt=" + pt + "/group={}"
    # uid universe: read all groups with mergeSchema and keep the distinct uids
    df_features = spark.read.option("mergeSchema", "true").parquet(
        "/user/db_dmp/db_dmp.db/lx_features/pt=" + pt)
    df_1 = df_features.select("uid").distinct()
    df_2 = spark.read.parquet(parquet_path.format("add_friend"))
    df_3 = spark.read.parquet(parquet_path.format("chat_with_friend"))
    df_friends = df_1 \
        .join(df_2, df_1['uid'] == df_2['uid'], "left") \
        .select(df_1['uid'], df_2['add_friend'])
    df_chat = df_friends \
        .join(df_3, df_friends['uid'] == df_3['uid'], "left") \
        .select(df_friends['uid'], df_friends['add_friend'], df_3['chat_with_friend'])
    df_chat.printSchema()
    df_chat.write.format("orc").mode("overwrite").saveAsTable("portrait.features")
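Putting the pieces together: in test.py, user_click and user_life_cycle (their bodies are not shown above; the wrapper below is an assumption) would each call one or more group writers such as write_parquet, and once both workers have finished, the driver merges the groups and loads Hive. A rough sketch:

def user_click(spark, pt):
    # one module = one or more group writers, each landing under .../pt=<pt>/group=<name>
    write_parquet(spark, pt)

# after p1 / p2 (or the threads) have been joined:
group_union(spark, date)        # or merge_parquet(spark, date)
parquet_to_hive(spark, date)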