关于PySpark API的详细信息可以参考: pyspark.sql.DataFrame.createOrReplaceTempView — PySpark 3.2.1 documentation
下面用在Databricks Notebook中的一些操作场景及实现作为例子:
1. Join操作
- join多个stream并Partition输出
# Notes on the PySpark API used below:
#   - event.request is a Delta table queried via Spark SQL.
#   - to_timestamp() converts a string to a timestamp column.
#   - size(col) returns the number of elements in an array column.
#   - partitionBy accepts multiple columns: partitionBy(["publisher_id", "et"])
#   - select all columns of a joined df: df_opp["*"]

# Opportunity/request events for one day (2022-03-23), with a per-hour
# bucket column `et` for downstream partitioned output.
# NOTE(review): the dt >= '2022-03-23-00' predicate is the partition-pruning
# filter; event_datetime bounds the actual event window.
df_opp = spark.sql("select request_id, publisher_id, event_datetime, date_format(event_datetime, 'yyyy-MM-dd-HH') as et, count, query, slot_id, request_target_values, request_target_value_counts, size(request_target_values) as req_target_val_cnt, brand_counts, size(brand_counts) as brand_cnt from event.request where dt >= '2022-03-23-00' and event_datetime >= to_timestamp('2022-03-23 00:00:00') and event_datetime < to_timestamp('2022-03-24 00:00:00')")
print(df_opp.count())


def _read_merged_parquet(path):
    """Read a parquet dataset with schema merging enabled.

    mergeSchema=true unions the schemas of all part files, so columns
    added over time are all present in the resulting DataFrame.
    """
    return spark.read.option("mergeSchema", "true").parquet(path)


# Downstream event streams; count each eagerly so the notebook shows
# row volumes before the join (keeps the original read/print order).
df_defimp = _read_merged_parquet("/mnt/datalake/test_tom/deferred_impression")
print(df_defimp.count())
df_imp = _read_merged_parquet("/mnt/datalake/test_tom/impression")
print(df_imp.count())
df_click = _read_merged_parquet("/mnt/datalake/test_tom/click")
print(df_click.count())
df_join = df_opp.join(df_defimp, df_opp.request_id == df_