1. What is Spark?
Spark is a distributed engine for large-scale data processing that comes up constantly in day-to-day work; here it is used from Python through PySpark, with SQL queries and DataFrame operations submitted via a SparkSession.
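The code below assumes a SparkSession named spark already exists, as it does in a PySpark shell or notebook. If you run it as a standalone script, a minimal sketch for creating one might look like the following; the application name is just a placeholder:

from pyspark.sql import SparkSession

# Minimal sketch: obtain (or reuse) a SparkSession with Hive support,
# since the queries below read Hive tables such as temp.exptable.
spark = (SparkSession.builder
         .appName("exp_order_report")   # placeholder app name
         .enableHiveSupport()
         .getOrCreate())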
2. Usage steps
The code is as follows:
from datetime import date, timedelta
import pyspark.sql.functions as fn  # used below for countDistinct / sum aggregations

# Yesterday as a yyyymmdd string, plus its yyyymm month prefix.
dateday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
month = dateday[0:6]
print(dateday, month)  # e.g. 20231030 202310

# Look-back anchor dates: 1, 7, 14 and 30 days before today, as yyyymmdd strings.
day1 = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
day7 = (date.today() - timedelta(days=7)).strftime("%Y%m%d")
day14 = (date.today() - timedelta(days=14)).strftime("%Y%m%d")
day30 = (date.today() - timedelta(days=30)).strftime("%Y%m%d")
print(day1, day7, day14, day30)  # e.g. 20231030 20231024 20231017 20231001
def exp_order_func(date1, date7):
    # Exposed customers in the dt = date7 partition, tagged with the report date date1.
    exp_data = spark.sql(f"""select distinct cust_id, '{date1}' as dt1
                             from temp.exptable
                             where dt = '{date7}'""")
    # Per-customer order amount with order_date (normalised to yyyymmdd) between date7 and date1.
    gmv_data = spark.sql(f"""select ecif_no as cust_id, sum(order_amt) as order_amt
                             from temp.ordertable
                             where dt = '{date7}'
                               and regexp_replace(substr(order_date, 1, 10), '-', '') >= '{date7}'
                               and regexp_replace(substr(order_date, 1, 10), '-', '') <= '{date1}'
                             group by ecif_no""")
    # Approach 1: join and aggregate in one chained expression.
    exp_gmv_data = (exp_data.join(gmv_data, on=['cust_id'], how='inner')
                    .groupby("dt1")
                    .agg(fn.countDistinct("cust_id"), fn.sum("order_amt"))
                    .toDF("dt1", "exp_gmv_uv", "exp_gmv_amt"))
    # Approach 2: the same aggregation split into two steps (computed here only for comparison).
    exp_gmv2_data = exp_data.join(gmv_data, on=['cust_id'], how='inner')
    exp_gmv2_data_result = (exp_gmv2_data.groupby("dt1")
                            .agg(fn.countDistinct("cust_id"), fn.sum("order_amt"))
                            .toDF("dt1", "exp_gmv_uv", "exp_gmv_amt"))
    return exp_gmv_data
# 7 / 14 / 30-day look-back reports, renamed so the metric columns do not collide on join.
exp_order7 = exp_order_func(day1, day7).toDF("dt1", "exp_gmv_uv7", "exp_gmv_amt7")
exp_order14 = exp_order_func(day1, day14).toDF("dt1", "exp_gmv_uv14", "exp_gmv_amt14")
exp_order30 = exp_order_func(day1, day30).toDF("dt1", "exp_gmv_uv30", "exp_gmv_amt30")
# Combine the three windows into one row per report date.
result = exp_order7.join(exp_order14, on=['dt1'], how='left').join(exp_order30, on=['dt1'], how='left')
result.show(3)
3. The result looks like this:
Each row carries the report date (dt1) together with the distinct exposed-and-ordering customer count (UV) and the summed order amount (GMV) for the 7-, 14- and 30-day windows.
# dt1       exp_gmv_uv7  exp_gmv_amt7  exp_gmv_uv14  exp_gmv_amt14  exp_gmv_uv30  exp_gmv_amt30
# 20231030  5            100           25            500            30            600