关于开窗函数和炸裂函数的一些思考

题目主体

公司代码,年度,1月-------------------------12月的收入金额
burk,year,tsl01,tsl02,tsl03,tsl04,tsl05,tsl06,tsl07,tsl08,tsl09,tsl10,tsl11,tsl12
853101,2010,100200,25002,19440,20550,14990,17227,40990,28778,19088,29889,10990,20990
853101,2011,19446,20556,14996,17233,40996,28784,19094,28779,19089,29890,10991,20991
853101,2012,19447,20557,14997,17234,20560,15000,17237,28780,19090,29891,10992,20992
853101,2013,20560,15000,17237,41000,17234,20560,15000,17237,41000,29892,10993,20993
853101,2014,19449,20559,14999,17236,41000,28788,28786,19096,29897,41000,28788,20994
853101,2015,100205,25007,19445,20555,17236,40999,28787,19097,29898,29894,10995,20995
853101,2016,100206,25008,19446,20556,17237,41000,28788,19098,29899,29895,10996,20996
853101,2017,100207,25009,17234,20560,15000,17237,41000,15000,17237,41000,28788,20997
853101,2018,100208,25010,41000,28788,28786,19096,29897,28786,19096,29897,10998,20998
853101,2019,100209,25011,17236,40999,28787,19097,29898,28787,19097,29898,10999,20999
846271,2010,100210,25012,17237,41000,28788,19098,29899,28788,19098,29899,11000,21000
846271,2011,100211,25013,19451,20561,15001,17238,41001,28789,19099,29900,11001,21001
846271,2012,100212,100213,20190,6484,46495,86506,126518,166529,206540,246551,286562,326573
846271,2013,100213,100214,21297,5008,44466,83924,123382,162839,202297,241755,281213,320671
846271,2014,100214,100215,22405,3531,42436,81341,120245,159150,198055,236959,275864,314769
846271,2015,100215,100216,23512,2055,19096,29897,28786,19096,29897,41000,29892,308866
846271,2016,100216,100217,24620,579,38377,76175,28788,28786,19096,29897,41000,302964
846271,2017,100217,100218,25727,898,36347,73592,40999,28787,19097,29898,29894,297062
846271,2018,100218,100219,26835,2374,34318,71009,41000,28788,19098,29899,29895,291159
846271,2019,100219,100220,27942,3850,32288,68427,17237,41000,15000,17237,41000,285257

1、统计每个公司每年按月累计收入 行转列 --> sum窗口函数
输出结果
公司代码,年度,月份,当月收入,累计收入

2、统计每个公司当月比上年同期增长率 行转列 --> lag窗口函数
公司代码,年度,月度,增长率(当月收入/上年当月收入 - 1)

建表语句

create table if not exists homework0523(
    burk string COMMENT '公司代码',
    year bigint COMMENT '年份',
    tsl01 bigint COMMENT '1月',
    tsl02 bigint COMMENT '2月',
    tsl03 bigint COMMENT '3月',
    tsl04 bigint COMMENT '4月',
    tsl05 bigint COMMENT '5月',
    tsl06 bigint COMMENT '6月',
    tsl07 bigint COMMENT '7月',
    tsl08 bigint COMMENT '8月',
    tsl09 bigint COMMENT '9月',
    tsl10 bigint COMMENT '10月',
    tsl11 bigint COMMENT '11月',
    tsl12 bigint COMMENT '12月'
)
row format delimited fields terminated by ','
stored as textfile;

insert overwrite table homework0523
values ("853101",2010,100200,25002,19440,20550,14990,17227,40990,28778,19088,29889,10990,20990)
,("853101",2011,19446,20556,14996,17233,40996,28784,19094,28779,19089,29890,10991,20991)
,("853101",2012,19447,20557,14997,17234,20560,15000,17237,28780,19090,29891,10992,20992)
,("853101",2013,20560,15000,17237,41000,17234,20560,15000,17237,41000,29892,10993,20993)
,("853101",2014,19449,20559,14999,17236,41000,28788,28786,19096,29897,41000,28788,20994)
,("853101",2015,100205,25007,19445,20555,17236,40999,28787,19097,29898,29894,10995,20995)
,("853101",2016,100206,25008,19446,20556,17237,41000,28788,19098,29899,29895,10996,20996)
,("853101",2017,100207,25009,17234,20560,15000,17237,41000,15000,17237,41000,28788,20997)
,("853101",2018,100208,25010,41000,28788,28786,19096,29897,28786,19096,29897,10998,20998)
,("853101",2019,100209,25011,17236,40999,28787,19097,29898,28787,19097,29898,10999,20999)
,("846271",2010,100210,25012,17237,41000,28788,19098,29899,28788,19098,29899,11000,21000)
,("846271",2011,100211,25013,19451,20561,15001,17238,41001,28789,19099,29900,11001,21001)
,("846271",2012,100212,100213,20190,6484,46495,86506,126518,166529,206540,246551,286562,326573)
,("846271",2013,100213,100214,21297,5008,44466,83924,123382,162839,202297,241755,281213,320671)
,("846271",2014,100214,100215,22405,3531,42436,81341,120245,159150,198055,236959,275864,314769)
,("846271",2015,100215,100216,23512,2055,19096,29897,28786,19096,29897,41000,29892,308866)
,("846271",2016,100216,100217,24620,579,38377,76175,28788,28786,19096,29897,41000,302964)
,("846271",2017,100217,100218,25727,898,36347,73592,40999,28787,19097,29898,29894,297062)
,("846271",2018,100218,100219,26835,2374,34318,71009,41000,28788,19098,29899,29895,291159)
,("846271",2019,100219,100220,27942,3850,32288,68427,17237,41000,15000,17237,41000,285257)

相关实现

select
    burk
    ,year
    ,yf
    ,sr
    ,sum(sr) over(partition by (burk,year) order by yf rows between unbounded preceding and current row) amount
from
    (select
    burk
    ,year
    ,k1 as yf
    ,v1 as sr
from homework0523
    lateral view  explode(map(1,tsl01,2,tsl02,3,tsl03,4,tsl04,5,tsl05,6,tsl06,7,tsl07,8,tsl08,9,tsl09,10,tsl10,11,tsl11,12,tsl12))  t1 as k1,v1) t11
;
-- 未解决第一年 也不知道是不是应该遇见2010是否就抛出null
select
    burk
    , year
    , yf
    , (sr / lag(sr,1,"9999") over(partition by (burk,yf) order by year)) - 1
from (select
    burk
    ,year
    ,k1 as yf
    ,v1 as sr
from homework0523
    lateral view  explode(map(1,tsl01,2,tsl02,3,tsl03,4,tsl04,5,tsl05,6,tsl06,7,tsl07,8,tsl08,9,tsl09,10,tsl10,11,tsl11,12,tsl12))  t1 as k1,v1) t11
;

关于题目的思考

更贴切的来说,炸裂函数其实就是行转列,不知道有没有列转行,但我猜测应该是有的。
构建侧视图语法

lateral  view explode(array/map...(对应语法即可)) 标识别名 as (生成多少列就可以取多少列col1[,col2,col3] 多个的话,记得用括号包含一下)

如何构建一个合适的窗口函数,或者说如何写出一段优美的SQL语句呢?

扩展题:查询首次下单后第二天连续下单的用户比率(来源尚硅谷hive习题)
抛开答案不谈,只看题目,很容易想到如下逻辑
第一步,查出连续下单的用户
第二步,查出下单的用户
第三步,处理第一步和第二步的结果

尽管思路大体不差,但写出的SQL却差的很多,如何优雅的选出最合适的SQL
步骤再细化
1、去重,牢记谓词下推原则,过滤数据,不仅仅减少处理时间,也可以简化思路,目的消除同一天用户多次下单,确保清洗结束的数据,相对而言,用户就一天一单。
2、查询连续,如何筛选出想要的连续用户,又算出用户总量,得看开窗开的好不好,尚硅谷交了我三种查询连续,lead查,开窗行查,开窗列查。
利用rank()排序,只保留用户前两单信息(相对前两单),这样就保留了所有用户,又过滤了很多数据,在最终端,直接将二者相减即可算出最后结果。
利用lead抽取,直接筛选出连续下单的用户,只要差值有1,直接抽出用户,也能写出来,但多了一步join。
如何写出rank那一种方法的优雅嘞,虽然lead也算的出来,但总觉得rank写的diao一点。
插个眼,有机会再来补充吧。

-- 代码没有写完,只写了一个大体思路,下单用户的总数是查询抽取的
select
--     count(distinct user_id)
--     ,total
    *
from
(select
    user_id
    ,datediff(lead(create_date,1,"9999-00-00") over (partition by user_id),create_date) rk
from order_info
group by user_id, create_date) t1
join
(select count(distinct user_id) as total
from order_info) t2
on (1=1)
where rk = 1;
select concat(round(sum(if(datediff(buy_date_second, buy_date_first) = 1, 1, 0)) / count(*) * 100, 1), '%') percentage
from (
         select user_id,
                min(create_date) buy_date_first,
                max(create_date) buy_date_second
         from (
                  select user_id,
                         create_date,
                         rank() over (partition by user_id order by create_date) rk
                  from (
                           select user_id,
                                  create_date
                           from order_info
                           group by user_id, create_date
                       ) t1
              ) t2
         where rk <= 2
         group by user_id
     ) t3;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值