----题目
一.蚂蚁森林植物申领统计
问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,剩余的能量全部用来领取“p002-沙柳” 。统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息;以及他比后一名多领了几颗沙柳。
得到的统计结果如下表样式:
user_id plant_count less_count(比后一名多领了几颗沙柳)
u_101 1000 100
u_088 900 400
u_103 500 …
1.创建表
create table user_low_carbon(user_id String,data_dt String,low_carbon int) row format delimited fields terminated by '\t'; create table plant_carbon(plant_id string,plant_name String,low_carbon int) row format delimited fields terminated by '\t';
2.加载数据
load data local inpath "/opt/module/hive/datas/user_low_carbon.txt" into table user_low_carbon; load data local inpath "/opt/module/hive/datas/plant_carbon.txt" into table plant_carbon;
3.--转化时间字段
select user_id, regexp_replace(data_dt,'/','-'), low_carbon from user_low_carbon where regexp_replace(data_dt,'/','-') <'2017-10-01' t1
4.-- 求能量总和
select t1.user_id, sum(t1.low_carbon)-(select low_carbon from plant_carbon where plant_name='胡杨') low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-'), low_carbon from user_low_carbon where regexp_replace(data_dt,'/','-') <'2017-10-01' )t1 group by t1.user_id t2
5.--求出比后一名多领了几棵
select t2.user_id, t2.low_carbon_sum from( select t1.userId, sum(t1.low_carbon)-(select low_carbon from plant_carbon where plant_name='胡杨') from( select user_id, regexp_replace(data_dt,'/','-') low_carbon from user_id_carbon where regexp_rplace(data_dt,'/','-')<'2017-10-1' ) t1 group by t1.user_id ) t2 ) where t2.low_carbon_sum>0 order by t2.low_carbon_sum desc limit 11 ) t3
6.答案
select t4.user_id, t4.plant_count, t4.plant_count-lead(t4.plant_count,1,0) over(order by t4.plant_count desc) less_count from ( select t3.user_id, floor(t3.low_carbon_sum/(select low_carbon from plant_carbon where plant_name='沙柳')) plant_count from( select t2.user_id, t2.low_carbon_sum from ( select t1.user_id, sum(t1.low_carbon)-(select low_carbon from plant_carbon where plant_name='胡杨') low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-'), low_carbon from user_low_carbon where regexp_replace(data_dt,'/','-') <'2017-10-01' )t1 group by t1.user_id )t2 where t2.low_carbon_sum>0 order by t2.low_carbon_sum desc limit 11 ) t3 )t4
结果:
| t4.user_id | t4.plant_count | less_count | +-------------+-----------------+-------------+ | u_007 | 66 | 3 | | u_013 | 63 | 10 | | u_008 | 53 | 7 | | u_005 | 46 | 1 | | u_010 | 45 | 1 | | u_014 | 44 | 5 | | u_011 | 39 | 2 | | u_009 | 37 | 5 | | u_006 | 32 | 9 | | u_002 | 23 | 1 | | u_004 | 22 | 22 | +-------------+-----------------+-------------+
二、蚂蚁森林低碳用户排名分析
问题:查询user_low_carbon表中每日流水记录,条件为:
用户在2017年,连续三天(或以上)的天数里,每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。
例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天的碳排放量之和都大于等于100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx14 u_002 2017/1/5 101
备注:统计方法不限于sql、procedure、python,java等
方法一:
--1.查找到在2017年的用户 select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') t1 --2.筛选2017年的用户里碳排放超过100g的用户 select t1.user_id, t1.data_dt, t1.low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 t2 2021-2-1 2021-2-2 2021-2-3 2021-2-2 2021-2-3 2021-2-4 2021-2-3 2021-2-4 2021-2-5 --3.通过lag和lead开窗函数筛选第三天和第二天碳排放达标的用户,分为今天,昨天,前天,明天,后天 select t2.user_id, t2.low_carbon_sum, lag(t2.data_dt,2,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) qt, lag(t2.data_dt,1,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) zt, t2.data_dt, lead(t2.data_dt,1,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) mt, lead(t2.data_dt,2,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) ht from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 t3 --4.判断 天数是否正确,今天-前天,今天-昨天,今天-明天,今天-后天这四个的低碳能量是否大于100g select t3.user_id, t3.data_dt, t3.low_carbon_sum, datediff(t3.data_dt,t3.qt) jt_qt, datediff(t3.data_dt,t3.zt) jt_zt, datediff(t3.data_dt,t3.mt) jt_mt, datediff(t3.data_dt,t3.ht) jt_ht from ( select t2.user_id, t2.low_carbon_sum, lag(t2.data_dt,2,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) qt, lag(t2.data_dt,1,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) zt, t2.data_dt, lead(t2.data_dt,1,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) mt, lead(t2.data_dt,2,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) ht from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 )t3 t4 --5.查询减少碳排放大于等于两天的用户 select t4.user_id, t4.data_dt, t4.low_carbon_sum from ( select t3.user_id, t3.data_dt, t3.low_carbon_sum, datediff(t3.data_dt,t3.qt) jt_qt, datediff(t3.data_dt,t3.zt) jt_zt, datediff(t3.data_dt,t3.mt) jt_mt, datediff(t3.data_dt,t3.ht) jt_ht from ( select t2.user_id, t2.low_carbon_sum, lag(t2.data_dt,2,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) qt, lag(t2.data_dt,1,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) zt, t2.data_dt, lead(t2.data_dt,1,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) mt, lead(t2.data_dt,2,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) ht from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 )t3 )t4 where (t4.jt_qt =2 and t4.jt_zt=1)or(t4.jt_zt =1 and t4.jt_mt=-1)or(t4.jt_mt =-1 and t4.jt_ht=-2) t5 --6.与(user_low_carbon)相连接 select t6.user_id, t6.data_dt, t6.low_carbon from( select t4.user_id, t4.data_dt, t4.low_carbon_sum from ( select t3.user_id, t3.data_dt, t3.low_carbon_sum, datediff(t3.data_dt,t3.qt) jt_qt, datediff(t3.data_dt,t3.zt) jt_zt, datediff(t3.data_dt,t3.mt) jt_mt, datediff(t3.data_dt,t3.ht) jt_ht from ( select t2.user_id, t2.low_carbon_sum, lag(t2.data_dt,2,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) qt, lag(t2.data_dt,1,'0000-00-00')over(partition by t2.user_id order by t2.data_dt) zt, t2.data_dt, lead(t2.data_dt,1,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) mt, lead(t2.data_dt,2,'9999-99-99')over(partition by t2.user_id order by t2.data_dt) ht from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 )t3 )t4 where (t4.jt_qt =2 and t4.jt_zt=1)or(t4.jt_zt =1 and t4.jt_mt=-1)or(t4.jt_mt =-1 and t4.jt_ht=-2) ) t5 join ( select user_id, regexp_replace(data_dt,'/','-') data_dt, low_carbon from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' )t6 on t5.user_id=t6.user_id and t5.data_dt=t6.data_dt
结果:
+-------------+-------------+----------------+ | t6.user_id | t6.data_dt | t6.low_carbon | +-------------+-------------+----------------+ | u_002 | 2017-1-2 | 150 | | u_002 | 2017-1-2 | 70 | | u_002 | 2017-1-3 | 30 | | u_002 | 2017-1-3 | 80 | | u_002 | 2017-1-4 | 150 | | u_002 | 2017-1-5 | 101 | | u_005 | 2017-1-2 | 50 | | u_005 | 2017-1-2 | 80 | | u_005 | 2017-1-3 | 180 | | u_005 | 2017-1-4 | 180 | | u_005 | 2017-1-4 | 10 | | u_008 | 2017-1-4 | 260 | | u_008 | 2017-1-5 | 360 | | u_008 | 2017-1-6 | 160 | | u_008 | 2017-1-7 | 60 | | u_008 | 2017-1-7 | 60 | | u_009 | 2017-1-2 | 70 | | u_009 | 2017-1-2 | 70 | | u_009 | 2017-1-3 | 170 | | u_009 | 2017-1-4 | 270 | | u_010 | 2017-1-4 | 90 | | u_010 | 2017-1-4 | 80 | | u_010 | 2017-1-5 | 90 | | u_010 | 2017-1-5 | 90 | | u_010 | 2017-1-6 | 190 | | u_010 | 2017-1-7 | 90 | | u_010 | 2017-1-7 | 90 | | u_011 | 2017-1-1 | 110 | | u_011 | 2017-1-2 | 100 | | u_011 | 2017-1-2 | 100 | | u_011 | 2017-1-3 | 120 | | u_011 | 2017-1-4 | 100 | | u_011 | 2017-1-5 | 100 | | u_011 | 2017-1-6 | 100 | | u_011 | 2017-1-7 | 130 | | u_011 | 2017-1-7 | 100 | | u_013 | 2017-1-2 | 150 | | u_013 | 2017-1-2 | 50 | | u_013 | 2017-1-3 | 150 | | u_013 | 2017-1-4 | 550 | | u_013 | 2017-1-5 | 350 | | u_014 | 2017-1-5 | 250 | | u_014 | 2017-1-6 | 120 | | u_014 | 2017-1-7 | 270 | | u_014 | 2017-1-7 | 20 | +-------------+-------------+----------------+
方法二:
select t1.user_id, t1.data_dt, t1.low_carbon_sum, rank()over(partition by t1.user_id order by t1.data_dt) rk from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 t2 --等差数列less_dt select t2.user_id, t2.data_dt, t2.low_carbon_sum, date_sub(t2.data_dt,t2.rk) less_dt from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum, rank()over(partition by t1.user_id order by t1.data_dt) rk from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 t3 select t3.user_id, t3.data_dt, t3.low_carbon_sum, count(*)over(partition by t3.user_id,t3.less_dt) c_dt from ( select t2.user_id, t2.data_dt, t2.low_carbon_sum, date_sub(t2.data_dt,t2.rk) less_dt from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum, rank()over(partition by t1.user_id order by t1.data_dt) rk from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 )t3 t4 select t4.user_id, t4.data_dt, t4.low_carbon_sum from ( select t3.user_id, t3.data_dt, t3.low_carbon_sum, count(*)over(partition by t3.user_id,t3.less_dt) c_dt from ( select t2.user_id, t2.data_dt, t2.low_carbon_sum, date_sub(t2.data_dt,t2.rk) less_dt from ( select t1.user_id, t1.data_dt, t1.low_carbon_sum, rank()over(partition by t1.user_id order by t1.data_dt) rk from ( select user_id, regexp_replace(data_dt,'/','-') data_dt, sum(low_carbon) low_carbon_sum from user_low_carbon where year(regexp_replace(data_dt,'/','-')) ='2017' group by user_id,regexp_replace(data_dt,'/','-') )t1 where t1.low_carbon_sum>=100 )t2 )t3 )t4 where c_dt>=3
提供的数据说明:
user_low_carbon:
u_001 2017/1/1 10
u_001 2017/1/2 150
u_001 2017/1/2 110
u_001 2017/1/2 10
u_001 2017/1/4 50
u_001 2017/1/4 10
u_001 2017/1/6 45
u_001 2017/1/6 90
u_002 2017/1/1 10
u_002 2017/1/2 150
u_002 2017/1/2 70
u_002 2017/1/3 30
u_002 2017/1/3 80
u_002 2017/1/4 150
u_002 2017/1/5 101
u_002 2017/1/6 68
...
plant_carbon:
p001 梭梭树 17
p002 沙柳 19
p003 樟子树 146
p004 胡杨 215