目录
一、统计出每个月,每个用户的累积访问次数
select
userid,visitdate,
sum_count,sum(sum_count) over(partition by userid order by visitdate)
from
(select userid,visitdate,sum(visitcount) sum_count
from
(select userid,
date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') visitdate,
visitcount
from action) t1
group by
userid,
visitdate) t2;
解析:最里面select查询转换时间格式,第二个查询实现每个userid,每个月的visitcount求和,最外层窗口函数实现累加。
二、店铺的UV、TopN统计
有50W个京东店铺,每个顾客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
。。。。。。
建表:
create table visit(user_id string,shop string) row format delimited fields terminated by '\t';
功能需求:
1、每个店铺的UV(访客数)
方式一distinct(不推荐):
select shop,count(distinct(user_id)) uv from visit group by shop;
可以实现功能,但是distinct是针对整个表去重,当数据量很大的时候,reducer的任务太重。
方式二:
select shop,count(*)
from
(select shop,user_id
from visit
group by shop,user_id) t1
group by shop;
里面的group by 两个字段实现去重
2、每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
select shop,user_id,ct
from
(select *,row_number() over(partition by shop order by ct desc) rk
from
(select shop,user_id,count(user_id) ct
from visit
group by shop,user_id)t1)t2
where rk<=3;
注意点:1)having必须与group by一同使用,所以having rk<=3 错误
2)where rk <=3,row_number是虚拟列,所以作为子循环再嵌套一层查询
最终结果:
a u5 3
a u1 3
a u2 2
b u4 2
b u1 2
b u5 1
c u2 2
c u6 1
c u3 1
如果并列成绩超过3,根据业务需求考虑用rank()函数
三、蚂蚁森林
背景说明:
以下表记录了用户每天的蚂蚁森林低碳生活领取的记录流水。
用户 日期 减少碳排放(g)
user_low_carbon.user_id user_low_carbon.data_dt user_low_carbon.low_carbon
u_001 2017/1/1 10
u_001 2017/1/2 150
u_001 2017/1/2 110
u_001 2017/1/2 10
u_001 2017/1/4 50
u_001 2017/1/4 10
。。。。。。
蚂蚁森林植物换购表,用于记录申领环保植物所需要减少的碳排放量
植物编号 植物名 换购植物所需要的碳
plant_carbon.plant_id plant_carbon.plant_name plant_carbon.low_carbon
p001 梭梭树 17
p002 沙柳 19
p003 樟子树 146
p004 胡杨 215
需求一:蚂蚁森林植物申领统计
问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,剩余的能量全部用来领取“p002-沙柳” 。
统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息;以及他比后一名多领了几颗沙柳。
得到的统计结果如下表样式:
t5.user_id t5.num_count less_count(比后一名多领了几颗沙柳)
u_007 66 3
u_013 63 10
u_008 53 7
u_005 46 1
u_010 45 1
u_014 44 5
u_011 39 2
u_009 37 5
u_006 32 9
u_002 23 1
准备阶段:
1.创建表
create table user_low_carbon(user_id String,data_dt String,low_carbon int) row format delimited fields terminated by '\t';
create table plant_carbon(plant_id string,plant_name String,low_carbon int) row format delimited fields terminated by '\t';
2.加载数据
load data local inpath "/root/data/hive/user_low_carbon.txt" into table user_low_carbon;
load data local inpath "/root/data/hive/plant_carbon.txt" into table plant_carbon;
3.设置本地模式
set hive.exec.mode.local.auto=true;
分析实战:1.统计每个用户截止到2017/10/1日期总低碳量,统计前10以及比后一名多领了几颗沙柳,所以要11个数据
select
user_id,
sum(low_carbon) sum_low_carbon
from
user_low_carbon
where
date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM')<'2017-10'
group by
user_id
order by
sum_low_carbon desc
limit 11;t1
#字符串也可以比较,注意格式
select
user_id,sum(low_carbon) sum_low_carbon
from user_low_carbon
where
data_dt<'2017/10'
group by user_id
order by
sum_low_carbon desc
limit 11;t1
2.取出胡杨的能量
select low_carbon from plant_carbon where plant_id='p004';t2
3.取出沙柳的能量
select low_carbon from plant_carbon where plant_id='p002';t3
4.计算每个人申领沙柳的棵数
select
t1.user_id,
floor((t1.sum_low_carbon-t2.low_carbon)/t3.low_carbon) num_count
from
t1,t2,t3
5.按照申领沙柳棵数排序,并将下一行数据中的plant_count放置当前行
select user_id,num_count,
lead(num_count,1,'9999') over(order by num_count desc) next_num_count
from t4
limit 10;
6.求相差的沙柳棵数
select
user_id,
plant_count,
(num_count-next_num_count) less_count
from
t5;
#详细语句
select t5.user_id,t5.num_count,(t5.num_count-t5.next_num_count) less_count
from
(select user_id,num_count,
lead(num_count,1,'9999') over(order by num_count desc) next_num_count
from
(select
t1.user_id,
floor((t1.sum_low_carbon-t2.low_carbon)/t3.low_carbon) num_count
from
(select
user_id,sum(low_carbon) sum_low_carbon
from user_low_carbon
where
date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM')<'2017-10'
group by user_id
order by sum_low_carbon desc
limit 11)t1,
(select low_carbon from plant_carbon where plant_id='p004')t2,
(select low_carbon from plant_carbon where plant_id='p002')t3)t4
order by num_count desc
limit 10)t5;
需求二:蚂蚁森林低碳用户排名分析
问题:查询user_low_carbon表中每日流水记录,条件为:用户在2017年,连续三天(或以上)的天数里,每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
实现方法一:
1.过滤出2017年且单日低碳量超过100g
select user_id,regexp_replace(data_dt,'/','-') data_dt
from user_low_carbon
where
date_format(regexp_replace(data_dt,'/','-'),'yyyy') = 2017
group by user_id,data_dt
having sum(low_carbon)>=100;tb1
2.将前两行数据以及后两行数据的日期放置当前行
分析:分析连续三天,则需要判断当前行与前两行、后两行数据是否满足需求
select user_id,data_dt,
lag(data_dt,2,'1997-1-1') over(partition by user_id order by data_dt) lag2,
lag(data_dt,1,'1997-1-1') over(partition by user_id order by data_dt) lag1,
lead(data_dt,1,'1997-1-1') over(partition by user_id order by data_dt) lead1,
lead(data_dt,2,'1997-1-1') over(partition by user_id order by data_dt) lead2
from tb1;
3.计算当前日期跟前后两行时间的差值
select
user_id,
data_dt,
datediff(data_dt,lag2) lag2_diff,
datediff(data_dt,lag1) lag1_diff,
datediff(data_dt,lead1) lead1_diff,
datediff(data_dt,lead2) lead2_diff
from
t2;t3
4.过滤出连续3天超过100g的用户
select
user_id,
data_dt
from
t3
where
(lag2_diff=2 and lag1_diff=1)
or
(lag1_diff=1 and lead1_diff=-1)
or
(lead1_diff=-1 and lead2_diff=-2);t4
5.关联表tb1,显示row_number(),sum_low_carbon
select tb1.user_id,tb1.data_dt,tb1.sum_low_carbon,
row_number() over(partition by tb1.user_id order by tb1.data_dt) rownum
from tb1
join tb4
on tb1.user_id = tb4.user_id and tb1.data_dt=tb4.data_dt;
#详细
select tb1.user_id,tb1.data_dt,tb1.sum_low_carbon,
row_number() over(partition by tb1.user_id order by tb1.data_dt) rownum
from
(select user_id,regexp_replace(data_dt,'/','-') data_dt,sum(low_carbon) sum_low_carbon
from user_low_carbon
where
date_format(regexp_replace(data_dt,'/','-'),'yyyy') = '2017'
group by user_id,data_dt
having sum_low_carbon>=100)tb1
join
(select user_id,data_dt
from
(select user_id,data_dt,
datediff(data_dt,lag2) lag2_diff,
datediff(data_dt,lag1) lag1_diff,
datediff(data_dt,lead1) lead1_diff,
datediff(data_dt,lead2) lead2_diff
from
(select user_id,data_dt,
lag(data_dt,2,'1970-01-01') over(partition by user_id order by data_dt) lag2,
lag(data_dt,1,'1970-01-01') over(partition by user_id order by data_dt) lag1,
lead(data_dt,1,'1970-01-01') over(partition by user_id order by data_dt) lead1,
lead(data_dt,2,'1970-01-01') over(partition by user_id order by data_dt) lead2
from
(select user_id,regexp_replace(data_dt,'/','-') data_dt
from user_low_carbon
where
date_format(regexp_replace(data_dt,'/','-'),'yyyy') = 2017
group by user_id,data_dt
having sum(low_carbon)>=100) tb1)tb2)tb3
where
(lag2_diff=2 and lag1_diff=1) or
(lag1_diff=1 and lead1_diff=-1) or
(lead1_diff=-1 and lead2_diff=-2))tb4
on
tb1.user_id = tb4.user_id and tb1.data_dt=tb4.data_dt;
展示效果:
tb1.user_id tb1.data_dt tb1.sum_low_carbon rownum
u_002 2017-1-2 220 1
u_002 2017-1-3 110 2
u_002 2017-1-4 150 3
u_002 2017-1-5 101 4
u_005 2017-1-2 130 1
u_005 2017-1-3 180 2
u_005 2017-1-4 190 3
u_008 2017-1-4 260 1
u_008 2017-1-5 360 2
u_008 2017-1-6 160 3
u_008 2017-1-7 120 4
......
实现方式二:
方式一存在一个问题,如果题设是要求连续10天、20天甚至更多,那么就需要前后9行的日期放在当前行,且weher后判断条件更为复杂。
方式二采用数学等差数列思想:
date rank 差值
2020-1-2 1 1-1
2020-1-3 2 1-1
2020-1-4 3 1-1
2020-1-5 4 1-1
......
根据user_id分组后排列rank值(这里不存在重复user_id,三个函数任选其一),日期-rank所得值一样则代表日期连续
1、求出2017年超过100g的用户&时间&总数&rank
select user_id,date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt,sum(low_carbon) sum_carbon,
rank() over(partition by user_id order by data_dt) rank_number
from user_low_carbon
where substring(data_dt,1,4)="2017"
group by user_id,data_dt
having sum_carbon>=100;tb1
2、求出时间与rank之间的差值
select user_id,data_dt,sum_carbon,
date_sub(data_dt,rank_number) sub
from tb1;
3、求出连续3天及以上的数据
select user_id,data_dt,
count(*) over(partition by user_id,sub) threeDays
from t2;t3
select user_id,data_dt,sub
from t3
where threeDays>=3;
#详细
select user_id,data_dt,threedays
from
(select user_id,data_dt,
count(*) over(partition by user_id,sub) threedays
from
(select user_id,data_dt,sum_carbon,
date_sub(data_dt,rank_number) sub
from
(select user_id,date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt,sum(low_carbon) sum_carbon,
rank() over(partition by user_id order by data_dt) rank_number
from user_low_carbon
where substring(data_dt,1,4)="2017"
group by user_id,data_dt
having sum_carbon>=100)tb1)tb2)t3
where threedays>=3;
其中sub代表此user_id连续sub天满足需求