HIVE实战处理(一)统计用户连续登录天数

一、初级版的连续登录

1、准备数据样例

--1)对数据样例的用户按登录时间排名
-- Step 1: number each user's logins in date order (sample rows are built inline).
-- The method assumes at most one row per user per day; duplicate days would
-- shift the sequence numbers and split a streak.
with temp01 as (
    select
        user_id,
        login_time,
        -- per-user sequence number, ordered by login date
        row_number() over (partition by user_id order by login_time) as rnk
    from (
        select 'A' as user_id, '2021-03-01' as login_time
        union all
        select 'A' as user_id, '2021-03-02' as login_time
        union all
        select 'B' as user_id, '2021-03-01' as login_time
        union all
        select 'A' as user_id, '2021-03-03' as login_time
        union all
        select 'A' as user_id, '2021-03-05' as login_time
        union all
        select 'A' as user_id, '2021-03-06' as login_time
    ) login_log  -- was aliased `user`, which is a reserved keyword in Hive
)
-- Step 2: subtract the sequence number from the login date. Consecutive days
-- of the same user all map to the same date, so that date identifies a streak.
select
    user_id,
    login_time,
    date_sub(login_time, cast(rnk as int)) as dt_start  -- equal dt_start => same streak
from temp01;

在这里插入图片描述

可以看到用户A在2021-03-01、2021-03-02、2021-03-03三天是连续登录的;用户A在2021-03-04、2021-03-05两天是连续登录的

-- Step 3: for every streak of every user, report its length and its first and
-- last login date. The CTE is repeated here because a CTE is only visible
-- inside the statement that defines it.
with temp01 as (
    select
        user_id,
        login_time,
        -- per-user sequence number, ordered by login date
        row_number() over (partition by user_id order by login_time) as rnk
    from (
        select 'A' as user_id, '2021-03-01' as login_time
        union all
        select 'A' as user_id, '2021-03-02' as login_time
        union all
        select 'B' as user_id, '2021-03-01' as login_time
        union all
        select 'A' as user_id, '2021-03-03' as login_time
        union all
        select 'A' as user_id, '2021-03-05' as login_time
        union all
        select 'A' as user_id, '2021-03-06' as login_time
    ) login_log
)
select
    user_id,
    dt_start,
    count(*)        as days,             -- number of days in the streak
    min(login_time) as first_login_day,  -- first day of the streak
    max(login_time) as last_login_day    -- last day of the streak
from (
    select
        user_id,
        login_time,
        date_sub(login_time, cast(rnk as int)) as dt_start  -- equal dt_start => same streak
    from temp01
) t1
group by user_id, dt_start;

在这里插入图片描述

二、进阶版的连续登录

1、根据附件创建临时表
-- Recreate the scratch table that receives the tab-separated text file.
-- `if exists` keeps the script re-runnable even when the table is not there yet.
drop table if exists temp.tmp_test_room;
create table temp.tmp_test_room (
    roomid   string,
    pt_month string,
    pt_day   string  -- parsed with to_date() by the queries that read this table
)
row format delimited fields terminated by '\t';
2、导入文本数据到hive表

在控制台执行命令把文件映射成表
# Copy the local file ./p.txt into the HDFS directory given as the second argument.
# NOTE(review): presumably this directory is the storage location of temp.tmp_test_room — confirm against the warehouse configuration.
hdfs dfs -put ./p.txt /user/hive/warehouse/temp.db/tmp_test_room

3、数据分析
  1. 对用户登录的天数进行排名
-- Number the active days of one room in date order.
select
    roomid,
    to_date(pt_day) as pt_day,
    row_number() over (partition by roomid order by to_date(pt_day) asc) as rn
from temp.tmp_test_room
where roomid = '9999589'  -- roomid is a string column: compare with a string literal, not a number
  and pt_day between '2017-01-01' and '2017-01-16';


结果:
±---------±------------±----±-+
| roomid | pt_day | rn |
±---------±------------±----±-+
| 9999589 | 2017-01-01 | 1 |
| 9999589 | 2017-01-02 | 2 |
| 9999589 | 2017-01-04 | 3 |
| 9999589 | 2017-01-05 | 4 |
| 9999589 | 2017-01-06 | 5 |
| 9999589 | 2017-01-07 | 6 |
| 9999589 | 2017-01-08 | 7 |
| 9999589 | 2017-01-09 | 8 |
| 9999589 | 2017-01-10 | 9 |
| 9999589 | 2017-01-12 | 10 |
| 9999589 | 2017-01-13 | 11 |
| 9999589 | 2017-01-14 | 12 |
| 9999589 | 2017-01-15 | 13 |
| 9999589 | 2017-01-16 | 14 |

2)对用户登录累计天数进行分组

-- List the distinct streak keys of one room: date minus sequence number is
-- constant across consecutive days, so each distinct value is one streak.
select
    roomid,
    date_sub(pt_day, rn)
from (
    select
        roomid,
        to_date(pt_day) as pt_day,
        row_number() over (partition by roomid order by to_date(pt_day) asc) as rn
    from temp.tmp_test_room
    where roomid = '9999589'  -- roomid is a string column: compare with a string literal, not a number
      and pt_day between '2017-01-01' and '2017-01-16'
) x
group by roomid, date_sub(pt_day, rn);

结果:
±---------±------------±-+
| roomid | _c1 |
±---------±------------±-+
| 9999589 | 2016-12-31 |
| 9999589 | 2017-01-01 |
| 9999589 | 2017-01-02 |

3)找到用户登录各个时段的最大、最小日期

-- For each streak of one room, report its first day, last day and length.
select
    roomid,
    min(pt_day) as continuity_first_day,
    max(pt_day) as continuity_last_day,
    count(*)    as continuity_days
from (
    select
        roomid,
        to_date(pt_day) as pt_day,
        row_number() over (partition by roomid order by to_date(pt_day) asc) as rn
    from temp.tmp_test_room
    where roomid = '9999589'  -- roomid is a string column: compare with a string literal, not a number
      and pt_day between '2017-01-01' and '2017-01-16'
) x
group by roomid, date_sub(pt_day, rn);  -- date minus sequence number identifies the streak

结果:
±---------±----------------------±---------------------±-----------------±-+
| roomid | continuity_first_day | continuity_last_day | continuity_days |
±---------±----------------------±---------------------±-----------------±-+
| 9999589 | 2017-01-01 | 2017-01-02 | 2 |
| 9999589 | 2017-01-04 | 2017-01-10 | 7 |
| 9999589 | 2017-01-12 | 2017-01-16 | 5 |
±---------±----------------------±---------------------±-----------------±-+

4、连续登录天数记录统计(周六日除外)
-- For every streak of every room, count how many of its days are not a
-- Saturday or Sunday. Streaks made up only of weekend days produce no row.
with t1 as (
    -- one row per (room, streak): first day, last day and length in days
    select
        roomid,
        min(pt_day) as continuity_first_day,
        max(pt_day) as continuity_last_day,
        count(*)    as continuity_days
    from (
        select
            roomid,
            to_date(pt_day) as pt_day,
            row_number() over (partition by roomid order by to_date(pt_day) asc) as rn
        from temp.tmp_test_room
    ) x
    group by roomid, date_sub(pt_day, rn)
)
select
    roomid,
    continuity_first_day,
    continuity_last_day,
    max(continuity_days) as continuity_days,                 -- streak length in calendar days
    count(*)             as except_weekend_continuity_days   -- streak days left after dropping Sat/Sun
from (
    select
        roomid,
        continuity_first_day,
        continuity_last_day,
        continuity_days,
        -- one output row per day of the streak
        date_add(continuity_first_day, pos) as login_day,
        -- 2012-01-01 was a Sunday, so the result is 0 = Sunday ... 6 = Saturday
        pmod(datediff(date_add(continuity_first_day, pos), '2012-01-01'), 7) as week_which_day
    from t1
    -- space(n) split on ' ' yields n + 1 elements, so pos runs from 0 to n
    lateral view posexplode(
        split(space(datediff(continuity_last_day, continuity_first_day)), ' ')
    ) tf as pos, val
) t
where week_which_day not in (6, 0)  -- pmod() returns an integer: compare with integers, not strings
group by roomid, continuity_first_day, continuity_last_day;
5、求用户登录天数最多的时间段

======对多个用户的打卡记录进行统计,并找到用户的累计打卡时间最长的时段===================

-- For every employee, pick the attendance streak that contains the most
-- non-weekend days.
with t1 as (
    -- one row per (employee, streak): first day, last day and length in days
    select
        employee_no,
        min(date_col) as continuity_first_day,
        max(date_col) as continuity_last_day,
        count(*)      as continuity_days
    from (
        select
            employee_no,
            to_date(date_col) as date_col,
            row_number() over (partition by employee_no order by to_date(date_col) asc) as rn
        from temp.temp_lz_attendance_190723_190822_1
        where is_work = 'Y'
    ) x
    group by employee_no, date_sub(date_col, rn)
),
t2 as (  -- the original `t2(select ...` lacked `as`, which is a syntax error
    select
        employee_no,
        continuity_first_day,
        continuity_last_day,
        max(continuity_days) as continuity_days,                 -- streak length in calendar days
        count(*)             as except_weekend_continuity_days   -- streak days left after dropping Sat/Sun
    from (
        select
            employee_no,
            continuity_first_day,
            continuity_last_day,
            continuity_days,
            -- one output row per day of the streak
            date_add(continuity_first_day, pos) as login_day,
            -- 2012-01-01 was a Sunday, so the result is 0 = Sunday ... 6 = Saturday
            pmod(datediff(date_add(continuity_first_day, pos), '2012-01-01'), 7) as week_which_day
        from t1
        -- space(n) split on ' ' yields n + 1 elements, so pos runs from 0 to n
        lateral view posexplode(
            split(space(datediff(continuity_last_day, continuity_first_day)), ' ')
        ) tf as pos, val
    ) t
    where week_which_day not in (6, 0)  -- pmod() returns an integer: compare with integers, not strings
    group by employee_no, continuity_first_day, continuity_last_day
)
select *
from (
    select
        employee_no,
        continuity_first_day,
        continuity_last_day,
        except_weekend_continuity_days,
        -- rank streaks longest-first; the earlier streak wins a tie so the result is deterministic
        row_number() over (
            partition by employee_no
            order by except_weekend_continuity_days desc, continuity_first_day asc
        ) as rn
    from t2
) t
where rn = 1;
  • 2
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值