业务背景
现有收集到用户的页面点击行为日志数据,数据格式如下:
user_id click_time
A,2020-05-15 01:30:00
A,2020-05-15 01:35:00
A,2020-05-15 02:00:00
A,2020-05-15 03:00:10
A,2020-05-15 03:05:00
B,2020-05-15 02:03:00
B,2020-05-15 02:29:40
B,2020-05-15 04:00:00
会话概念:用户的一次会话含义是指用户进入系统开始到用户离开算作一次会话,离开或者重新开始一次会话的概念是指用户的两次行为事件差值大于30分钟,判断条件是只要两次时间差值大于30分钟就属于两次会话。
需求
对用户的日志数据打上会话内序号,如下
A,2020-05-15 01:30:00,1
A,2020-05-15 01:35:00,2
A,2020-05-15 02:00:00,3
A,2020-05-15 03:00:10,1
A,2020-05-15 03:05:00,2
B,2020-05-15 02:03:00,1
B,2020-05-15 02:29:40,2
B,2020-05-15 04:00:00,1
准备
在Hive中完成数据加载,使用Impala sql完成指标统计…
--创建表
drop table if exists user_clicklog;
create table user_clicklog (
user_id string,
click_time string
)
row format delimited fields terminated by ",";
--加载数据
load data local inpath '/root/impala_data/clicklog.dat' into table user_clicklog;
实现
1.使用lag()函数求出click_time前一行的值
2.将每一行click_time与前一行click_time转为时间戳进行相减,然后除以60转换为分钟数,使用casewhen 语句,大于30的则为1 小于30 的则为0
SELECT user_id, click_time
, (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 AS lagTime
, CASE
WHEN (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 > 30 THEN 1
ELSE 0
END AS fzField
FROM user_clicklog;
3.使用sum()根据user_id 分组,click_time排序(当加了order by 后sum是计算的第一行到当前行的累加)这样遇到为1的时候(时间间隔大于30分钟)数值就会改变,变成不同的分了。
SELECT user_id, lagTime, fzField, SUM(fzField) OVER (PARTITION BY user_id ORDER BY click_time) AS logTime
FROM (
SELECT user_id, click_time
, (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 AS lagTime
, CASE
WHEN (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 > 30 THEN 1
ELSE 0
END AS fzField
FROM user_clicklog
) t1;
4.使用排序函数row_number()进行排序 窗口为user_id +logtime
SELECT user_id, click_time, row_number() OVER (PARTITION BY user_id, logTime ORDER BY click_time) AS num
FROM (
SELECT user_id, click_time, fzField, SUM(fzField) OVER (PARTITION BY user_id ORDER BY click_time) AS logTime
FROM (
SELECT user_id, click_time
, (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 AS lagTime
, CASE
WHEN (UNIX_TIMESTAMP(click_time) - UNIX_TIMESTAMP(lag(click_time) OVER (PARTITION BY user_id ORDER BY click_time))) / 60 > 30 THEN 1
ELSE 0
END AS fzField
FROM user_clicklog
) t1
) t2;
结果
最终得到的结果如下: