*会话划分,浏览窗口划分, 浏览记录分段划分
题目: 有浏览记录表, user_id(用户编号), page_id(界面编号), view_timestamp(浏览时间戳)
规定同一个用户,在相邻两次浏览时间间隔不超过60s, 算做是同义词浏览(在同一个浏览窗口, 同一个会话); 为浏览记录表增加一个浏览id(浏览窗口)字段, 同一个浏览窗口浏览id相同;
1. 数据准备
WITH live_events AS (
SELECT * FROM (
VALUES (1001, 1, 1700454270 ), (1002, 3, 1700455200 )
, (1001, 1, 1700454286 ), (1003, 2, 1700454270 )
, (1001, 2, 1700454312 ), (1003, 1, 1700454299 )
, (1001, 1, 1700454384 ), (1003, 2, 1700454351 )
, (1001, 2, 1700454412 ), (1003, 3, 1700454388 )
, (1001, 1, 1700454436 ), (1003, 1, 1700454658 )
, (1002, 2, 1700454270 ), (1003, 2, 1700454699 )
, (1002, 1, 1700454288 ), (1004, 2, 1700454270 )
, (1002, 2, 1700454321 ), (1004, 3, 1700454354 )
, (1002, 3, 1700454348 ), (1004, 2, 1700454465 )
, (1002, 1, 1700454536 ), (1004, 3, 1700454852 )
, (1002, 2, 1700454921 ), (1004, 2, 1700454901 )
, (1002, 2, 1700454989 ), (1004, 3, 1700455183 )
) AS table_name(user_id, page_id, view_timestamp)
)
2. 代码实现
SELECT
user_id, page_id, view_timestamp
, LAG(view_timestamp, 1 , view_timestamp) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS last_timestamp
FROM live_events
user_id | page_id | view_timestamp | last_timestamp |
---|
1001 | 1 | 1700454270 | 1700454270 |
1001 | 1 | 1700454286 | 1700454270 |
1001 | 2 | 1700454312 | 1700454286 |
1001 | 1 | 1700454384 | 1700454312 |
1001 | 2 | 1700454412 | 1700454384 |
1001 | 1 | 1700454436 | 1700454412 |
1002 | 2 | 1700454270 | 1700454270 |
1002 | 1 | 1700454288 | 1700454270 |
1002 | 2 | 1700454321 | 1700454288 |
… | … | … | … |
SELECT
user_id, page_id, view_timestamp, last_timestamp
, IF (view_timestamp - last_timestamp >= 60 , 1 , 0) AS tab
FROM (
SELECT
user_id, page_id, view_timestamp
, LAG(view_timestamp, 1 , view_timestamp) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS last_timestamp
FROM live_events
) a
user_id | page_id | view_timestamp | last_timestamp | tab |
---|
1001 | 1 | 1700454270 | 1700454270 | 0 |
1001 | 1 | 1700454286 | 1700454270 | 0 |
1001 | 2 | 1700454312 | 1700454286 | 0 |
1001 | 1 | 1700454384 | 1700454312 | 1 |
1001 | 2 | 1700454412 | 1700454384 | 0 |
1001 | 1 | 1700454436 | 1700454412 | 0 |
1002 | 2 | 1700454270 | 1700454270 | 0 |
1002 | 1 | 1700454288 | 1700454270 | 0 |
1002 | 2 | 1700454321 | 1700454288 | 0 |
1002 | 3 | 1700454348 | 1700454321 | 0 |
1002 | 1 | 1700454536 | 1700454348 | 1 |
… | … | … | … | … |
SELECT
user_id, page_id, view_timestamp, last_timestamp, tab
, SUM(tab) OVER(PARTITION BY user_id ORDER BY view_timestamp) +1 AS tab2
FROM (
SELECT
user_id, page_id, view_timestamp, last_timestamp
, IF (view_timestamp - last_timestamp >= 60 , 1 , 0) AS tab
FROM (
SELECT
user_id, page_id, view_timestamp
, LAG(view_timestamp, 1 , view_timestamp) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS last_timestamp
FROM live_events
) a
) b ;
user_id | page_id | view_timestamp | last_timestamp | tab | tab2 |
---|
1001 | 1 | 1700454270 | 1700454270 | 0 | 1 |
1001 | 1 | 1700454286 | 1700454270 | 0 | 1 |
1001 | 2 | 1700454312 | 1700454286 | 0 | 1 |
1001 | 1 | 1700454384 | 1700454312 | 1 | 2 |
1001 | 2 | 1700454412 | 1700454384 | 0 | 2 |
1001 | 1 | 1700454436 | 1700454412 | 0 | 2 |
1002 | 2 | 1700454270 | 1700454270 | 0 | 1 |
1002 | 1 | 1700454288 | 1700454270 | 0 | 1 |
1002 | 2 | 1700454321 | 1700454288 | 0 | 1 |
1002 | 3 | 1700454348 | 1700454321 | 0 | 1 |
1002 | 1 | 1700454536 | 1700454348 | 1 | 2 |
1002 | 2 | 1700454921 | 1700454536 | 1 | 3 |
1002 | 2 | 1700454989 | 1700454921 | 1 | 4 |
… | … | … | … | … | … |
SELECT
user_id, page_id, view_timestamp
, CONCAT_WS('_', user_id, tab2) AS conversation_id
FROM (
SELECT
user_id, page_id, view_timestamp, last_timestamp, tab
, SUM(tab) OVER(PARTITION BY user_id ORDER BY view_timestamp) +1 AS tab2
FROM (
SELECT
user_id, page_id, view_timestamp, last_timestamp
, IF (view_timestamp - last_timestamp >= 60 , 1 , 0) AS tab
FROM (
SELECT
user_id, page_id, view_timestamp
, LAG(view_timestamp, 1 , view_timestamp) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS last_timestamp
FROM live_events
) a
) b
) c
;
user_id | page_id | view_timestamp | conversation_id |
---|
1001 | 1 | 1700454270 | 1001_1 |
1001 | 1 | 1700454286 | 1001_1 |
1001 | 2 | 1700454312 | 1001_1 |
1001 | 1 | 1700454384 | 1001_2 |
1001 | 2 | 1700454412 | 1001_2 |
1001 | 1 | 1700454436 | 1001_2 |
1002 | 2 | 1700454270 | 1002_1 |
1002 | 1 | 1700454288 | 1002_1 |
1002 | 2 | 1700454321 | 1002_1 |
1002 | 3 | 1700454348 | 1002_1 |
1002 | 1 | 1700454536 | 1002_2 |
… | … | … | … |
end