JSON数组解析
{"tbl":[{"col1":"张三","col2":17},{"col1":"王五","col2":18}]}
期待结果:
col1 | col2 |
---|---|
张三 | 17 |
王五 | 18 |
-- Explode a JSON array field into one row per element.
-- Input shape: {"tbl":[{"col1":...,"col2":...}, ...]}.
-- Steps: get_json_object pulls the array text of $.tbl; the outer [ ] is
-- stripped and the "},{" separators between objects are rewritten to "};{"
-- so the array can be split on ';' without breaking each object apart on the
-- commas inside it; every exploded object is then parsed field by field.
SELECT get_json_object(obj, '$.col1') col1
, get_json_object(obj, '$.col2') col2
FROM (
-- new_col: the array body with ';' separating the JSON objects
SELECT regexp_replace(regexp_replace(get_json_object(binlog, '$.tbl'), '^\\[|]$', ''), '\\},\\{', '\\};\\{') AS new_col
FROM ods_log
) temp
LATERAL VIEW explode(split(new_col, ';')) t AS obj;
计算直播间每个时刻的人数
原始数据:live_room
live_id | in_out | ts |
---|---|---|
1 | 1 | 1663169946 |
1 | 1 | 1663169946 |
1 | -1 | 1663169946 |
1 | 1 | 1663169947 |
1 | 1 | 1663169947 |
1 | 1 | 1663169947 |
1 | 1 | 1663169948 |
2 | 1 | 1663169946 |
2 | 1 | 1663169946 |
2 | -1 | 1663169946 |
2 | -1 | 1663169947 |
2 | -1 | 1663169947 |
2 | 1 | 1663169947 |
2 | 1 | 1663169948 |
- in_out字段代表用户进入直播间或者离开直播间,当值为1时代表进入,值为-1时代表离开
with tbl_filter as ( -- net headcount change (enters minus leaves) per room per second
SELECT live_id,
    ts,
    sum(in_out) viewer_num
FROM live_room
GROUP BY live_id, ts
)
-- Running total of the per-second deltas gives the viewer count at each moment.
-- Fix: the original summed `in_out`, which no longer exists after the
-- aggregation in tbl_filter, and partitioned by (live_id, ts), which would
-- reset the running total at every timestamp instead of accumulating per room.
SELECT live_id,
    ts,
    sum(viewer_num) over(partition by live_id ORDER BY ts) total_viewer
FROM tbl_filter
统计互相关注的用户
原始数据:user_follow
user_id | follow_ids |
---|---|
10051 | [10052,10053] |
10052 | [10051] |
10053 | [10052,10055,10056] |
10054 | [10052,10053] |
10055 | [10056] |
10056 | [10055] |
with tbl_flat as ( -- flatten follow_ids into one row per followed user.
    -- Fix: strip the surrounding [ ] before splitting; otherwise the first and
    -- last tokens keep a bracket ("[10052", "10053]") and can never match any
    -- user_id when the pair keys are built below.
SELECT user_id, follow_id
FROM user_follow
LATERAL VIEW explode(split(regexp_replace(follow_ids, '^\\[|\\]$', ''), ',')) t AS follow_id
)
-- split the pair key back into the two user ids
SELECT split(rels, '_')[0] AS user_l, split(rels, '_')[1] AS user_r
FROM (
-- a pair key appearing twice means the two users follow each other
SELECT rels, count(*) AS num
FROM (
-- build an order-independent pair key: the larger id always goes first, so
-- 10051->10052 and 10052->10051 both produce 10052_10051
-- NOTE(review): concat_ws needs string arguments — confirm user_id/follow_id
-- are strings on the target table, else add casts
SELECT if(user_id > follow_id, concat_ws('_', user_id, follow_id), concat_ws('_', follow_id, user_id)) AS rels
FROM tbl_flat
WHERE follow_id IS NOT NULL
) temp
GROUP BY rels
HAVING num > 1
) r
统计连续登录三天的用户
原始数据:user_login_detail
user_id | login_time |
---|---|
1 | 2022-09-13 |
1 | 2022-09-14 |
1 | 2022-09-15 |
2 | 2022-09-10 |
2 | 2022-09-12 |
2 | 2022-09-13 |
2 | 2022-09-15 |
3 | 2022-09-13 |
3 | 2022-09-14 |
-- Method 1: row_number difference trick — subtracting each login's sequence
-- number from its date maps every run of consecutive days onto one shared
-- anchor date, so runs can be counted with a plain GROUP BY.
WITH dedup AS ( -- one row per user per login day
    SELECT user_id,
        login_time
    FROM user_login_detail
    GROUP BY user_id, login_time
),
ranked AS ( -- sequence number of each login day within a user
    SELECT user_id,
        login_time,
        row_number() over(partition by user_id ORDER BY login_time) rank
    FROM dedup
),
anchored AS ( -- consecutive days share the same start_date anchor
    SELECT user_id,
        date_add(date_sub(login_time, rank), 1) start_date
    FROM ranked
)
SELECT user_id,
    start_date,
    count(start_date) continua_num
FROM anchored
GROUP BY user_id, start_date
HAVING continua_num >= 3
-- Method 2: via lag — look back at the previous two distinct login days.
with tbl_map as (
SELECT user_id,
    login_time,
    lag(login_time, 1) over(partition by user_id ORDER BY login_time) pre_time,
    lag(login_time, 2) over(partition by user_id ORDER BY login_time) pre2_time
FROM (
    -- dedupe so duplicate same-day logins cannot fake a streak
    SELECT user_id, login_time
    FROM user_login_detail
    GROUP BY user_id, login_time
) dedup
)
SELECT DISTINCT user_id
FROM tbl_map
-- Fix: three consecutive days means the previous login is exactly 1 day back
-- and the one before it exactly 2 days back; the original tested equality,
-- which could only ever match duplicate same-day rows, never a real streak.
WHERE datediff(login_time, pre_time) = 1
AND datediff(login_time, pre2_time) = 2
计算每个用户最大的连续登录天数,可以间隔一天
- 用户在9月1日、9月3日登录了,计算为连续登录3天
原始数据:login_detail
user_id | login_date |
---|---|
1 | 2022-09-10 |
2 | 2022-09-10 |
1 | 2022-09-11 |
2 | 2022-09-11 |
1 | 2022-09-14 |
2 | 2022-09-13 |
预期结果:
user_id | max_contin_days |
---|---|
1 | 2 |
2 | 4 |
with tbl_more_info as ( -- add each login's previous login date
SELECT user_id,
    login_date,
    lag(login_date) over(partition by user_id ORDER BY login_date) pre_date
FROM login_detail
),
tbl_flag as ( -- flag = 0 when the gap to the previous login is within the
              -- allowed 1-2 days, else 1; the 1s become group-break markers
SELECT user_id,
    login_date,
    if(
        datediff(login_date, pre_date) IN (1,2),
        0,1) contin_flag
FROM tbl_more_info
),
tbl_contin as ( -- running sum of break flags yields a group id per streak:
                -- every break increments the id, so one streak shares one id
SELECT user_id,
    login_date,
    sum(contin_flag) over(partition by user_id ORDER BY login_date) group_flag
FROM tbl_flag
),
tbl_temp_result as ( -- span in days of every streak of every user
SELECT user_id,
    datediff(max(login_date), min(login_date)) + 1 contin_days
FROM tbl_contin
GROUP BY user_id, group_flag
)
-- Longest streak per user.
-- Fix: the original had no GROUP BY here, which is invalid with the bare
-- user_id column selected next to the max() aggregate.
SELECT user_id,
    max(contin_days) max_contin_days
FROM tbl_temp_result
GROUP BY user_id
计算各部门去掉最高薪资和最低薪资后的平均薪资
原始数据:user_salary
user_id | department_id | salary |
---|---|---|
1 | 1 | 10000 |
2 | 1 | 20000 |
3 | 1 | 50000 |
4 | 1 | 40000 |
5 | 2 | 20000 |
6 | 2 | 20000 |
7 | 2 | 30000 |
8 | 2 | 40000 |
-- Average salary per department after dropping the highest and lowest salary.
-- Fix: the window functions partitioned by a non-existent column
-- "department"; the table's column is department_id.
-- Note: rank() excludes every row tied for the min/max salary; with ties the
-- whole tied group is dropped, matching the original's behavior.
SELECT department_id,
    avg(salary) avg_salary
FROM
    (SELECT *,
        rank() over(partition by department_id ORDER BY salary) min_salary_flag,
        rank() over(partition by department_id ORDER BY salary desc) max_salary_flag
    FROM user_salary ) filter_salary
WHERE min_salary_flag <> 1
AND max_salary_flag <> 1
GROUP BY department_id
查询高价值用户
- 高价值用户定义:有过7天内购买次数超过3次(含),且购买总金额超过1000(含)的用户
原始数据:shop_detail
user_id | buy_date | amount |
---|---|---|
1 | 2022-09-10 | 100 |
1 | 2022-09-11 | 200 |
2 | 2022-09-13 | 400 |
2 | 2022-09-14 | 800 |
1 | 2022-09-16 | 800 |
1 | 2022-09-17 | 600 |
2 | 2022-09-22 | 400 |
3 | 2022-09-22 | 400 |
3 | 2022-09-23 | 300 |
3 | 2022-09-24 | 100 |
3 | 2022-09-25 | 100 |
期望结果:
user_id |
---|
1 |
-- High-value users: some rolling 7-day window contains >= 3 purchases
-- totalling >= 1000.
-- Fix: a RANGE frame requires a numeric ORDER BY key in Hive; buy_date is a
-- date/string, so order by its day offset from the epoch — a 6-day RANGE then
-- covers the 7-day window ending at the current purchase.
SELECT DISTINCT user_id
FROM (
SELECT *,
    count(1) OVER (PARTITION BY user_id ORDER BY datediff(buy_date, '1970-01-01') RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS pur_num,
    sum(amount) OVER (PARTITION BY user_id ORDER BY datediff(buy_date, '1970-01-01') RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS pur_amount
FROM shop_detail
) temp
WHERE pur_num >= 3
AND pur_amount >= 1000;
统计登录时长TOP3
统计:
- 登录时间最长的Top3用户和最长的时间段
- 同时在线人数最多的时间段
原始数据:user_login
user_id | type | time |
---|---|---|
1 | in | 2022-08-01 08:00:00 |
2 | in | 2022-08-01 08:15:00 |
3 | in | 2022-08-01 09:00:00 |
1 | out | 2022-08-01 09:15:00 |
4 | in | 2022-08-01 09:20:00 |
4 | out | 2022-08-01 09:35:00 |
5 | in | 2022-08-01 09:31:00 |
3 | out | 2022-08-01 09:31:00 |
4 | in | 2022-08-01 10:20:00 |
4 | out | 2022-08-01 11:35:00 |
- type字段:in表示登录,out表示登出
-- Top 3 longest sessions: pair each 'out' row with the previous event's time
-- via lag, take every user's single longest session, then rank those across
-- users. Fixes: the stated schema's column is user_id, not uid; the two
-- row_number columns were both named rk, with the WHERE silently resolving to
-- the inner one — renamed for clarity.
SELECT user_id, time, rtime
FROM (
SELECT user_id, time, rtime, row_number() OVER (ORDER BY duration DESC) AS global_rk
FROM (
    -- rank each user's sessions by length, longest first
    SELECT *, row_number() OVER (PARTITION BY user_id ORDER BY duration DESC) AS user_rk
    FROM (
        -- session length in seconds (out time minus matching in time)
        SELECT *, unix_timestamp(time) - unix_timestamp(rtime) AS duration
        FROM (
            SELECT *
            -- for an 'out' event, rtime is the preceding event's timestamp
            , if(type = 'out', lag(time, 1) OVER (PARTITION BY user_id ORDER BY time ASC), NULL) AS rtime
            FROM user_login
        ) a
    ) b
    WHERE duration IS NOT NULL
) c
WHERE user_rk = 1   -- keep only each user's longest session
) d
WHERE global_rk <= 3;
-- Peak concurrent users: +1 per login, -1 per logout, running total over time.
-- Fix: the stated schema's column is user_id, not uid.
-- The default RANGE frame aggregates all rows sharing the same timestamp
-- together, so simultaneous in/out events at one instant net out correctly.
SELECT num, time
FROM (
SELECT *, rank() OVER (ORDER BY num DESC) AS rk
FROM (
    SELECT user_id, sum(stat) OVER (ORDER BY time ASC) AS num, time
    FROM (
        SELECT user_id
        , if(type = 'in', 1, -1) AS stat
        , time
        FROM user_login
    ) a
) b
) c
WHERE rk = 1;
统计首次缴费月份
- 统计每个用户每段工作经历首次缴纳社保月份,用户张三在滴滴工作后离职去字节,在字节工作一段时间后又回到滴滴,算为三段工作经历
原始数据:social_detail
card_no | com_name | pay_month |
---|---|---|
1 | 滴滴 | 2021-08 |
1 | 滴滴 | 2021-09 |
1 | 滴滴 | 2021-10 |
1 | 滴滴 | 2021-11 |
1 | 字节 | 2021-12 |
1 | 字节 | 2022-01 |
1 | 滴滴 | 2022-02 |
1 | 滴滴 | 2022-03 |
期望结果:
card_no | com_name | pay_month |
---|---|---|
1 | 滴滴 | 2021-08 |
1 | 字节 | 2021-12 |
1 | 滴滴 | 2022-02 |
-- First social-security payment month of each employment stint: a stint
-- starts wherever the company differs from the previous month's company, or
-- where there is no previous record at all.
WITH with_prev_com AS (
    SELECT card_no,
        com_name,
        pay_month,
        lag(com_name, 1) OVER (PARTITION BY card_no ORDER BY pay_month ASC) AS prev_com
    FROM social_detail
)
SELECT card_no, com_name, pay_month
FROM with_prev_com
WHERE prev_com IS NULL
OR prev_com <> com_name;
留存
- 利用用户登录表统计3、7、15日留存
- 用户登录表分为按天分区和不分区两种,方便展示计算方式
原始数据:login_detail
user_id | login_date |
---|---|
1 | 2022-09-10 |
2 | 2022-09-10 |
1 | 2022-09-11 |
2 | 2022-09-11 |
1 | 2022-09-14 |
-- 7-day retention: users whose first login is followed by a login exactly on
-- day 7 (date_diff = 6). Swap the constant 6 for 2 / 14 to get the 3-day and
-- 15-day variants mentioned in the prompt.
SELECT user_id
FROM (
SELECT *,
-- days between this login and the first login of its trailing 7-day window
datediff(login_date, window_first_date) AS date_diff
FROM (
SELECT *,
-- the user's first login ever
min(login_date) OVER (PARTITION BY user_id ) AS first_login_date,
-- earliest login inside the trailing 7-day window ending at this row
-- NOTE(review): RANGE ... PRECEDING with a date/string ORDER BY key is not
-- accepted by every engine; may need a numeric day offset — confirm on the
-- target Hive version.
first_value(login_date) OVER (PARTITION BY user_id ORDER BY login_date ASC RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS window_first_date
FROM login_detail
) temp
-- keep only windows anchored at the user's very first login
WHERE window_first_date = first_login_date
) result
WHERE date_diff = 6;
统计30天内第二次购买金额
- 统计某个月的首购用户,在其首购后30天内的第二单复购金额
- 首购:平台内首次购买
原始数据:tbl_order,dt是分区键,增量分区
ord_id | ord_amt | user_id | ord_time | dt |
---|---|---|---|---|
100010 | 12.3 | 500001 | 2022-01-01 | 2022-01-01 |
100011 | 122.3 | 500002 | 2022-01-01 | 2022-01-01 |
100012 | 212.3 | 500003 | 2022-01-01 | 2022-01-01 |
100013 | 312.3 | 500001 | 2022-01-02 | 2022-01-02 |
100014 | 312.3 | 500001 | 2022-01-02 | 2022-01-02 |
-- Second-order amount within 30 days of the first-ever purchase, for users
-- whose first purchase fell in January 2022.
-- Fixes vs. original: "whit" -> with; data_diff -> datediff; the lead()
-- window was missing ORDER BY; the gap must be taken from the PREVIOUS order
-- (lag) so the rk = 2 row carries days-since-first-purchase; and the
-- partition filter compared against the literal string 'current_date'.
with tbl_data as (
select *,
    row_number() over(partition by user_id order by ord_time asc) rk,
    -- days since this user's previous order (NULL for the first order)
    datediff(ord_time, lag(ord_time) over(partition by user_id order by ord_time asc)) diff
from tbl_order where dt <= current_date()
),
tbl_filter as ( -- users whose first-ever order happened in January 2022
select user_id from tbl_data where rk = 1 and ord_time >= '2022-01-01' and ord_time <= '2022-01-31'
),
tbl_ord as ( -- second order placed no more than 30 days after the first
select * from tbl_data where rk = 2 and diff <= 30 and diff is not null
)
select tbl_filter.user_id, tbl_ord.ord_amt from tbl_filter inner join tbl_ord on tbl_filter.user_id = tbl_ord.user_id;
为每个用户推荐和他相似的用户所购买过的产品
- 推荐时需要排除用户自己购买过的产品
- 曾经购买过2种或者2种以上相同商品的用户认为是相似用户
原始数据:orders
order_id | uid | gid |
---|---|---|
10001 | 1810 | 95968 |
10002 | 1810 | 95968,99987 |
10003 | 7351 | 97773 |
10004 | 7351 | 95968,99987 |
10005 | 7351 | 96663 |
10006 | 6228 | 96663 |
10007 | 6228 | 95968 |
10008 | 6228 | 91972 |
期望结果:
uid | gid |
---|---|
1810 | 97773 |
1810 | 96663 |
7351 | 91972 |
6228 | 97773 |
6228 | 99987 |
解题思路:
- 首先对原始数据去重,同一个用户可能购买过多次同一个产品
- 利用商品id进行关联,关联时要限定user_id不相同,避免自己和自己的关联
- 将user_id进行拼接(concat_ws('_', a.user_id, b.user_id),拼接的时候还需要进行字典排序),并根据product_id进行去重(a用户和b用户都买过商品一,关联后会产生两条记录)
- 同一个key出现次数大于等于两次的为相似用户,此时可以获取到相似用户的列表
WITH tbl_fmap AS ( -- explode the comma-separated product list, one row per product.
    -- Fix: the stated source table is `orders` with column `gid`, not
    -- dwd_order / gids as the original referenced; qualify the source column
    -- so it does not collide with the exploded alias.
    SELECT uid, t.gid AS gid
    FROM orders
    LATERAL VIEW explode(split(orders.gid, ',')) t AS gid
),
tbl_dis AS ( -- a user may buy the same product repeatedly; dedupe (uid, gid)
    SELECT uid, gid
    FROM tbl_fmap
    GROUP BY uid, gid
),
tbl_conn AS ( -- self-join on product id; uid <> uid avoids self pairs (the
              -- inequality join condition requires Hive >= 2.2.0). Build an
              -- order-independent pair key with the larger uid first, so both
              -- directions of a pair collapse onto one key.
    SELECT if(tbl_dis.uid > t.uid, concat_ws('_', t.uid, tbl_dis.uid), concat_ws('_', tbl_dis.uid, t.uid)) AS uids,
        tbl_dis.gid AS gid
    FROM tbl_dis
    INNER JOIN tbl_dis t
    ON tbl_dis.gid = t.gid AND tbl_dis.uid <> t.uid
),
tbl_relation AS ( -- each pair appears twice per shared product (once from each
                  -- side of the join), so dedupe (uids, gid) first; pairs
                  -- sharing >= 2 distinct products count as similar users
    SELECT uids, count(gid) AS num
    FROM (
        SELECT uids, gid
        FROM tbl_conn
        GROUP BY uids, gid
    ) t
    GROUP BY uids
    HAVING num > 1
),
tbl_relations AS ( -- similarity is symmetric: emit both directions of each pair
    SELECT split(uids, '_')[0] uid, split(uids, '_')[1] urid
    FROM tbl_relation
    UNION ALL
    SELECT split(uids, '_')[1] uid, split(uids, '_')[0] urid
    FROM tbl_relation
),
tbl_recommend AS ( -- first join: products the similar user bought; second
                   -- join: flag is non-null when the target user already owns
                   -- the product, so those rows can be filtered out below
    SELECT t.uid AS uid, t.gid AS gid, tbl_dis.uid AS flag
    FROM (
        SELECT tbl_relations.uid AS uid, tbl_dis.gid AS gid
        FROM tbl_relations
        LEFT JOIN tbl_dis ON tbl_relations.urid = tbl_dis.uid
    ) t
    LEFT JOIN tbl_dis
    ON t.uid = tbl_dis.uid AND t.gid = tbl_dis.gid
)
SELECT uid, gid FROM tbl_recommend WHERE flag IS NULL;