SQL Interview Questions

Parsing a JSON Array

{"tbl":[{"col1":"张三","col2":17},{"col1":"王五","col2":18}]}

Expected result:

col1  col2
张三  17
王五  18
-- Strip the outer brackets, change the separator between objects from "," to ";",
-- then explode on ";" and pull each field out with get_json_object
SELECT get_json_object(obj, '$.col1') col1
	, get_json_object(obj, '$.col2') col2
FROM (
	SELECT regexp_replace(regexp_replace(get_json_object(binlog, '$.tbl'), '^\\[|]$', ''), '\\},\\{', '\\};\\{') AS new_col
	FROM ods_log
) temp
	LATERAL VIEW explode(split(new_col, ';')) t AS obj;
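
When several fields are extracted from each object, Hive's json_tuple UDTF parses each JSON string once instead of once per field. A sketch of the same query rewritten with it, assuming the same ods_log layout as above:

SELECT t2.col1, t2.col2
FROM (
	SELECT regexp_replace(regexp_replace(get_json_object(binlog, '$.tbl'), '^\\[|]$', ''), '\\},\\{', '\\};\\{') AS new_col
	FROM ods_log
) temp
	LATERAL VIEW explode(split(new_col, ';')) t AS obj
	LATERAL VIEW json_tuple(obj, 'col1', 'col2') t2 AS col1, col2;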

Counting the Viewers in Each Live Room at Every Moment

Source data: live_room

live_id  in_out  ts
1        1       1663169946
1        1       1663169946
1        -1      1663169946
1        1       1663169947
1        1       1663169947
1        1       1663169947
1        1       1663169948
2        1       1663169946
2        1       1663169946
2        -1      1663169946
2        -1      1663169947
2        -1      1663169947
2        1       1663169947
2        1       1663169948
  • The in_out column marks whether a user entered or left the live room: 1 means entered, -1 means left
with tbl_filter as ( -- net change of viewers per room per timestamp
    SELECT live_id,
           ts,
           sum(in_out) viewer_num
    FROM live_room
    GROUP BY  live_id, ts
)
SELECT  live_id,
        ts,
        sum(viewer_num) over(partition by live_id ORDER BY ts) total_viewer -- running total = head count at each moment
FROM tbl_filter
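
Applied to the sample data, the running total gives the head count at each moment:

live_id  ts          total_viewer
1        1663169946  1
1        1663169947  4
1        1663169948  5
2        1663169946  1
2        1663169947  0
2        1663169948  1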

Finding Users Who Follow Each Other

Source data: user_follow

user_id  follow_ids
10051    [10052,10053]
10052    [10051]
10053    [10052,10055,10056]
10054    [10052,10053]
10055    [10056]
10056    [10055]
with tbl_flat as ( --flatten the follow list to one row per (user, followee); strip the brackets before splitting
SELECT user_id, follow_id
FROM user_follow
LATERAL VIEW explode(split(regexp_replace(follow_ids, '\\[|\\]', ''), ',')) t AS follow_id
)
--split the concatenated pair key back into two columns
SELECT split(rels, '_')[0] AS user_l, split(rels, '_')[1] AS user_r
FROM (
	--count occurrences of each pair key; a key that appears twice means a mutual follow
	SELECT rels, count(*) AS num
	FROM (
		--concatenate the two ids with the larger one first, so that 10051 and 10052
		--produce the same key, 10052_10051, from either direction
		SELECT if(user_id > follow_id, concat_ws('_', user_id, follow_id), concat_ws('_', follow_id, user_id)) AS rels
		FROM tbl_flat
		WHERE follow_id IS NOT NULL
	) temp
	GROUP BY rels
	HAVING num > 1
) r
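
An equivalent formulation joins the flattened table to itself. A sketch reusing tbl_flat from above; the a.user_id < a.follow_id filter emits each mutual pair exactly once:

SELECT a.user_id AS user_l, a.follow_id AS user_r
FROM tbl_flat a
INNER JOIN tbl_flat b
	ON a.user_id = b.follow_id AND a.follow_id = b.user_id
WHERE a.user_id < a.follow_id;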

Finding Users Who Logged In on Three Consecutive Days

Source data: user_login_detail

user_id  login_time
1        2022-09-13
1        2022-09-14
1        2022-09-15
2        2022-09-10
2        2022-09-12
2        2022-09-13
2        2022-09-15
3        2022-09-13
3        2022-09-14
--Method 1: via row_number
SELECT user_id,
       start_date,
       count(start_date) continua_num
FROM (
    SELECT user_id,
           date_add(date_sub(login_time, rn), 1) start_date -- login_time minus (rn - 1): identical for every day of one consecutive run
    FROM (
        SELECT *,
               row_number() over(partition by user_id ORDER BY login_time) rn
        FROM (
            SELECT user_id,
                   login_time
            FROM user_login_detail
            GROUP BY user_id, login_time -- dedupe repeated logins on the same day
        ) map_result
    ) filter_result
) temp_result
GROUP BY  user_id, start_date
HAVING continua_num >= 3
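
For user 1 the date-minus-row-number trick maps every day of a run onto the same anchor date:

login_time  rn  start_date
2022-09-13  1   2022-09-13
2022-09-14  2   2022-09-13
2022-09-15  3   2022-09-13

All three rows share start_date 2022-09-13, so the outer count reaches 3 and the HAVING clause keeps user 1.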

--Method 2: via lag (or, symmetrically, lead)
with tbl_map as (
    SELECT user_id,
           login_time,
           lag(login_time, 1) over(partition by user_id ORDER BY login_time) pre_time, 
           lag(login_time, 2) over(partition by user_id ORDER BY login_time) pre2_time
    FROM user_login_detail
)
SELECT DISTINCT user_id
FROM tbl_map
WHERE datediff(login_time, pre_time) = 1
        AND datediff(login_time, pre2_time) = 2
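
On the sample data both methods return only user 1 (2022-09-13 through 2022-09-15); user 2's login dates always contain a gap, and user 3 has just two consecutive days.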

Computing Each User's Longest Login Streak, Allowing a One-Day Gap

  • A user who logged in on September 1 and September 3 counts as having a 3-day streak

Source data: login_detail

user_id  login_date
1        2022-09-10
2        2022-09-10
1        2022-09-11
2        2022-09-11
1        2022-09-14
2        2022-09-13

Expected result:

user_id  max_contin_days
1        2
2        4
with tbl_more_info as ( --attach each login's previous login date
    SELECT user_id,
           login_date,
           lag(login_date) over(partition by user_id ORDER BY login_date) pre_date
    FROM login_detail
),
tbl_flag as ( --compute the gap since the previous login; flag 0 if the gap is within the allowed range (1 or 2 days), 1 otherwise
    SELECT user_id,
           login_date,
           if(datediff(login_date, pre_date) IN (1, 2), 0, 1) contin_flag
    FROM tbl_more_info
),
tbl_contin as ( --running sum of the flags forms a group id; the rows that break a streak were flagged 1 precisely so that every break bumps the group number by one
    SELECT user_id,
           login_date,
           sum(contin_flag) over(partition by user_id ORDER BY login_date) group_flag
    FROM tbl_flag
),
tbl_temp_result as ( --length of every streak per user
    SELECT user_id,
           datediff(max(login_date), min(login_date)) + 1 contin_days
    FROM tbl_contin
    GROUP BY  user_id, group_flag
)
--keep each user's longest streak
SELECT user_id,
       max(contin_days) max_contin_days
FROM tbl_temp_result
GROUP BY user_id
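
Tracing user 2: the gaps from the previous login are NULL, 1 day, and 2 days, so contin_flag is 1, 0, 0 and the running sum puts all three logins in one group; datediff('2022-09-13', '2022-09-10') + 1 = 4, matching the expected result. For user 1, the 3-day gap before 2022-09-14 opens a new group, leaving a maximum streak of 2.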

Average Salary per Department, Excluding the Highest and Lowest

Source data: user_salary

user_id  department_id  salary
1        1              10000
2        1              20000
3        1              50000
4        1              40000
5        2              20000
6        2              20000
7        2              30000
8        2              40000
SELECT department_id,
       avg(salary) avg_salary
FROM (
    SELECT *,
         rank() over(partition by department_id ORDER BY salary) min_salary_flag, 
         rank() over(partition by department_id ORDER BY salary desc) max_salary_flag
    FROM user_salary
) filter_salary 
WHERE min_salary_flag <> 1
        AND max_salary_flag <> 1
GROUP BY  department_id
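
Note the tie handling: rank() excludes every row tied at an extreme, so in department 2 both 20000 salaries are dropped and the average is 30000 (department 1 also averages 30000 after dropping 10000 and 50000). If instead exactly one highest and one lowest row should be dropped regardless of ties, swap in row_number():

SELECT department_id,
       avg(salary) avg_salary
FROM (
    SELECT *,
         row_number() over(partition by department_id ORDER BY salary) min_salary_flag, 
         row_number() over(partition by department_id ORDER BY salary desc) max_salary_flag
    FROM user_salary
) filter_salary 
WHERE min_salary_flag <> 1
        AND max_salary_flag <> 1
GROUP BY  department_id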

Finding High-Value Users

  • Definition of a high-value user: within some 7-day window, made at least 3 purchases with a total amount of at least 1000

Source data: shop_detail

user_id  buy_date    amount
1        2022-09-10  100
1        2022-09-11  200
2        2022-09-13  400
2        2022-09-14  800
1        2022-09-16  800
1        2022-09-17  600
2        2022-09-22  400
3        2022-09-22  400
3        2022-09-23  300
3        2022-09-24  100
3        2022-09-25  100

Expected result:

user_id
1
-- Hive's RANGE frames need a numeric ORDER BY key, so order by the day number rather than the date string
SELECT DISTINCT user_id
FROM (
	SELECT *, 
		count(1) OVER (PARTITION BY user_id ORDER BY datediff(buy_date, '1970-01-01') RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS pur_num, 
		sum(amount) OVER (PARTITION BY user_id ORDER BY datediff(buy_date, '1970-01-01') RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS pur_amount 
	FROM shop_detail
) temp
WHERE pur_num >= 3
	AND pur_amount >= 1000;
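
Checking against the data: user 1's window ending 2022-09-16 spans 09-10 through 09-16 and holds three purchases totaling 100 + 200 + 800 = 1100, so user 1 qualifies; user 2 never makes three purchases within 7 days, and user 3's four purchases only total 900.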

Top 3 Users by Login Duration

Compute:

  1. The top 3 users by longest single login session, together with each one's longest session period
  2. The period when the most users are online at the same time

Source data: user_login

user_id  type  time
1        in    2022-08-01 08:00:00
2        in    2022-08-01 08:15:00
3        in    2022-08-01 09:00:00
1        out   2022-08-01 09:15:00
4        in    2022-08-01 09:20:00
4        out   2022-08-01 09:35:00
5        in    2022-08-01 09:31:00
3        out   2022-08-01 09:31:00
4        in    2022-08-01 10:20:00
4        out   2022-08-01 11:35:00
  • The type column: in means login, out means logout
--Top 3 users and each one's longest session
SELECT user_id, rtime AS login_time, time AS logout_time
FROM (
	SELECT user_id, time, rtime, row_number() OVER (ORDER BY duration DESC) AS rk
	FROM (
		SELECT *, row_number() OVER (PARTITION BY user_id ORDER BY duration DESC) AS user_rk
		FROM (
			SELECT *, unix_timestamp(time) - unix_timestamp(rtime) AS duration
			FROM (
				-- attach the matching login time to every logout row
				SELECT *
					, if(type = 'out', lag(time, 1) OVER (PARTITION BY user_id ORDER BY time ASC), NULL) AS rtime
				FROM user_login
			) a
		) b
		WHERE duration IS NOT NULL -- keeps only logout rows, which carry complete sessions
	) c
	WHERE user_rk = 1 -- each user's longest session
) d
WHERE rk <= 3;
--Moment with the most users online at once
SELECT num, time
FROM (
	SELECT *, rank() OVER (ORDER BY num DESC) AS rk
	FROM (
		-- running sum of +1 (login) / -1 (logout) events = users online after each event
		SELECT user_id, sum(stat) OVER (ORDER BY time ASC) AS num, time
		FROM (
			SELECT user_id
				, if(type = 'in', 1, -1) AS stat
				, time
			FROM user_login
		) a
	) b
) c
WHERE rk = 1;
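
One subtlety: at 2022-08-01 09:31:00 a login (user 5) and a logout (user 3) share a timestamp, so the running sum depends on which row happens to sort first. If logouts should be counted before logins at the same instant, adding stat as a tie-breaker in the innermost window fixes the order:

SELECT user_id, sum(stat) OVER (ORDER BY time ASC, stat ASC) AS num, time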

First Payment Month per Employment Stint

  • For each user, find the first month a social security payment was made in each stretch of employment. If Zhang San works at 滴滴, leaves for 字节, and after a while returns to 滴滴, that counts as three separate stints

Source data: social_detail

card_no  com_name  pay_month
1        滴滴      2021-08
1        滴滴      2021-09
1        滴滴      2021-10
1        滴滴      2021-11
1        字节      2021-12
1        字节      2022-01
1        滴滴      2022-02
1        滴滴      2022-03

Expected result:

card_no  com_name  pay_month
1        滴滴      2021-08
1        字节      2021-12
1        滴滴      2022-02
-- a stint starts wherever the previous month's company differs, or there is no previous row
SELECT card_no, com_name, pay_month
FROM (
	SELECT *, lag(com_name, 1) OVER (PARTITION BY card_no ORDER BY pay_month ASC) AS flag
	FROM social_detail
) more_info
WHERE flag IS NULL
	OR flag <> com_name;
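
The flag IS NULL branch covers the first row of each card_no, where lag returns NULL. Hive's NULL-safe comparison operator <=> collapses the two conditions into one, an equivalent small tweak:

WHERE NOT (flag <=> com_name);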

Retention

  • Use the user login table to compute day-3, day-7, and day-15 retention; the query below shows the day-7 case
  • The login table comes in two variants, partitioned by day and unpartitioned, to show both ways of computing

Source data: login_detail

user_id  login_date
1        2022-09-10
2        2022-09-10
1        2022-09-11
2        2022-09-11
1        2022-09-14
-- day-7 retention: users with a login exactly 6 days after their first login
-- (order the RANGE frame by the day number, since Hive's RANGE frames need a numeric key)
SELECT user_id
FROM (
	SELECT *, 
		datediff(login_date, window_first_date) AS date_diff
	FROM (
		SELECT *, 
			min(login_date) OVER (PARTITION BY user_id) AS first_login_date, 
			first_value(login_date) OVER (PARTITION BY user_id ORDER BY datediff(login_date, '1970-01-01') RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS window_first_date
		FROM login_detail
	) temp
	WHERE window_first_date = first_login_date
) result
WHERE date_diff = 6;
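
For the day-3 and day-15 variants only the window width and the final filter change: 2 PRECEDING with date_diff = 2, or 14 PRECEDING with date_diff = 14. On the daily-partitioned variant of the table, a plain join between partitions replaces the window function. A minimal sketch, assuming a partition key named dt and treating everyone who logged in on the cohort date as the cohort:

SELECT count(DISTINCT b.user_id) / count(DISTINCT a.user_id) AS day7_retention
FROM (SELECT user_id FROM login_detail WHERE dt = '2022-09-10') a
LEFT JOIN (SELECT user_id FROM login_detail WHERE dt = '2022-09-16') b
	ON a.user_id = b.user_id;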

Second Purchase Amount Within 30 Days of the First

  • For users whose first purchase fell in a given month, find the amount of the second order they placed within 30 days of that first purchase
  • First purchase: the user's first order anywhere on the platform

Source data: tbl_order; dt is the partition key, with incremental daily partitions

ord_id  ord_amt  user_id  ord_time    dt
100010  12.3     500001   2022-01-01  2022-01-01
100011  122.3    500002   2022-01-01  2022-01-01
100012  212.3    500003   2022-01-01  2022-01-01
100013  312.3    500001   2022-01-02  2022-01-02
100014  312.3    500001   2022-01-02  2022-01-02
with tbl_data as (
    select *, 
        row_number() over(partition by user_id order by ord_time asc) rk, 
        datediff(ord_time, lag(ord_time) over(partition by user_id order by ord_time asc)) diff -- days since the previous order
    from tbl_order
    where dt <= current_date()
),
tbl_filter as ( -- users whose first order fell in January 2022
    select user_id
    from tbl_data
    where rk = 1 and ord_time >= '2022-01-01' and ord_time <= '2022-01-31'
),
tbl_ord as ( -- second orders placed within 30 days of the first
    select * from tbl_data where rk = 2 and diff <= 30
)
select tbl_filter.user_id, tbl_ord.ord_amt
from tbl_filter
inner join tbl_ord on tbl_filter.user_id = tbl_ord.user_id;
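
On the sample data only user 500001 has a second order: the first (ord_id 100010, 2022-01-01) is followed one day later by a second order of 312.3, so the query returns (500001, 312.3); users 500002 and 500003 each have a single order.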

Recommending to Each User the Products Bought by Similar Users

  • Products the user has already bought must be excluded from the recommendations
  • Users who have bought two or more of the same products are considered similar

Source data: orders

order_id  uid    gids
10001     18109  5968
10002     18109  5968,99987
10003     73519  7773
10004     73519  5968,99987
10005     73519  6663
10006     62289  6663
10007     62289  5968
10008     62289  1972

Expected result:

uid    gid
18109  7773
18109  6663
73519  1972
62289  7773
62289  99987

Approach:

  1. Deduplicate the raw data first, since a user may have bought the same product more than once
  2. Join the table to itself on product id, requiring the two user_ids to differ so a user is never paired with themselves
  3. Concatenate the two user ids (concat_ws("_", a.user_id, b.user_id), ordering the ids before concatenating) and deduplicate by product_id (if users a and b both bought product 1, the join produces two records for it)
  4. Any key that appears two or more times marks a pair of similar users, which yields the similar-user list
WITH tbl_fmap AS (--explode the purchased-product list
    SELECT uid, gid
    FROM orders
    LATERAL VIEW explode(split(gids, ',')) t AS gid
),
tbl_dis AS (--a user may buy the same product repeatedly, so dedupe by (user, product)
    SELECT uid, gid
    FROM tbl_fmap
    GROUP BY uid, gid
),
tbl_conn AS (--self-join on product id, requiring different uids so a user is never paired with themselves
             --(the Hive version used for testing is above 2.2.0, so the non-equi condition is allowed in the ON clause);
             --concatenate the matched user ids, ordered so each pair always yields the same key
    SELECT if(tbl_dis.uid > t.uid, concat_ws('_', t.uid, tbl_dis.uid), concat_ws('_', tbl_dis.uid, t.uid)) AS uids, 
        tbl_dis.gid AS gid
    FROM tbl_dis
    	INNER JOIN tbl_dis t
    	ON tbl_dis.gid = t.gid AND tbl_dis.uid <> t.uid
),
tbl_relation AS (--the join emits each pair twice, once from each side, so dedupe by (uids, gid) first;
                 --after deduping, a key seen two or more times marks a pair of similar users
    SELECT uids, count(gid) AS num
    FROM (
    	SELECT uids, gid
    	FROM tbl_conn
    	GROUP BY uids, gid
    ) t
    GROUP BY uids
    HAVING num > 1
),
tbl_relations AS (--expand each pair in both directions, since similarity is mutual
    SELECT split(uids, '_')[0] uid, split(uids, '_')[1] urid
    FROM tbl_relation
    UNION ALL
    SELECT split(uids, '_')[1] uid, split(uids, '_')[0] urid
    FROM tbl_relation
),
tbl_recommend AS (--two joins: the first collects the similar users' products, the second flags products the user already bought so they can be removed
    SELECT t.uid AS uid, t.gid AS gid, tbl_dis.uid AS flag
    FROM (
    	SELECT tbl_relations.uid AS uid, tbl_dis.gid AS gid
    	FROM tbl_relations
    		LEFT JOIN tbl_dis ON tbl_relations.urid = tbl_dis.uid
    ) t
    	LEFT JOIN tbl_dis
    	ON t.uid = tbl_dis.uid AND t.gid = tbl_dis.gid
)
SELECT uid, gid FROM tbl_recommend WHERE flag IS NULL;
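
Walking the sample data through: 18109 and 73519 share {5968, 99987}, and 73519 and 62289 share {5968, 6663}, so those are the two similar pairs; 18109 and 62289 share only 5968. 18109 therefore receives 73519's remaining products {7773, 6663}, 73519 receives 62289's {1972}, and 62289 receives 73519's {7773, 99987}, matching the expected result.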