1.从用户登录明细表(user_login_detail)中查询每天的新增用户数,若一个用户在某天登录了,且在这一天之前没登录过,则任务该用户为这一天的新增用户
需求结果
2.所用到的表和数据
--登录明细表
CREATE TABLE user_login_detail
(
`user_id` string comment '用户id',
`ip_address` string comment 'ip地址',
`login_ts` string comment '登录时间',
`logout_ts` string comment '登出时间'
) COMMENT '用户登录明细表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--数据装载
INSERT overwrite table user_login_detail
VALUES ('101', '180.149.130.161', '2021-09-21 08:00:00', '2021-09-27 08:30:00'),
('101', '180.149.130.161', '2021-09-27 08:00:00', '2021-09-27 08:30:00'),
('101', '180.149.130.161', '2021-09-28 09:00:00', '2021-09-28 09:10:00'),
('101', '180.149.130.161', '2021-09-29 13:30:00', '2021-09-29 13:50:00'),
('101', '180.149.130.161', '2021-09-30 20:00:00', '2021-09-30 20:10:00'),
('102', '120.245.11.2', '2021-09-22 09:00:00', '2021-09-27 09:30:00'),
('102', '120.245.11.2', '2021-10-01 08:00:00', '2021-10-01 08:30:00'),
('102', '180.149.130.174', '2021-10-01 07:50:00', '2021-10-01 08:20:00'),
('102', '120.245.11.2', '2021-10-02 08:00:00', '2021-10-02 08:30:00'),
('103', '27.184.97.3', '2021-09-23 10:00:00', '2021-09-27 10:30:00'),
('103', '27.184.97.3', '2021-10-03 07:50:00', '2021-10-03 09:20:00'),
('104', '27.184.97.34', '2021-09-24 11:00:00', '2021-09-27 11:30:00'),
('104', '27.184.97.34', '2021-10-03 07:50:00', '2021-10-03 08:20:00'),
('104', '27.184.97.34', '2021-10-03 08:50:00', '2021-10-03 10:20:00'),
('104', '120.245.11.89', '2021-10-03 08:40:00', '2021-10-03 10:30:00'),
('105', '119.180.192.212', '2021-10-04 09:10:00', '2021-10-04 09:30:00'),
('106', '119.180.192.66', '2021-10-04 08:40:00', '2021-10-04 10:30:00'),
('106', '119.180.192.66', '2021-10-05 21:50:00', '2021-10-05 22:40:00'),
('107', '219.134.104.7', '2021-09-25 12:00:00', '2021-09-27 12:30:00'),
('107', '219.134.104.7', '2021-10-05 22:00:00', '2021-10-05 23:00:00'),
('107', '219.134.104.7', '2021-10-06 09:10:00', '2021-10-06 10:20:00'),
('107', '27.184.97.46', '2021-10-06 09:00:00', '2021-10-06 10:00:00'),
('108', '101.227.131.22', '2021-10-06 09:00:00', '2021-10-06 10:00:00'),
('108', '101.227.131.22', '2021-10-06 22:00:00', '2021-10-06 23:00:00'),
('109', '101.227.131.29', '2021-09-26 13:00:00', '2021-09-27 13:30:00'),
('109', '101.227.131.29', '2021-10-06 08:50:00', '2021-10-06 10:20:00'),
('109', '101.227.131.29', '2021-10-08 09:00:00', '2021-10-08 09:10:00'),
('1010', '119.180.192.10', '2021-09-27 14:00:00', '2021-09-27 14:30:00'),
('1010', '119.180.192.10', '2021-10-09 08:50:00', '2021-10-09 10:20:00');
3. 答案
思路一:首次登录即是用户新增,使用rank()开窗得出首次登录的日期,根据日期进行分组,即可得到结果
3.1过滤掉用户每日多次登录的数据
select user_id,
date_format(login_ts, 'yyyy-MM-dd') login_date
from user_login_detail
group by user_id,
date_format(login_ts, 'yyyy-MM-dd');
运行结果
3.2rank()开窗以user_id进行分组,登录日期进行排序,得出排名
select user_id,
login_date,
rank() over (
partition by
user_id
order by
login_date
) rk
from (
select user_id,
date_format(login_ts, 'yyyy-MM-dd') login_date
from user_login_detail
group by user_id,
date_format(login_ts, 'yyyy-MM-dd')
) t1;
运行结果
3.3筛选出第一名,即是用户的第一次登录,再根据筛选后的结果根据日期进行分组,得到结果
select login_date login_date_first,
count(*) user_count
from (
select user_id,
login_date,
rank() over (
partition by
user_id
order by
login_date
) rk
from (
select user_id,
date_format(login_ts, 'yyyy-MM-dd') login_date
from user_login_detail
group by user_id,
date_format(login_ts, 'yyyy-MM-dd')
) t1
) t2
where rk = 1
group by login_date;
运行结果
思路二:首次登录即是用户新增,使用first_value()开窗得出首次登录的日期,根据日期进行分组,即可得到结果
3.1过滤掉用户欸日多次登录的数据
select user_id,
substr(login_ts, 1, 10) login_date
from user_login_detail
group by user_id,
substr(login_ts, 1, 10);
运行结果
3.2使用first_value()开窗得到用户的首次登录日期
select user_id, login_date, first_value(login_date) over (partition by user_id order by login_date) login_date_first
from (
select user_id,
substr(login_ts, 1, 10) login_date
from user_login_detail
group by user_id,
substr(login_ts, 1, 10)) t1;
运行结果
3.3筛选出登录日期与首次登录日期相等的记录,然后对登录日期进行分组,得到结果
select login_date_first, count(*) user_count
from (
select user_id,
login_date,
first_value(login_date) over (partition by user_id order by login_date) login_date_first
from (
select user_id,
substr(login_ts, 1, 10) login_date
from user_login_detail
group by user_id,
substr(login_ts, 1, 10)) t1) t2
where login_date = login_date_first
group by login_date_first;
运行结果