历史活跃区间表:
hive>create table active_range_test(
user_id string,
first_dt string,
range_start string,
range_end string
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
数据:
range.txt(历史数据)
linux>vi range.txt
a1,2023-03-01,2023-03-01,2023-03-04
a1,2023-03-01,2023-03-06,2023-03-08
b1,2023-03-01,2023-03-03,2023-03-04
b1,2023-03-01,2023-03-05,2023-03-08
c1,2023-03-01,2023-03-01,2023-03-06
c1,2023-03-01,2023-03-07,9999-12-31
d1,2023-03-05,2023-03-05,9999-12-31
#a1用户3.1-3.4号来过,3.6-3.8号来过
载入数据
hive>load data local inpath '/root/tmp_data/data/range.txt' into table active_range_test partition (dt='2023-03-10');
T+1日
活跃用户表:
hive>create table dau_test(user_id string)
partitioned by (dt string)
row format delimited fields terminated by ',';
vi dau.txt(当日数据)
linux> vi dau.txt
a1
d1
e1
b1
hive>load data local inpath '/root/tmp_data/data/dau.txt' into table dau_test partition (dt='2023-03-11');
如果有需要可使用如下命令跳过第一行:
hive > alter table active_range_test set TBLPROPERTIES ('skip.header.line.count'='1');
计算:
法1:--nvl()
hive>with active as(
select * from active_range_test where dt='2023-03-10'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select
nvl(a.user_id,d.user_id) as user_id,
nvl(a.first_dt,d.dt) as first_dt,
nvl(a.range_start,d.dt) as range_start,
if(a.range_end='9999-12-31' and d.user_id is null,a.dt,nvl(a.range_end,'9999-12-31')) as range_end
from active a full join dau d
on a.user_id=d.user_id;
结果:
+----------+-------------+--------------+-------------+--+
| user_id | first_dt | range_start | range_end |
+----------+-------------+--------------+-------------+--+
| a1 | 2023-03-01 | 2023-03-06 | 2023-03-08 |
| a1 | 2023-03-01 | 2023-03-01 | 2023-03-04 |
| b1 | 2023-03-01 | 2023-03-05 | 2023-03-08 |
| b1 | 2023-03-01 | 2023-03-03 | 2023-03-04 |
| c1 | 2023-03-01 | 2023-03-07 | 2023-03-10 |
| c1 | 2023-03-01 | 2023-03-01 | 2023-03-06 |
| d1 | 2023-03-05 | 2023-03-05 | 9999-12-31 |
| e1 | 2023-03-11 | 2023-03-11 | 9999-12-31 |
+----------+-------------+--------------+-------------+--+
nvl()可代替if判断是否为空值
hive>select nvl(null,1);
+------+--+
| _c0 |
+------+--+
| 1 |
+------+--+
hive>select nvl(2,null);
+------+--+
| _c0 |
+------+--+
| 2 |
+------+--+
hive>select nvl(null,null);
+-------+--+
| _c0 |
+-------+--+
| NULL |
+-------+--+
法2:--case when
hive>with active as(
select * from active_range_test where dt='2023-03-10'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select
nvl(a.user_id,d.user_id) as user_id,
nvl(a.first_dt,d.dt) as first_dt,
nvl(a.range_start,d.dt) as range_start,
case
when range_end='9999-12-31' and d.user_id is null then '2023-03-10'
when d.user_id is not null and a.user_id is null then '9999-12-31'
else a.range_end
end as range_end
from active a full join dau d
on a.user_id=d.user_id;
#nvl(a.user_id,d.user_id) as user_id=if(a.user.id is null,d.user_id,a.user_id);
结果:
+----------+-------------+--------------+-------------+--+
| user_id | first_dt | range_start | range_end |
+----------+-------------+--------------+-------------+--+
| a1 | 2023-03-01 | 2023-03-06 | 2023-03-08 |
| a1 | 2023-03-01 | 2023-03-01 | 2023-03-04 |
| b1 | 2023-03-01 | 2023-03-05 | 2023-03-08 |
| b1 | 2023-03-01 | 2023-03-03 | 2023-03-04 |
| c1 | 2023-03-01 | 2023-03-07 | 2023-03-10 |
| c1 | 2023-03-01 | 2023-03-01 | 2023-03-06 |
| d1 | 2023-03-05 | 2023-03-05 | 9999-12-31 |
| e1 | 2023-03-11 | 2023-03-11 | 9999-12-31 |
+----------+-------------+--------------+-------------+--+
计算历史用户 以前来了的,今天也来了,不算新增用户
hive>with active as(
select user_id,min(first_dt) as first_dt from active_range_test where dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select
active.user_id as user_id,
active.first_dt as first_dt,
dau.dt as range_start,
'9999-12-31' as range_end
from active join dau
on active.user_id=dau.user_id;
结果:
+----------+-------------+--------------+-------------+--+
| user_id | first_dt | range_start | range_end |
+----------+-------------+--------------+-------------+--+
| a1 | 2023-03-01 | 2023-03-11 | 9999-12-31 |
| b1 | 2023-03-01 | 2023-03-11 | 9999-12-31 |
+----------+-------------+--------------+-------------+--+
最后历史活跃区间表计算结果(T+1更新):
hive>with one as(
select * from active_range_test where dt='2023-03-10'
),
two as(
select * from dau_test where dt='2023-03-11'
),
active as(
select user_id,min(first_dt) as first_dt from active_range_test where dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select
nvl(a.user_id,d.user_id) as user_id,
nvl(a.first_dt,d.dt) as first_dt,
nvl(a.range_start,d.dt) as range_start,
if(a.range_end='9999-12-31' and d.user_id is null,a.dt,nvl(a.range_end,'9999-12-31')) as range_end
from one a full join two d
on a.user_id=d.user_id
union all
select
active.user_id as user_id,
active.first_dt as first_dt,
dau.dt as range_start,
'9999-12-31' as range_end
from active join dau
on active.user_id=dau.user_id;
结果:
+--------------+---------------+------------------+----------------+--+
| _u1.user_id | _u1.first_dt | _u1.range_start | _u1.range_end |
+--------------+---------------+------------------+----------------+--+
| a1 | 2023-03-01 | 2023-03-06 | 2023-03-08 |
| a1 | 2023-03-01 | 2023-03-01 | 2023-03-04 |
| b1 | 2023-03-01 | 2023-03-05 | 2023-03-08 |
| b1 | 2023-03-01 | 2023-03-03 | 2023-03-04 |
| c1 | 2023-03-01 | 2023-03-07 | 2023-03-10 |
| c1 | 2023-03-01 | 2023-03-01 | 2023-03-06 |
| d1 | 2023-03-05 | 2023-03-05 | 9999-12-31 |
| e1 | 2023-03-11 | 2023-03-11 | 9999-12-31 |
| a1 | 2023-03-01 | 2023-03-11 | 9999-12-31 |
| b1 | 2023-03-01 | 2023-03-11 | 9999-12-31 |
+--------------+---------------+------------------+----------------+--+
hive>select user_id,min(first_dt) as first_dt from active_range_test where dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31';
结果:
+----------+-------------+--+
| user_id | first_dt |
+----------+-------------+--+
| a1 | 2023-03-01 |
| b1 | 2023-03-01 |
+----------+-------------+--+