Hive 计算历史活跃区间表

 历史活跃区间表:

hive>create table active_range_test(
user_id         string,
first_dt        string,
range_start     string,
range_end       string
)
partitioned by (dt string)
row format delimited fields terminated by ','
;

数据:
range.txt(历史数据)

linux>vi range.txt
a1,2023-03-01,2023-03-01,2023-03-04
a1,2023-03-01,2023-03-06,2023-03-08
b1,2023-03-01,2023-03-03,2023-03-04
b1,2023-03-01,2023-03-05,2023-03-08
c1,2023-03-01,2023-03-01,2023-03-06
c1,2023-03-01,2023-03-07,9999-12-31
d1,2023-03-05,2023-03-05,9999-12-31
#a1用户3.1-3.4号来过,3.6-3.8号来过

载入数据

hive>load data local inpath '/root/tmp_data/data/range.txt' into table active_range_test partition (dt='2023-03-10');

T+1日 
活跃用户表:

hive>create table dau_test(user_id string)
partitioned by (dt string)
row format delimited fields terminated by ',';

vi dau.txt(当日数据)

linux> vi dau.txt
a1
d1
e1
b1

hive>load data local inpath '/root/tmp_data/data/dau.txt' into table dau_test partition (dt='2023-03-11');

如果有需要可使用如下命令跳过第一行:

hive > alter table active_range_test set TBLPROPERTIES ('skip.header.line.count'='1');

计算:

法1:--nvl()

hive>with active as(
select * from active_range_test where dt='2023-03-10'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select 
nvl(a.user_id,d.user_id)  as user_id,
nvl(a.first_dt,d.dt)      as first_dt,
nvl(a.range_start,d.dt)   as range_start,
if(a.range_end='9999-12-31' and d.user_id is null,a.dt,nvl(a.range_end,'9999-12-31')) as range_end
from active a full join dau d
on a.user_id=d.user_id;

结果:
+----------+-------------+--------------+-------------+--+
| user_id  |  first_dt   | range_start  |  range_end  |
+----------+-------------+--------------+-------------+--+
| a1       | 2023-03-01  | 2023-03-06   | 2023-03-08  |
| a1       | 2023-03-01  | 2023-03-01   | 2023-03-04  |
| b1       | 2023-03-01  | 2023-03-05   | 2023-03-08  |
| b1       | 2023-03-01  | 2023-03-03   | 2023-03-04  |
| c1       | 2023-03-01  | 2023-03-07   | 2023-03-10  |
| c1       | 2023-03-01  | 2023-03-01   | 2023-03-06  |
| d1       | 2023-03-05  | 2023-03-05   | 9999-12-31  |
| e1       | 2023-03-11  | 2023-03-11   | 9999-12-31  |
+----------+-------------+--------------+-------------+--+


nvl()可代替if判断是否为空值
hive>select nvl(null,1);
+------+--+
| _c0  |
+------+--+
| 1    |
+------+--+
hive>select nvl(2,null);
+------+--+
| _c0  |
+------+--+
| 2    |
+------+--+
hive>select nvl(null,null);
+-------+--+
|  _c0  |
+-------+--+
| NULL  |
+-------+--+

法2:--case when 

hive>with active as(
select * from active_range_test where dt='2023-03-10'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select 
nvl(a.user_id,d.user_id)  as user_id,
nvl(a.first_dt,d.dt)      as first_dt,
nvl(a.range_start,d.dt)   as range_start,
case 
when range_end='9999-12-31' and d.user_id is null then '2023-03-10'
when d.user_id is not null and a.user_id is null  then '9999-12-31'
else a.range_end
end as range_end
from active a full join dau d
on a.user_id=d.user_id;

#nvl(a.user_id,d.user_id)  as user_id=if(a.user.id is null,d.user_id,a.user_id);

结果:
+----------+-------------+--------------+-------------+--+
| user_id  |  first_dt   | range_start  |  range_end  |
+----------+-------------+--------------+-------------+--+
| a1       | 2023-03-01  | 2023-03-06   | 2023-03-08  |
| a1       | 2023-03-01  | 2023-03-01   | 2023-03-04  |
| b1       | 2023-03-01  | 2023-03-05   | 2023-03-08  |
| b1       | 2023-03-01  | 2023-03-03   | 2023-03-04  |
| c1       | 2023-03-01  | 2023-03-07   | 2023-03-10  |
| c1       | 2023-03-01  | 2023-03-01   | 2023-03-06  |
| d1       | 2023-03-05  | 2023-03-05   | 9999-12-31  |
| e1       | 2023-03-11  | 2023-03-11   | 9999-12-31  |
+----------+-------------+--------------+-------------+--+


计算历史用户 以前来了的,今天也来了,不算新增用户

hive>with active as(
select user_id,min(first_dt) as first_dt  from active_range_test  where  dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select 
active.user_id   as user_id,
active.first_dt  as first_dt,
dau.dt            as range_start,
'9999-12-31'      as range_end
from active join dau 
on active.user_id=dau.user_id;

结果:
+----------+-------------+--------------+-------------+--+
| user_id  |  first_dt   | range_start  |  range_end  |
+----------+-------------+--------------+-------------+--+
| a1       | 2023-03-01  | 2023-03-11   | 9999-12-31  |
| b1       | 2023-03-01  | 2023-03-11   | 9999-12-31  |
+----------+-------------+--------------+-------------+--+

最后历史活跃区间表计算结果(T+1更新):

hive>with one as(
select * from active_range_test where dt='2023-03-10'
),
two as(
select * from dau_test where dt='2023-03-11'
),
active as(
select user_id,min(first_dt) as first_dt  from active_range_test  where  dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31'
),
dau as(
select * from dau_test where dt='2023-03-11'
)
select 
nvl(a.user_id,d.user_id)  as user_id,
nvl(a.first_dt,d.dt)      as first_dt,
nvl(a.range_start,d.dt)   as range_start,
if(a.range_end='9999-12-31' and d.user_id is null,a.dt,nvl(a.range_end,'9999-12-31')) as range_end
from one a full join two d
on a.user_id=d.user_id
union all
select 
active.user_id   as user_id,
active.first_dt  as first_dt,
dau.dt            as range_start,
'9999-12-31'      as range_end
from active join dau 
on active.user_id=dau.user_id;

结果:
+--------------+---------------+------------------+----------------+--+
| _u1.user_id  | _u1.first_dt  | _u1.range_start  | _u1.range_end  |
+--------------+---------------+------------------+----------------+--+
| a1           | 2023-03-01    | 2023-03-06       | 2023-03-08     |
| a1           | 2023-03-01    | 2023-03-01       | 2023-03-04     |
| b1           | 2023-03-01    | 2023-03-05       | 2023-03-08     |
| b1           | 2023-03-01    | 2023-03-03       | 2023-03-04     |
| c1           | 2023-03-01    | 2023-03-07       | 2023-03-10     |
| c1           | 2023-03-01    | 2023-03-01       | 2023-03-06     |
| d1           | 2023-03-05    | 2023-03-05       | 9999-12-31     |
| e1           | 2023-03-11    | 2023-03-11       | 9999-12-31     |
| a1           | 2023-03-01    | 2023-03-11       | 9999-12-31     |
| b1           | 2023-03-01    | 2023-03-11       | 9999-12-31     |
+--------------+---------------+------------------+----------------+--+

hive>select user_id,min(first_dt) as first_dt  from active_range_test  where  dt='2023-03-10'
group by user_id
having max(range_end)!='9999-12-31';

结果:
+----------+-------------+--+
| user_id  |  first_dt   |
+----------+-------------+--+
| a1       | 2023-03-01  |
| b1       | 2023-03-01  |
+----------+-------------+--+
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值