hive中[增量/全量比对]算法

60 篇文章 2 订阅
本文介绍了如何使用增量比对和全量比对算法,通过ods.user_update表处理新增和变更数据,并结合user_his拉链表,更新用户资料。涉及数据合并、状态判断和日期处理,以实现实时用户数据的维护和历史记录管理。
摘要由CSDN通过智能技术生成

参考:https://blog.csdn.net/weixin_40444678/article/details/81083614

1、原始表

CREATE EXTERNAL TABLE ods.user (
  user_num STRING COMMENT '用户编号',
  mobile STRING COMMENT '手机号码',
  reg_date STRING COMMENT '注册日期'
)
COMMENT '用户资料表'
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/ods/user';

2、每日新增及更新表

CREATE EXTERNAL TABLE ods.user_update (
  user_num STRING COMMENT '用户编号',
  mobile STRING COMMENT '手机号码',
  reg_date STRING COMMENT '注册日期'
COMMENT '每日用户资料更新表'
)
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/ods/user_update';

3、拉链表

CREATE EXTERNAL TABLE dws.user_his (
  user_num STRING COMMENT '用户编号',
  mobile STRING COMMENT '手机号码',
  reg_date STRING COMMENT '用户编号',
  t_start_date ,
  t_end_date
)COMMENT '用户资料拉链表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/dws/user_his';

4、增量比对算法

算法说明:

ods.user_update表里存放新增的及有变更的数据,对于新删除了的数据没法记录,故叫做“增量比对”算法。
1、union前面的是按已有历史拉链表里计算,包含关链的(与update关联上的)及不需要改变的(与update关联不上的,包含end_time是9999-12-31的及end_time是其他时间的,即状态正常的及已经结束的)。
2、union后面的是新增开链的数据,包含纯新开链的及有变更的开链(在union之前,这部分已经关链)。
3、这里感觉应该使用union all,以提高效率,两边应该不会有重复数据。

INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           A.t_start_time,
           CASE
                WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL THEN '2017-01-01'
                ELSE A.t_end_time
           END AS t_end_time
    FROM dws.user_his AS A
    LEFT JOIN ods.user_update AS B
    ON A.user_num = B.user_num
UNION
    SELECT C.user_num,
           C.mobile,
           C.reg_date,
           '2017-01-02' AS t_start_time,
           '9999-12-31' AS t_end_time
    FROM ods.user_update AS C
) AS T;

5、增量比对算法带参

INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           A.modified_time
           A.t_start_time,
           CASE
                WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL THEN date_add($date,-1)
                ELSE A.t_end_time
           END AS t_end_time
    FROM dws.user_his AS A
    LEFT JOIN ods.user_update AS B
    ON A.user_num = B.user_num
UNION
    SELECT C.user_num,
           C.mobile,
           C.reg_date,
           C.modified_time,
           $date AS t_start_time,
           '9999-12-31' AS t_end_time
    FROM ods.user_update AS C
) AS T;

6、全量比对算法

ods.user_full是当日全量表,表结构略去。

INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           A.t_start_time,
           CASE
                WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NULL THEN date_add($date,-1)   --老拉链表里有,当天全量表里没有,则数据关链;新增不重开链
                WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL and (A.mobile <> B.mobile or A.reg_date<>B.reg_date) THEN date_add($date,-1)   --老拉链表和当天全量表都有,但值发生了变化,则数据关链;同时新增开链操作
                ELSE A.t_end_time   --其他(1、已经关链的数据;2、未关链但老拉链表和当天全量对比信息没有变化的数据)end_time不变
           END t_end_time
    FROM dws.user_his A
    LEFT JOIN ods.user_full B
    ON A.user_num = B.user_num
UNION ALL
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           $date t_start_time,
           '9999-12-31' t_end_time
    FROM ods.user_full A
   INNER JOIN (select b1.*,row_number()over(partition by b1.user_num order by b1.t_end_time desc) rn from dws.user_his b1 where t_start_time<=date_add($date,-1)) B
    ON A.user_num = B.user_num
   WHERE b.rn=1 and (A.mobile <> B.mobile or A.reg_date<>B.reg_date)   --比对值变更关链又新开链数据
UNION ALL
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           $date t_start_time,
           '9999-12-31' t_end_time
    FROM ods.user_full A
    LEFT JOIN dws.user_his B
    ON A.user_num = B.user_num
   WHERE B.user_num IS NULL   --纯新增开链数据
) T;

7、全量比对算法考虑回滚

拉链数据支持从历史上某一天跑到当天。跑数时需谨慎,因为一旦重跑,那天之后的数据将消失。

INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           A.t_start_time,
           CASE
                WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL and (A.mobile <> B.mobile or A.reg_date<>B.reg_date) THEN date_add($date,-1)   --老拉链表和当天全量表都有,但值发生了变化,则数据关链;同时新增开链操作
                WHEN A.t_end_time >= date_add($date,-1) AND B.user_num IS NULL THEN date_add($date,-1)   --老拉链表里有,当天全量表里没有(包含状态正常的需要关链,及关链日期在date_add($date,-1)之后的两种情况),则数据关链到date_add($date,-1);新增不重开链
                ELSE A.t_end_time   --其他(1、已经关链且关链日期小于date_add($date,-1)的数据;2、未关链但老拉链表和当天全量对比信息没有变化的数据)end_time不变
           END t_end_time
    FROM dws.user_his A
    LEFT JOIN ods.user_full B
    ON A.user_num = B.user_num
   WHERE A.t_start_time<=date_add($date,-1)   --限制跑数$date的前一天数据日期,$date及其之后日期的数据都将删掉,所以,日期选择要慎重
UNION ALL
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           $date t_start_time,
           '9999-12-31' t_end_time
    FROM ods.user_full A
   INNER JOIN (select b1.*,row_number()over(partition by b1.user_num order by b1.t_end_time desc) rn from dws.user_his b1 where t_start_time<=date_add($date,-1)) B
    ON A.user_num = B.user_num
   WHERE B.rn=1 and ((A.mobile <> B.mobile or A.reg_date<>B.reg_date) or B.t_end_time<date_add($date,-1))   --当前全量表数据与历史开始日期小于date_add($date,-1)的比对值变更关链又新开链数据;另外,包含了一部分内容没变化中间少若干天数据,后来又重新来过,重新开链的数据
UNION ALL
    SELECT A.user_num,
           A.mobile,
           A.reg_date,
           $date t_start_time,
           '9999-12-31' t_end_time
    FROM ods.user_full A
    LEFT JOIN (select * from dws.user_his where t_start_time<=date_add($date,-1)) B   --限制历史拉链表的数据为date_add($date,-1)及之前的
    ON A.user_num = B.user_num
   WHERE B.user_num IS NULL   --纯新增开链数据
) T;

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值