二百三十七、Hive——DWS层生成每个清洗字段的异常情况记录

一、目的

在Hive中对每种业务数据的清洗字段的异常数据进行记录

例如这张图,上面是原始数据,下面是每台雷达每天的异常字段的记录

二、实施步骤

(一)建表

create  table  if not exists  dws_data_clean_record_queue(
    data_type      int        comment '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no      string     comment '设备编号',
    create_time    timestamp  comment '创建时间',
    field_name     string     comment '字段名',
    field_value    string     comment '字段值'
)
comment '静态排队数据清洗记录表'
partitioned by (day string)
stored as orc
;

(二)SQL

1、运行比较简单,但代码比较难

with t3 as(
select
       device_no,
       case when device_no is null then CONCAT('device_no:','null')  END AS device_no_value,
       create_time,
       case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END AS lane_no_value,
       case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END AS queue_len_value,
       case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END AS queue_head_value,
       case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END AS queue_tail_value,
       case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END AS queue_count_value,
       concat_ws(',',
                case when device_no is null then CONCAT('device_no:','null') end ,
                case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END ,
                case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END,
                case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END,
                case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END,
                case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END
                ) AS kv_pairs  ,
       day
from (select
        t1.device_no,
        substr(create_time,1,19)                        create_time ,
        get_json_object(list_json,'$.laneNo')           lane_no,
        get_json_object(list_json,'$.queueLen')         queue_len,
        get_json_object(list_json,'$.queueHead')        queue_head,
        get_json_object(list_json,'$.queueTail')        queue_tail,
        get_json_object(list_json,'$.queueCount')       queue_count,
        date(t1.create_time)                            day
    from (
    select
       get_json_object(queue_json,'$.deviceNo')         device_no,
       get_json_object(queue_json,'$.createTime')       create_time,
       get_json_object(queue_json,'$.queueList')        queue_list
    from hurys_dc_ods.ods_queue
        where day='2024-05-15'
        ) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
                                                '\\[|\\]','') ,
                        '\\}\\,\\{','\\}\\;\\{'),
                  '\\;')
          )list_queue as list_json
    )  as t2
        )
insert  overwrite  table  hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
    '6' data_type,
    t3.device_no,
    create_time,
    split(pair, ':')[0] AS field_name,
    split(pair, ':')[1] AS field_value,
    day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
where device_no_value is not null or queue_len_value is not null or lane_no_value is not null
or queue_head_value is not null or queue_tail_value is not null or queue_count_value is not null
;

注意 

(1)t1是解析JSON一级原始数据

(2)t2是解析JSON二级原始数据,得到所有的字段

(3)t3是利用case when和CONCAT,对每个检测字段的字段名和异常值进行拼接。最重要的是,利用concat_ws生成各种检测字段的键值对kv_pairs

(4)最后,则是利用lateral view explode(split)对键值对进行炸开,然后切分每个字段,形成field_name和field_value

(5)另外,最后where指定条件,键值对里面的字段总要非空

2、运行比较耗资源、但代码简单

with t2 as(
    select
        t1.device_no,
        substr(create_time,1,19)                        create_time ,
        get_json_object(list_json,'$.laneNo')           lane_no,
        get_json_object(list_json,'$.queueLen')         queue_len,
        get_json_object(list_json,'$.queueHead')        queue_head,
        get_json_object(list_json,'$.queueTail')        queue_tail,
        get_json_object(list_json,'$.queueCount')       queue_count,
        date(t1.create_time)                            day
from (
    select
       get_json_object(queue_json,'$.deviceNo')         device_no,
       get_json_object(queue_json,'$.createTime')       create_time,
       get_json_object(queue_json,'$.queueList')        queue_list
    from hurys_dc_ods.ods_queue
        where day='2024-05-15'
) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
                                                '\\[|\\]','') ,
                        '\\}\\,\\{','\\}\\;\\{'),
                  '\\;')
          )list_queue as list_json
)
insert  overwrite  table  hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
       '6' data_type,
       device_no,
       create_time,
       'device_no' field_name ,
       case when device_no is null then device_no END AS field_value ,
       day
from t2
where device_no is null
union all
select
       '6' data_type,
       device_no,
       create_time,
       'lane_no'  field_name ,
       case when lane_no < 0 or lane_no >255 then lane_no END AS field_value ,
       day
from t2
where lane_no < 0 or lane_no >255
union all
select
       '6' data_type,
       device_no,
       create_time,
       'queue_len'  field_name ,
       case when queue_len < 0 or queue_len > 500 then queue_len END AS field_value ,
       day
from t2
where  queue_len < 0 or queue_len > 500
union all
select
       '6' data_type,
       device_no,
       create_time,
       'queue_head'  field_name ,
       case when queue_head < 0 or queue_head > 500 then queue_head END AS field_value ,
       day
from t2
where  queue_head < 0 or queue_head > 500
union all
select
       '6' data_type,
       device_no,
       create_time,
       'queue_tail'  field_name ,
       case when queue_tail < 0 or queue_tail > 500 then queue_tail END AS field_value ,
       day
from t2
where  queue_tail < 0 or queue_tail > 500
union all
select
       '6' data_type,
       device_no,
       create_time,
       'queue_count'  field_name ,
       case when queue_count < 0 or queue_count > 100 then queue_count END AS field_value ,
       day
from t2
where queue_count < 0 or queue_count > 100
;

(1)特点:每个字段union判断 笨方法

(三)查看表结果

花了一天时间终于搞定,又前进了一小步!加油啊,少年

2024年5月22日续写

(四)海豚调度任务T+1插入

#! /bin/bash
source /etc/profile

nowdate=`date --date='0 days ago' "+%Y%m%d"`
yesdate=`date -d yesterday +%Y-%m-%d`

hive -e "
use hurys_dc_dws;

set hive.vectorized.execution.enabled=false;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=2000;


with t3 as(

select
       device_no,
       case when device_no is null then CONCAT('device_no:','null')  END AS device_no_value,
       create_time,
       case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END AS lane_no_value,
       case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END AS queue_len_value,
       case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END AS queue_head_value,
       case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END AS queue_tail_value,
       case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END AS queue_count_value,
       concat_ws(',',
                case when device_no is null then CONCAT('device_no:','null') end ,
                case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END ,
                case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END,
                case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END,
                case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END,
                case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END
                ) AS kv_pairs  ,
       day
from (select
        t1.device_no,
        substr(create_time,1,19)                        create_time ,
        get_json_object(list_json,'$.laneNo')           lane_no,
        get_json_object(list_json,'$.queueLen')         queue_len,
        get_json_object(list_json,'$.queueHead')        queue_head,
        get_json_object(list_json,'$.queueTail')        queue_tail,
        get_json_object(list_json,'$.queueCount')       queue_count,
        date(t1.create_time)                            day
    from (
    select
       get_json_object(queue_json,'$.deviceNo')         device_no,
       get_json_object(queue_json,'$.createTime')       create_time,
       get_json_object(queue_json,'$.queueList')        queue_list
    from hurys_dc_ods.ods_queue
        where day= '$yesdate'
        ) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
                                                '\\\\[|\\\\]','') ,    
                                 '\\\\}\\\\,\\\\{','\\\\}\\\\;\\\\{'), 
                   '\\\\;')
          )list_queue as list_json
    )  as t2
where day = '$yesdate'
        )
insert  overwrite  table  hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
    '6' data_type,
    t3.device_no,
    create_time,
    split(pair, ':')[0] AS field_name,
    split(pair, ':')[1] AS field_value,
    day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
where device_no_value is not null or queue_len_value is not null or lane_no_value is not null
or queue_head_value is not null or queue_tail_value is not null or queue_count_value is not null
"

(五)海豚任务执行以及表数据验证

1、海豚任务执行

2、表数据验证

2024-05-21表分区的数据已经有啦!!!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

天地风雷水火山泽

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值