一:数仓搭建
1:完备性。
要保证所需要的数据全部到达数仓。
2:准备性。
etl,和数据的计算校验,确保输出的数据准确。
3:一致性。
确保输出端口一致,防止输出数据不准。
4:时效性。
每天的定时调度。
5:规范性。
表名,字段名要进行规范化处理。
6:稳定性。
确保数仓稳定。
二:数仓校验
1:建表语句
-- 建表
--drop table if exists dm.dim_dk_vehicle_info_dqc;
create table if not exists dim.xxxx
( chk_diff string comment '僵尸表校验'
,duplicate_pk_cnt string comment '重复主键数'
,null_cnt string comment '非空数量'
,false_value_cnt string comment '内容判断'
,active_num string comment '同一车辆激活时间俩天有不同的数量'
,produce_num string comment '同一车辆生产时间俩天有不同的数量'
) comment 'aaaa'
partitioned by (
p_dt string comment '分区日期(yyyy-MM-dd)')
row format delimited
fields terminated by '\t'
collection items terminated by '\n'
stored as orc
;
-- 建表
--drop table if exists ibd_dw_dm.dws_data_monitor_dim_dk_vehicle;
create table ibd_dw_dm.dim_dk_vehicle_info_dqc
(p_dt varchar(20) comment '时间'
,chk_diff bigint comment '僵尸表校验'
,duplicate_pk_cnt bigint comment '重复主键数'
,null_cnt bigint comment'非空数量'
,false_value_cnt bigint comment '内容判断'
,active_num bigint comment '同一车辆激活时间俩天有不同的数量'
,produce_num bigint comment '同一车辆生产时间俩天有不同的数量'
,PRIMARY KEY (p_dt)
) comment '监控dim层车辆信息表'
;
2。sql
--当天
drop table if exists temp_db.temp_data_monitor_table_info_day_02;
create table temp_db.temp_data_monitor_table_info_day_02
as
select *
from dim.aaaa
where p_dt = '${dealDate}'
;
-- 前一天
drop table if exists temp_db.temp_data_monitor_table_info_day_02_yesterday;
create table temp_db.temp_data_monitor_table_info_day_02_yesterday
as
select *
from dim.aaaa
where p_dt = date_sub('${dealDate}', 1)
;
-- 主键
with tmp_pk
as
(
select '${dealDate}' as p_dt
, count(1) as duplicate_pk_cnt -- 重复主键数
from
(
select vehicle_id
, ble_id
, manufacturer_id
, manufacturer_name
,model_brand
,produce_time
, count(1) as cnt
from temp_db.temp_data_monitor_table_info_day_02
group by vehicle_id
, ble_id
, manufacturer_id
, manufacturer_name
,model_brand
,produce_time
having cnt > 1
) t
)
,
-- 关键字段非空
tmp_null
as
(
select '${dealDate}' as p_dt
, count(1) - count(concat(vehicle_id,ble_id,produce_time,manufacturer_id))
as null_cnt -- 非空数量
from temp_db.temp_data_monitor_table_info_day_02
)
,
-- 数据量和僵尸表--数据条数是否多于昨天
tmp_cnt
as
(
select '${dealDate}' as p_dt
, count(a.vehicle_id) as last_cnt -- 前一天数据量
, count(b.vehicle_id) as cur_cnt -- 当天数据量
, case when
count(case when a.vehicle_id is null then b.vehicle_id end) > 0
or count(case when b.vehicle_id is null then a.vehicle_id end) > 0
then 0
else 1
end as chk_diff -- 僵尸表校验
from temp_db.temp_data_monitor_table_info_day_02_yesterday a -- 前一天
full join temp_db.temp_data_monitor_table_info_day_02 b -- 当天
on a.vehicle_id = b.vehicle_id and a.ble_id = b.ble_id and a.manufacturer_id = b.manufacturer_id and a.manufacturer_name = b.manufacturer_name and a.model_brand = b.model_brand and a.produce_time = b.produce_time
)
,
-- 字段值--车厂id和name是否匹配
tmp_value
as
(
select '${dealDate}' as p_dt
,'dim_dk_vehicle_info' as table_name
,sum(case
when a.manufacturer_name =name then 0
else 1 end)
as false_value_cnt
from temp_db.temp_data_monitor_table_info_day_02 as a
left join dim.dim_dk_common_manufacturer_info as b
on a.manufacturer_id=b.id
)
,
-- 对激活日期进行校验
tmp_active
as
(
select
'${dealDate}' as p_dt
,count(*) as active_num
from temp_db.temp_data_monitor_table_info_day_02_yesterday as a
left join temp_db.temp_data_monitor_table_info_day_02 as b
on a.vehicle_id=b.vehicle_id
where a.first_active_time <> b.first_active_time
)
,
-- 对生产日期进行校验
tmp_produce
as
(
select
'${dealDate}' as p_dt
,count(*) as produce_num
from temp_db.temp_data_monitor_table_info_day_02_yesterday as a
left join temp_db.temp_data_monitor_table_info_day_02 as b
on a.vehicle_id=b.vehicle_id
where a.produce_time <> b.produce_time
)
insert overwrite table dim.bbbb partition(p_dt='${dealDate}')
select
case when a.chk_diff = 0 then 1 else 0 end as chk_diff -- 数据条数是否多于昨天
,case when b.duplicate_pk_cnt=0 then 1 else 0 end as duplicate_pk_cnt -- 重复主键数
,case when c.null_cnt-3 = 0 then 1 else 0 end as null_cnt -- 非空数量
,case when d.false_value_cnt = 0 then 1 else 0 end as false_value_cnt --车厂id和name是否匹配
,case when e.active_num = 0 then 1 else 0 end as active_num --同一车辆激活时间俩天有不同的数量
,case when f.produce_num = 0 then 1 else 0 end as produce_num --同一车辆生产时间俩天有不同的数量
from tmp_cnt a
left join tmp_pk b
on a.p_dt = b.p_dt
left join tmp_null c
on a.p_dt = c.p_dt
left join tmp_value d
on a.p_dt = d.p_dt
left join tmp_active e
on a.p_dt = e.p_dt
left join tmp_produce f
on a.p_dt = f.p_dt
;
三:数据支持。
给提需求方做报表展示。