背景:
表tmp.biz中只有id , biz_info 两个字段,目的是解析出字段biz_info里,key值对应的value值数据,以供需求方使用。
原始要清洗解析的数据 , 如下:
id='44788'时
biz_info字段原样:
[
{
"visitHosCode":"H107201",
"visitFirstDeptName":"内科",
"visitSecondDeptCode":"4010504",
"visitSecondDeptName":"肝胆胰中医特色门诊",
"priority":50,
"visitTimeText":"周二上午"
},
{
"visitHosCode":"H107201",
"visitFirstDeptName":"内科",
"visitSecondDeptCode":"4010501",
"visitSecondDeptName":"脾胃病科一区门诊",
"priority":60,
"visitTimeText":"周三上午"
},
{
"visitHosCode":"H107201",
"visitFirstDeptName":"感染科",
"visitSecondDeptCode":"4140001",
"visitSecondDeptName":"肝炎门诊",
"priority":50,
"visitTimeText":""
},
{
"visitHosCode":"H107201",
"visitFirstDeptName":"国际部",
"visitSecondDeptCode":"G401501",
"visitSecondDeptName":"国际部脾胃病科一区",
"priority":5,
"visitTimeText":"周二下午"
}
]
方法一:自我总结的方法
原来1行数据,现在变为多行【因为这种方式有几个花括号,就变为几行 ; " 行数=花括号数,列数=1个花括号中key的个数 "】;
有使用lateral view explode函数;
select
id
,get_json_object(t3,'$.visitHosCode') as visitHosCode
,get_json_object(t3,'$.visitFirstDeptName') as visitFirstDeptName
,get_json_object(t3,'$.visitSecondDeptCode') as visitSecondDeptCode
,get_json_object(t3,'$.visitSecondDeptName') as visitSecondDeptName
,get_json_object(t3,'$.priority') as priority
,get_json_object(t3,'$.visitTimeText') as visitTimeText
from
(
select
id,
if(item not like'%}%',concat(item,'}'),item) t3 --调整成正常的json串【因为item字段会有{ 或 ,{ 样式的数据】
from
(
select
id,biz_info
,regexp_replace(biz_info,'\\]','')t1 --去除左括号
,regexp_replace(regexp_replace(biz_info,'\\]',''),'\\[','')t2 --去除右括号
from tmp.biz
)a lateral view explode(split(t2,'},'))t as item
)b
;
查询出的结果值:
id visithoscode visitfirstdeptname visitseconddeptcode visitseconddeptname priority visittimetext
44788 H107201 内科 4010504 肝胆胰中医特色门诊 50 周二上午
44788 H107201 内科 4010501 脾胃病科一区门诊 60 周三上午
44788 H107201 感染科 4140001 肝炎门诊 50
44788 H107201 国际部 G401501 国际部脾胃病科一区 5 周二下午
方法二:不借助于自定义函数实现
原来1行数据,现在还是为1行【因为没有使用lateral view explode函数】;
需写多个get_json_object()函数;
select
id,
get_json_object(biz_info, "$.[0].visitHosCode") as visitHosCode_1,
get_json_object(biz_info, "$.[0].visitFirstDeptName") as visitFirstDeptName_1,
get_json_object(biz_info, "$.[0].visitSecondDeptCode") as visitSecondDeptCode_1,
get_json_object(biz_info, "$.[0].visitSecondDeptName") as visitSecondDeptName_1,
get_json_object(biz_info, "$.[0].priority") as priority_1,
get_json_object(biz_info, "$.[0].visitTimeText") as visitTimeText_1,
get_json_object(biz_info, "$.[1].visitHosCode") as visitHosCode_2,
get_json_object(biz_info, "$.[1].visitFirstDeptName") as visitFirstDeptName_2,
get_json_object(biz_info, "$.[1].visitSecondDeptCode") as visitSecondDeptCode_2,
get_json_object(biz_info, "$.[1].visitSecondDeptName") as visitSecondDeptName_2,
get_json_object(biz_info, "$.[1].priority") as priority_2,
get_json_object(biz_info, "$.[1].visitTimeText") as visitTimeText_2,
get_json_object(biz_info, "$.[2].visitHosCode") as visitHosCode_3,
get_json_object(biz_info, "$.[2].visitFirstDeptName") as visitFirstDeptName_3,
get_json_object(biz_info, "$.[2].visitSecondDeptCode") as visitSecondDeptCode_3,
get_json_object(biz_info, "$.[2].visitSecondDeptName") as visitSecondDeptName_3,
get_json_object(biz_info, "$.[2].priority") as priority_3,
get_json_object(biz_info, "$.[2].visitTimeText") as visitTimeText_3,
get_json_object(biz_info, "$.[3].visitHosCode") as visitHosCode_4,
get_json_object(biz_info, "$.[3].visitFirstDeptName") as visitFirstDeptName_4,
get_json_object(biz_info, "$.[3].visitSecondDeptCode") as visitSecondDeptCode_4,
get_json_object(biz_info, "$.[3].visitSecondDeptName") as visitSecondDeptName_4,
get_json_object(biz_info, "$.[3].priority") as priority_4,
get_json_object(biz_info, "$.[3].visitTimeText") as visitTimeText_4
from tmp.biz
查询出的结果值:
id visithoscode visitfirstdeptname visitseconddeptcode visitseconddeptname priority visittimetext
44788 H107201 内科 4010504 肝胆胰中医特色门诊 50 周二上午
44788 H107201 内科 4010501 脾胃病科一区门诊 60 周三上午
44788 H107201 感染科 4140001 肝炎门诊 50
44788 H107201 国际部 G401501 国际部脾胃病科一区 5 周二下午
缺点:需要写多个get_json_object()函数
方法三:借助于自定义函数实现
原来1行数据,现在变为多行【因为这种方式有几个花括号,就变为几行 ; " 行数=花括号数,列数=1个花括号中key的个数 "】;
有使用lateral view explode 与 lateral view json_tuple函数; hue查询工具中不能运行 , 需要再Airflow调度中才能运行;
CREATE TABLE if not exists `tmp.biz_result`(
`id` string,
`visithoscode` string,
`visitfirstdeptname` string,
`visitseconddeptcode` string,
`visitseconddeptname` string,
`priority` string,
`visittimetext` string
)COMMENT '解析后的结果表'
stored as parquet
location '/hive/xxx/tmp.db/biz_result'
TBLPROPERTIES ('parquet.compression'='SNAPPY');
--需要手写自定UDF , 之后调用自定义的UDF函数
add jar /home/...路径/udf/tmp/json_array.jar;
create temporary function json_array as 'com.benmu.udf.Json_array';
insert overwrite table tmp.biz_result
select
id,visitHosCode,visitFirstDeptName,visitSecondDeptCode,visitSecondDeptName,priority,visitTimeText
from
(
select
id,biz_info
from
tmp.biz
)a
lateral view explode(json_array(biz_info)) t as item
lateral view json_tuple(item,'visitHosCode','visitFirstDeptName','visitSecondDeptCode','visitSecondDeptName','priority','visitTimeText')t1 as visitHosCode,visitFirstDeptName,visitSecondDeptCode,visitSecondDeptName,priority,visitTimeText
;
查询出的结果值:
hive> select * from tmp.biz_result;
id visithoscode visitfirstdeptname visitseconddeptcode visitseconddeptname priority visittimetext
44788 H107201 内科 4010504 肝胆胰中医特色门诊 50 周二上午
44788 H107201 内科 4010501 脾胃病科一区门诊 60 周三上午
44788 H107201 感染科 4140001 肝炎门诊 50
44788 H107201 国际部 G401501 国际部脾胃病科一区 5 周二下午