Hive解析任务-将json的多个属性拆分成多条记录

需求环境:

在hive表dwb.dwb_r_thrid_data中,data字段存放有json字符串

需要从json字符串中,解析到需要的字段:将一个json里面的属性data.loanInfo.mobile.timeScopes.D360、data.loanInfo.mobile.timeScopes.D90所包含的字段分别解析成一条记录,并且将D360、D90也作为字段timeScope的值解析到该条记录中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
{
    "code":"0",
    "data":{
        "loanInfo":{
            "mobile":{
                "timeScopes":{
                    "D360":{
                        "maxOverdueDays":-1,
                        "loanTenantCount":0,
                        "monthsFromFirstLoan":-1,
                        "averageLoanGapDays":-1,
                        "averageLoanAmount":0,
                        "averageTenantGapDays":-1,
                        "loanCount":0,
                        "maxLoanAmount":0,
                        "daysFromLastLoan":-1,
                        "overdueTenantCount":-1,
                        "queryCount":0,
                        "monthsFromLastOverdue":-1,
                        "maxLoanPeriodDays":0,
                        "remainingAmount":-1,
                        "monthsForNormalRepay":-1,
                        "overdueLoanCount":-1,
                        "overdueFor2TermTenantCount":-1
                    },
                    "D90":{
                        "maxOverdueDays":-1,
                        "loanTenantCount":0,
                        "averageLoanGapDays":-1,
                        "averageLoanAmount":0,
                        "averageTenantGapDays":-1,
                        "overdueLoanCount":-1,
                        "overdueFor2TermTenantCount":-1,
                        "loanCount":0,
                        "maxLoanAmount":0,
                        "overdueTenantCount":-1,
                        "queryCount":0,
                        "maxLoanPeriodDays":0
                    }
                }
            },
            "pid":{
                "timeScopes":{
                    "D360":{
                        "maxOverdueDays":-1,
                        "loanTenantCount":0,
                        "monthsFromFirstLoan":-1,
                        "averageLoanGapDays":-1,
                        "averageLoanAmount":0,
                        "averageTenantGapDays":-1,
                        "loanCount":0,
                        "maxLoanAmount":0,
                        "daysFromLastLoan":-1,
                        "overdueTenantCount":-1,
                        "queryCount":0,
                        "monthsFromLastOverdue":-1,
                        "maxLoanPeriodDays":0,
                        "remainingAmount":-1,
                        "monthsForNormalRepay":-1,
                        "overdueLoanCount":-1,
                        "overdueFor2TermTenantCount":-1
                    },
                    "D90":{
                        "maxOverdueDays":-1,
                        "loanTenantCount":0,
                        "averageLoanGapDays":-1,
                        "averageLoanAmount":0,
                        "averageTenantGapDays":-1,
                        "overdueLoanCount":-1,
                        "overdueFor2TermTenantCount":-1,
                        "loanCount":0,
                        "maxLoanAmount":0,
                        "overdueTenantCount":-1,
                        "queryCount":0,
                        "maxLoanPeriodDays":0
                    }
                }
            },
            "deviceId":{
                "timeScopes":{
                    "D360":{
                        "loanTenantCount":0,
                        "loanCount":0,
                        "queryCount":0
                    },
                    "D90":{
                        "loanTenantCount":0,
                        "loanCount":0,
                        "queryCount":0
                    }
                }
            }
        },
        "blacklist":{
            "mobile":{
                "lastConfirmAtDays":-1,
                "lastConfirmStatus":"",
                "blackLevel":"none",
                "last6MTenantCount":0,
                "last6MQueryCount":0,
                "last12MMaxConfirmStatus":""
            },
            "pid":{
                "lastConfirmAtDays":-1,
                "lastConfirmStatus":"",
                "blackLevel":"none",
                "last6MTenantCount":0,
                "last6MQueryCount":0,
                "last12MMaxConfirmStatus":""
            },
            "deviceId":{
                "lastConfirmAtDays":-1,
                "lastConfirmStatus":"",
                "blackLevel":"none",
                "last6MTenantCount":0,
                "last6MQueryCount":0,
                "last12MMaxConfirmStatus":""
            }
        }
    },
    "message":"请求成功"
}

表结构形如:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
create table if not exists dwb.dwb_r_morpho_loaninfo_mobile(
       apply_risk_id                        string comment  "风控ID",
       dp_data_id                           string comment  "dp_dataID",
       maxOverdueDays                       string,
       loanTenantCount                      string,
       monthsFromFirstLoan                  string,
       averageLoanGapDays                   string,
       averageLoanAmount                    string,
       averageTenantGapDays                 string,
       loanCount                            string,
       maxLoanAmount                        string,
       daysFromLastLoan                     string,
       overdueTenantCount                   string,
       queryCount                           string,
       monthsFromLastOverdue                string,
       maxLoanPeriodDays                    string,
       remainingAmount                      string,
       monthsForNormalRepay                 string,
       overdueLoanCount                     string,
       overdueFor2TermTenantCount           string,
       timeScope                            string comment  "时间期限",
       morpho_created_at                    string comment  "创建时间",
       etl_time                             string comment  "etl处理时间"
) comment 'moblie' 
 PARTITIONED BY (dt string  comment '分区日期')
 row format delimited fields terminated by '\001'
 NULL DEFINED AS ''
 stored as orc;

接下来就开始表演吧。

如果是json数组,可以很方便拆分

我们都知道对于一条json里面值为json数组的属性,hive可以将其获取到并且进行拆分成多条记录:

如以下infoquerybean属性:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
{
    "overduemoreamt":"0",
    "loancount":"0",
    "loanbal":"0",
    "outstandcount":"0",
    "queryatotalorg":"最近***********",
    "loanamt":"0",
    "overdueamt":"0",
    "generationcount":"0",
    "msgContent":"成功!",
    "generationamount":"0",
    "overduemorecount":"0",
    "totalorg":"*************",
    "infoquerybean":[
        {
            "s_value":"审批",
            "ddate":"2018-09-10",
            "ordernum":"1"
        },
        {
            "s_value":"审批",
            "ddate":"2018-09-06",
            "ordernum":"2"
        },
        {
            "s_value":"审批",
            "ddate":"2018-08-21",
            "ordernum":"3"
        },
        {
            "s_value":"审批",
            "ddate":"2018-08-09",
            "ordernum":"4"
        },
        {
            "s_value":"审批",
            "ddate":"2018-07-28",
            "ordernum":"5"
        },
        {
            "s_value":"审批",
            "ddate":"2018-07-27",
            "ordernum":"6"
        }
    ],
    "overduecount":"0",
    "msgCode":"200"
}

可以通过split拆分成结果,插入到形状如下的表中:

1
2
3
4
5
6
7
8
9
10
11
12
13
create table if not exists dwb.dwb_r_nifa_share_detail_n(
       apply_risk_id                        string comment  "风控ID",
       dp_data_id                           string comment  "dp_dataID",
       nifa_share_detail_ordernum string comment '序号',
       nifa_share_detail_ddate string comment '查询日期',
       nifa_share_detail_s_value string comment '查询原因',
       nifa_share_created_at                string comment  "创建时间",
       etl_time                             string comment  "etl处理时间"
) comment 'table test' 
 PARTITIONED BY (dt string  comment '分区日期')
 row format delimited fields terminated by '\001'
 NULL DEFINED AS ''
 stored as orc;

通过

1
explode(split(default.get_json_path(a.data,'infoquerybean'),'@\\|@'))

将其拆分成多条记录,完整sql见下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
dt=$1
hive<<!
set mapreduce.job.queuename=root.dw;
set hive.support.concurrency=false;

insert overwrite table dwb.dwb_r_nifa_share_detail_n partition(dt='$dt')
select 
      a.apply_risk_id,
      a.dp_data_id,
      nifa_share_detail_ordernum,
      nifa_share_detail_ddate,
      nifa_share_detail_s_value,
      from_unixtime(cast(a.timestamp/1000 as bigint),'yyyy-MM-dd HH:mm:ss') as nifa_share_created_at,
      current_timestamp() as etl_time
from (select dwb_r_thrid_data.apply_risk_id,dwb_r_thrid_data.dp_data_id,dwb_r_thrid_data.data,dwb_r_thrid_data.timestamp 
       from dwb.dwb_r_thrid_data   where  channel_name = 'nifa_prod' and interface_name = 'share' and get_json_object(data,'$.msgCode') = '200' and dwb_r_thrid_data.dt='$dt'
 ) a 
 lateral view explode(split(default.get_json_path(a.data,'infoquerybean'),'@\\\\|@')) b as infoquerybean
 lateral view default.json_tuple2(b.infoquerybean,'ordernum','ddate','ordernum') c as nifa_share_detail_ordernu,nifa_share_detail_ddate, nifa_share_detail_s_value
;
!

得到结果:

得到结果得到结果

顺着json数组思路,改造json样式

通过get_json_object()方法,得到两个json属性,通过concat拼接成json数组,就可以像上面那样拆分成多条记录。(测试阶段的样例都使用了设定分区dt,限制条数,因为这样测试起来很快,只需要三秒!!!!????)

1.通过get_json_object方法

1
select get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D360") d3,get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D90")  d9  from dwb.dwb_r_thrid_data td  where channel_name ='morpho' and interface_name ='query' and dt='20190218' limit 5

得到

2.拼接获取的D360和D90字段

1
select td.*,concat(regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D360"),'}',',"timeScope":"D360"}'),"|",regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D90"),'}',',"timeScope":"D90"}')) ts  from dwb.dwb_r_thrid_data td  where  channel_name ='morpho' and interface_name ='query' and dt='20190218' limit 5

拼接的字符串样式,通过”|”分隔两个对象

1
{"maxOverdueDays":-1,"monthsFromFirstLoan":-1,"loanTenantCount":0,"averageLoanGapDays":-1,"averageTenantGapDays":-1,"averageLoanAmount":0,"loanCount":0,"maxLoanAmount":0,"overdueTenantCount":-1,"daysFromLastLoan":-1,"queryCount":0,"monthsFromLastOverdue":-1,"maxLoanPeriodDays":0,"remainingAmount":-1,"monthsForNormalRepay":-1,"overdueLoanCount":-1,"overdueFor2TermTenantCount":-1,"timeScope":"D360"}|{"maxOverdueDays":-1,"loanTenantCount":0,"averageLoanGapDays":-1,"averageTenantGapDays":-1,"averageLoanAmount":0,"overdueLoanCount":-1,"overdueFor2TermTenantCount":-1,"loanCount":0,"overdueTenantCount":-1,"maxLoanAmount":0,"queryCount":0,"maxLoanPeriodDays":0,"timeScope":"D90"}

3. 最终通过分隔符进行切分

对于涉及到分隔符,转义字符的个数,请参考该文章数仓-解决hive处理异常json命令行转义字符的问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
select 
      a.apply_risk_id,
      a.dp_data_id,
      c.*,
      from_unixtime(cast(a.timestamp/1000 as bigint),'yyyy-MM-dd HH:mm:ss') as morpho_created_at,
      current_timestamp() as etl_time
from (select td.*,concat(regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D360"),'}',',"timeScope":"D360"}'),"|",regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D90"),'}',',"timeScope":"D90"}')) ts
           from dwb.dwb_r_thrid_data td
           where
             channel_name ='morpho' and interface_name ='query' and td.dt='20190218' limit 5
     ) a
lateral view explode(split(a.ts,'\\\\|')) b as list
lateral view default.json_tuple2(b.list,'maxOverdueDays','loanTenantCount','monthsFromFirstLoan','averageLoanGapDays','averageLoanAmount','averageTenantGapDays','loanCount','maxLoanAmount','daysFromLastLoan','overdueTenantCount','queryCount','monthsFromLastOverdue','maxLoanPeriodDays','remainingAmount','monthsForNormalRepay','overdueLoanCount','overdueFor2TermTenantCount','timeScope') 
c as maxOverdueDays,loanTenantCount,monthsFromFirstLoan,averageLoanGapDays,averageLoanAmount,averageTenantGapDays,loanCount,maxLoanAmount,daysFromLastLoan,overdueTenantCount,queryCount,monthsFromLastOverdue,maxLoanPeriodDays,remainingAmount,monthsForNormalRepay,overdueLoanCount,overdueFor2TermTenantCount,timeScope

搞定

得到结果得到结果

完整样例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
dt=$1

hive<<!
set mapreduce.job.queuename=root.dw;
set hive.support.concurrency=false;

insert overwrite table dwb.dwb_r_morpho_loaninfo_mobile partition(dt='$dt')
select 
      a.apply_risk_id,
      a.dp_data_id,
      c.*,
      from_unixtime(cast(a.timestamp/1000 as bigint),'yyyy-MM-dd HH:mm:ss') as morpho_created_at,
      current_timestamp() as etl_time
from (select td.*,concat(regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D360"),'}',',"timeScope":"D360"}'),"|",regexp_replace(get_json_object(td.data,"$.data.loanInfo.mobile.timeScopes.D90"),'}',',"timeScope":"D90"}')) ts
           from dwb.dwb_r_thrid_data td   where channel_name ='morpho' and interface_name ='query' 
     ) a
lateral view explode(split(a.ts,'\\\\|')) b as list
lateral view default.json_tuple2(b.list,'maxOverdueDays','loanTenantCount','monthsFromFirstLoan','averageLoanGapDays','averageLoanAmount','averageTenantGapDays','loanCount','maxLoanAmount','daysFromLastLoan','overdueTenantCount','queryCount','monthsFromLastOverdue','maxLoanPeriodDays','remainingAmount','monthsForNormalRepay','overdueLoanCount','overdueFor2TermTenantCount','timeScope') 
c as maxOverdueDays,loanTenantCount,monthsFromFirstLoan,averageLoanGapDays,averageLoanAmount,averageTenantGapDays,loanCount,maxLoanAmount,daysFromLastLoan,overdueTenantCount,queryCount,monthsFromLastOverdue,maxLoanPeriodDays,remainingAmount,monthsForNormalRepay,overdueLoanCount,overdueFor2TermTenantCount,timeScope
;
!

参考:HIVE: lateral view explode & json_turpe 实现 json数组行转列&字段拆分

原文:http://www.gangtieguo.cn/2019/03/12/Hive解析任务/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值