1、准备要处理数据的临时表
drop table if exists xxlvc0103_json;
create table xxlvc0103_json as
select pt_day,uid,parms['value'] json_value
from oss_bi_all_api_play_feedback_active
where pt_day between '2019-01-01' and '2019-01-03'
and parms['type']=7;
2、基础数据查看
hive> select * from xxlvc0103_json where uid=1414554412 limit 10;
OK
2019-01-01 1414554412 [{"enterType":1,"roomId":"87016658","time":1878903}]
2019-01-01 1414554412 [{"enterType":1,"roomId":"87016658","time":539579},{"enterType":3,"roomId":"60784259","time":5322},{"enterType":2,"roomId":"88028786","time":6923}]
2019-01-03 1414554412 [{"enterType":1,"roomId":"87016658","time":65044}]
2019-01-02 1414554412 [{"enterType":1,"roomId":"87016658","time":1638660}]
2019-01-01 1414554412 [{"enterType":1,"roomId":"87016658","time":1532060}]
2019-01-01 1414554412 [{"enterType":1,"roomId":"87016658","time":2033}]
2019-01-03 1414554412 [{"enterType":1,"roomId":"87016658","time":385518}]
2019-01-02 1414554412 [{"enterType":1,"roomId":"87016658","time":37454}]
2019-01-02 1414554412 [{"enterType":1,"roomId":"87016658","time":51099}]
2019-01-03 1414554412 [{"enterType":1,"roomId":"87016658","time":191371},{"enterType":3,"roomId":"87823728","time":1908}]
Time taken: 0.09 seconds, Fetched: 10 row(s)
3、array嵌套map数据的处理
先将数据进行有目的的替换,以满足hive中array、map复杂数据类型的要求;
然后利用LATERAL VIEW功能将list分开;
最后进行str_to_map操作,转换成我们想要的结构。
drop table if exists xxlvc0103_json_map;
create table xxlvc0103_json_map as
select pt_day,uid,str_to_map(list_col) value_map
from xxlvc0103_json
LATERAL VIEW explode(split(regexp_replace(regexp_replace(json_value,'\\},\\{','&'),'\"|\\[|\\]|\\{|\\}',''),'&')) list_str as list_col;
4、转换后的数据查看
hive> select * from xxlvc0103_json_map limit 10;
OK
2019-01-01 458410394 {"enterType":"1","roomId":"3449045","time":"3676"}
2019-01-01 1418279372 {"enterType":"1","roomId":"26216121","time":"14718"}
2019-01-01 138687495 {"enterType":"1","roomId":"87823728","time":"2614"}
2019-01-01 138687495 {"enterType":"1","roomId":"26216121","time":"2341"}
2019-01-01 138687495 {"enterType":"1","roomId":"87958988","time":"2812"}
2019-01-01 138687495 {"enterType":"1","roomId":"23882827","time":"3563"}
2019-01-01 1379155169 {"enterType":"1","roomId":"82416066","time":"26776"}
2019-01-01 803130862 {"enterType":"1","roomId":"3449045","time":"74915"}
2019-01-01 1418279372 {"enterType":"1","roomId":"82416066","time":"14964"}
2019-01-01 1418279372 {"enterType":"1","roomId":"87066404","time":"21867"}
Time taken: 0.071 seconds, Fetched: 10 row(s)
5、根据需求进行的数据汇总处理
select pt_day,value_map['enterType'] enterType,bigint(sum(value_map['time'])) level_time,count(distinct uid) uid_cnt,count(*) cnt
from xxlvc0103_json_map
group by pt_day,value_map['enterType'];
OK
2019-01-01 1 291795705016 24776 78926
2019-01-01 2 113509309 1746 5412
2019-01-01 3 128406659 1496 3832
2019-01-02 1 9512144176 21529 73883
2019-01-02 2 151180360 1847 5837
2019-01-02 3 225749773 1580 4503
2019-01-03 1 294769932990 25669 85037
2019-01-03 2 120973438 2025 6184
2019-01-03 3 308381650 1744 4990
Time taken: 29.418 seconds, Fetched: 9 row(s)
6、总结说明
问题总会有解决方法的,关键要有耐心、不焦躁。