要处理的字段样式:BaseFieldUDF.py
line_list = ['1601837183327|{"cm":{"ln":"-69.1","sv":"V2.0.8","os":"8.1.4","g":"K7T9H5GM@gmail.com","mid":"1","nw":"4G","l":"es","vc":"19","hw":"640*960","ar":"MX","uid":"1","t":"1601825265783","la":"23.6","md":"Huawei-9","vn":"1.2.1","ba":"Huawei","sr":"T"},"ap":"app","et":[{"ett":"1601833164855","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"4","category":"48"}},{"ett":"1601776175979","en":"notification","kv":{"ap_time":"1601750646753","action":"1","type":"1","content":""}},{"ett":"1601783920469","en":"active_foreground","kv":{"access":"","push_id":"2"}},{"ett":"1601756375929","en":"comment","kv":{"p_comment_id":1,"addtime":"1601795584064","praise_count":66,"other_id":5,"comment_id":9,"reply_count":129,"userid":2,"content":"韵凉"}},{"ett":"1601763155223","en":"praise","kv":{"target_id":3,"id":9,"type":3,"add_time":"1601773879208","userid":4}}]} 2020-10-05']
自定义函数:
# -*- coding:utf8 -*-
import sys
import json
line_list = ['1601837183327|{"cm":{"ln":"-69.1","sv":"V2.0.8","os":"8.1.4","g":"K7T9H5GM@gmail.com","mid":"1","nw":"4G","l":"es","vc":"19","hw":"640*960","ar":"MX","uid":"1","t":"1601825265783","la":"23.6","md":"Huawei-9","vn":"1.2.1","ba":"Huawei","sr":"T"},"ap":"app","et":[{"ett":"1601833164855","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"4","category":"48"}},{"ett":"1601776175979","en":"notification","kv":{"ap_time":"1601750646753","action":"1","type":"1","content":""}},{"ett":"1601783920469","en":"active_foreground","kv":{"access":"","push_id":"2"}},{"ett":"1601756375929","en":"comment","kv":{"p_comment_id":1,"addtime":"1601795584064","praise_count":66,"other_id":5,"comment_id":9,"reply_count":129,"userid":2,"content":"韵凉"}},{"ett":"1601763155223","en":"praise","kv":{"target_id":3,"id":9,"type":3,"add_time":"1601773879208","userid":4}}]} 2020-10-05']
def get_fields():
public_fileds = "mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t"
pf_list = public_fileds.split(",")
return pf_list
for line in line_list:
#for line in sys.stdin:
public_val = []
detail_str = line.strip()
detail_list = detail_str.split("|")
# 校验数据完整性
if len(detail_list) != 2 and detail_list[0] is None:
pass
# 解析公共字段
try:
# print(detail_list[1])
json_str = json.loads(detail_list[1].split(" ")[0])
if "cm" in json_str:
pf_json = json_str["cm"]
pf_list = get_fields()
for field in pf_list:
if field in pf_json:
public_val.append(pf_json[field])
# public_val.append("\t")
else:
public_val.append("\t")
# 解析服务器时间和时间字段
server_time = detail_list[0]
if "ap" in json_str:
ap_json = json_str["ap"]
public_val.append(str(ap_json))
# public_val.append("\t")
else:
public_val.append("\t")
if "et" in json_str:
event_json = json_str["et"]
public_val.append(str(event_json))
# public_val.append("\t")
else:
public_val.append("\t")
except Exception as e:
print(e)
print("\t".join(public_val))
public_val.clear()
"""此处有个坑,在jion的时候,需要join的字符必须时str,不可以为int,list等别的数据类型"""
上传BaseFieldUDF.py 至服务器
测试代码:解析成功
hive中执行: 出现如下,表示解析成功
add file /opt/module/script/BaseFieldUDF.py;
执行sql:
select transform(e) USING 'python3 BaseFieldUDF.py' AS (mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t,ap,et) from ods_event_log where dt='2020-10-05' limit 1;