[hadoop@hadoop102 /opt/soft1]$ll /opt/module/hive/ |grep hivefunction-1.0-SNAPSHOT.jar
-rw-r–r--. 1 hadoop hadoop 5528 Feb 10 02:19 hivefunction-1.0-SNAPSHOT.jar
hive (gmall)> add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
Added [/opt/module/hive/hivefunction-1.0-SNAPSHOT.jar] to class path
Added resources: [/opt/module/hive/hivefunction-1.0-SNAPSHOT.jar]
4.2 DWD层事件表数据解析
4.2.1 创建基础明细表
明细表用于存储ODS层原始表转换过来的明细数据。
1)创建事件日志基础明细表
hive (gmall)>
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE dwd_base_event_log(
mid_id
string,
user_id
string,
version_code
string,
version_name
string,
lang
string,
source
string,
os
string,
area
string,
model
string,
brand
string,
sdk_version
string,
gmail
string,
height_width
string,
app_time
string,
network
string,
lng
string,
lat
string,
event_name
string,
event_json
string,
server_time
string)
PARTITIONED BY (dt
string)
stored as parquet
location ‘/warehouse/gmall/dwd/dwd_base_event_log/’;
2)说明:其中event_name和event_json用来对应事件名和整个事件。这个地方将原始日志1对多的形式拆分出来了。操作的时候我们需要将原始日志展平,需要用到UDF和UDTF。
4.2.2 自定义UDF函数(解析公共字段)
4.2.3 自定义UDTF函数(解析具体事件字段)
2)打包
3)将hivefunction-1.0-SNAPSHOT上传到hadoop102的/opt/module/hive/
4)将jar包添加到Hive的classpath
hive (gmall)> add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
5)创建临时函数与开发好的java class关联
hive (gmall)>
create temporary function base_analizer as ‘com.atguigu.udf.BaseFieldUDF’;
create temporary function flat_analizer as ‘com.atguigu.udtf.EventJsonUDTF’;
4.2.4 解析事件日志基础明细表
1)解析事件日志基础明细表
hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_base_event_log
PARTITION (dt=‘2019-02-10’)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time
from
(
select
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[0] as mid_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[1] as user_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[2] as version_code,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[3] as version_name,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[4] as lang,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[5] as source,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[6] as os,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[7] as area,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[8] as model,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[9] as brand,
split(bas