一、Introduction to the DWD Layer
1、Parse the user behavior data.
2、Filter out records whose core fields are null (see the sketch after this list).
3、Remodel the business data with a dimensional model, i.e., dimension degeneration (folding dimension attributes into the fact tables).
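As a concrete illustration of point 2, null filtering at the DWD layer is implemented as a plain WHERE predicate in the load statement. A minimal sketch against the ods_start_log table used below; treating mid as the "core" field here is an assumption for illustration:
-- keep only rows whose raw line and device id (mid) are present
select get_json_object(line,'$.mid') mid_id
from ods_start_log
where dt='2020-03-10'
and line is not null
and get_json_object(line,'$.mid') is not null;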
二、DWD Layer: User Behavior Data
2.1 User behavior start log table dwd_start_log
1、Data source
ods_start_log -> dwd_start_log
2、Creating the table
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`entry` string,
`open_ad_type` string,
`action` string,
`loading_time` string,
`detail` string,
`extend1` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_start_log/'
TBLPROPERTIES('parquet.compression'='lzo');
The data is stored in Parquet format and compressed with LZO. Because Parquet files are splittable on their own, there is no need to build an additional index on the data. Parquet storage not only compresses efficiently, it also queries fast.
3、Loading the data
insert overwrite table dwd_start_log
PARTITION (dt='2020-03-10')
select
get_json_object(line,'$.mid') mid_id,
get_json_object(line,'$.uid') user_id,
get_json_object(line,'$.vc') version_code,
get_json_object(line,'$.vn') version_name,
get_json_object(line,'$.l') lang,
get_json_object(line,'$.sr') source,
get_json_object(line,'$.os') os,
get_json_object(line,'$.ar') area,
get_json_object(line,'$.md') model,
get_json_object(line,'$.ba') brand,
get_json_object(line,'$.sv') sdk_version,
get_json_object(line,'$.g') gmail,
get_json_object(line,'$.hw') height_width,
get_json_object(line,'$.t') app_time,
get_json_object(line,'$.nw') network,
get_json_object(line,'$.ln') lng,
get_json_object(line,'$.la') lat,
get_json_object(line,'$.entry') entry,
get_json_object(line,'$.open_ad_type') open_ad_type,
get_json_object(line,'$.action') action,
get_json_object(line,'$.loading_time') loading_time,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.extend1') extend1
from ods_start_log where dt='2020-03-10';
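To sanity-check the load, query a couple of rows back from the new partition (a quick verification, not part of the pipeline itself):
select * from dwd_start_log where dt='2020-03-10' limit 2;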
4、DWD-layer start-log loading script ods_to_dwd_log.sh
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is supplied, use it; otherwise default to the day before today
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert overwrite table "$APP".dwd_start_log PARTITION (dt='$do_date')
select
get_json_object(line,'$.mid') mid_id,
get_json_object(line,'$.uid') user_id,
get_json_object(line,'$.vc') version_code,
get_json_object(line,'$.vn') version_name,
get_json_object(line,'$.l') lang,
get_json_object(line,'$.sr') source,
get_json_object(line,'$.os') os,
get_json_object(line,'$.ar') area,
get_json_object(line,'$.md') model,
get_json_object(line,'$.ba') brand,
get_json_object(line,'$.sv') sdk_version,
get_json_object(line,'$.g') gmail,
get_json_object(line,'$.hw') height_width,
get_json_object(line,'$.t') app_time,
get_json_object(line,'$.nw') network,
get_json_object(line,'$.ln') lng,
get_json_object(line,'$.la') lat,
get_json_object(line,'$.entry') entry,
get_json_object(line,'$.open_ad_type') open_ad_type,
get_json_object(line,'$.action') action,
get_json_object(line,'$.loading_time') loading_time,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.extend1') extend1
from "$APP".ods_start_log where dt='$do_date';"
$hive -e "$sql"
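To use the script, make it executable and pass it the target date (the date below is illustrative):
chmod +x ods_to_dwd_log.sh
./ods_to_dwd_log.sh 2020-03-11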
2.2 User behavior event log data
1、Data source and data splitting
2、Creating the base detail table dwd_base_event_log
The detail table stores the row-level detail data converted from the raw ODS table.
(1) Create the table
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE dwd_base_event_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string,
`event_json` string,
`server_time` string
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/'
TBLPROPERTIES('parquet.compression'='lzo');
Note: event_name and event_json hold the event name and the full event JSON, respectively. This is where the one-to-many structure of the raw log gets split apart. To flatten the raw log we need a custom UDF and a custom UDTF.
(2) Custom UDF (base_analizer)
A UDF is one row in, one row out.
A、Idea
B、Code
package com.atguigu.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
/**
 * @description: Custom UDF for extracting the common (cm) fields, the event array, and the server time
 * @author: hyr
 * @time: 2020/4/21 14:00
 */
public class BaseFieldUDF extends UDF {
public String evaluate(String line, String key) throws JSONException {
String[] log = line.split("\\|");
if (log.length != 2 || StringUtils.isBlank(log[1])){
return "";
}
JSONObject basejson = new JSONObject(log[1].trim());
String result = "";
// Return the server timestamp
if ("st".equals(key)){
result = log[0].trim();
}else if ("et".equals(key)){
// Return the event (et) array
if (basejson.has("et")){
result = basejson.getString("et");
}
}else {
JSONObject cm = basejson.getJSONObject("cm");
// Return the value of the requested common field from cm
if (cm.has(key)){
result = cm.getString(key);
}
}
return result;
}
/**
 * For local testing
 */
public static void main(String[] args) throws JSONException {
String line = "1583769785914|{\"cm\":{\"ln\":\"-79.5\",\"sv\":\"V2.3.1\",\"os\":\"8.2.5\",\"g\":\"1MF7Y4W4@gmail.com\",\"mid\":\"0\",\"nw\":\"WIFI\",\"l\":\"en\",\"vc\":\"14\",\"hw\":\"640*1136\",\"ar\":\"MX\",\"uid\":\"0\",\"t\":\"1583695967769\",\"la\":\"-45.9\",\"md\":\"HTC-13\",\"vn\":\"1.1.5\",\"ba\":\"HTC\",\"sr\":\"K\"},\"ap\":\"app\",\"et\":[{\"ett\":\"1583700386001\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"0\",\"news_staytime\":\"12\",\"loading_time\":\"15\",\"action\":\"2\",\"showtype\":\"1\",\"category\":\"16\",\"type1\":\"102\"}},{\"ett\":\"1583706290595\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1583702836945\",\"action\":\"3\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1583681747595\",\"en\":\"active_foreground\",\"kv\":{\"access\":\"1\",\"push_id\":\"3\"}},{\"ett\":\"1583725227310\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"1\"}},{\"ett\":\"1583743888737\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":0,\"addtime\":\"1583697745229\",\"praise_count\":901,\"other_id\":5,\"comment_id\":2,\"reply_count\":163,\"userid\":9,\"content\":\"诸帛咕死添共项饶伞锯产荔讯胆遇卖吱载舟沮稀蓟\"}}]}";
String mid = new BaseFieldUDF().evaluate(line, "et");
System.out.println(mid);
}
}
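Before the UDF can be referenced from HiveQL, package it into a jar, add the jar to the Hive session, and register a function named base_analizer. A minimal sketch, assuming the jar has already been built and uploaded; the jar path and file name are illustrative:
add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.atguigu.udf.BaseFieldUDF';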
(3) Custom UDTF (flat_analizer)
A UDTF is one row in, multiple rows out.
A、Idea
B、Code
package com.atguigu.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import java.util.ArrayList;
/**
 * @description: Custom UDTF for exploding the event array into one row per event
 * @author: hyr
 * @time: 2020/4/21 14:44
 */
public class EventJsonUDTF extends GenericUDTF {
// Declare the names and types of the output columns
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException{
ArrayList<String> fieldNames = new ArrayList<>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<>();
fieldNames.add("event_name");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("event_json");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}
// One input record in, zero or more output records out
@Override
public void process(Object[] objects) throws HiveException {
// Get the incoming et (event array) string
String input = objects[0].toString();
// If the input is blank, return without emitting anything, which filters the record out
if (StringUtils.isBlank(input)){
return;
}else {
// Parse the event array
try {
JSONArray ja = new JSONArray(input);
if (ja == null){
return;
}
// Iterate over each event
for (int i = 0; i < ja.length(); i++) {
String[] result = new String[2];
try {
// Extract the event name
result[0] = ja.getJSONObject(i).getString("en");
// Extract the whole event JSON
result[1] = ja.getString(i);
} catch (JSONException e) {
continue;
}
// Emit one output row: (event_name, event_json)
forward(result);
}
} catch (JSONException e) {
e.printStackTrace();
}
}
}
// Called when there are no more records to process; used for cleanup or emitting extra output
@Override
public void close() throws HiveException {
}
}
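With the jar from the previous step already added to the session, the UDTF is registered the same way (again a sketch; only the class name comes from the code above):
create temporary function flat_analizer as 'com.atguigu.udtf.EventJsonUDTF';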
(4) Parsing into the base event detail table
insert overwrite table dwd_base_event_log partition(dt='2020-03-10')
select
base_analizer(line,'mid') as mid_id,
base_analizer(line,'uid') as user_id,
base_analizer(line,'vc') as version_code,
base_analizer(line,'vn') as version_name,
base_analizer(line,'l') as lang,
base_analizer(line,'sr') as source,
base_analizer(line,'os') as os,
base_analizer(line,'ar') as area,
base_analizer(line,'md') as model,
base_analizer(line,'ba') as brand,
base_analizer(line,'sv') as sdk_version,
base_analizer(line,'g') as gmail,
base_analizer(line,'hw') as height_width,
base_analizer(line,'t') as app_time,
base_analizer(line,'nw') as network,
base_analizer(line,'ln') as lng,
base_analizer(line,'la') as lat,
event_name,
event_json,
base_analizer(line,'st') as server_time
from ods_event_log lateral view flat_analizer(base_analizer(line,'et')) tmp_flat as event_name,event_json
where dt='2020-03-10' and base_analizer(line,'et')<>'';
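As with the start log, a quick verification query against the freshly written partition:
select * from dwd_base_event_log where dt='2020-03-10' limit 2;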
(5) Event log parsing script ods_to_dwd_base_log.sh
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is supplied, use it; otherwise default to the day before today
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
INSERT overwrite table "$APP".dwd_base_event_log partition(dt='$do_date')
select
base_analizer(line,'mid') as mid_id,
base_analizer(line,'uid') as user_id,
base_analizer(line,'vc') as version_code,
base_analizer(line,'vn') as version_name,
base_analizer(line,'l') as lang,
base_analizer(line,'sr') as source,
base_analizer(line,'os') as os,
base_analizer(line,'ar') as area,
base_analizer(line,'md') as model,
base_analizer(line,'ba') as brand,
base_analizer(line,'sv') as sdk_version,
base_analizer(line,'g') as gmail,
base_analizer(line,'hw') as height_width,
base_analizer(line,'t') as app_time,
base_analizer(line,'nw') as network,
base_analizer(line,'ln') as lng,
base_analizer(line,'la') as lat,
event_name,
event_json,
base_analizer(line,'st') as server_time
from
"$APP".ods_event_log lateral view flat_analizer(base_analizer(line,'et')) tmp_flat as event_name,event_json
where dt='$do_date' and base_analizer(line,'et')<>'';"
$hive -e "$sql"