Step-by-Step Guide to Building a Layered Data Warehouse
This document is the hands-on workflow I compiled while learning data warehouse construction; beginners may find it useful as a reference.
I. Data Formats
1. Basic format of the start-up log:
{
"action":"1",
"ar":"MX",
"ba":"HTC",
"detail":"",
"en":"start",
"entry":"2",}
2. Basic format of the event log (tracking/instrumentation data):
1540934156385|{
"ap": "gmall",
"cm": {
"uid": "1234",
"vc": "2",
"vn": "1.0",
"la": "EN",
"sr": "",
"os": "7.1.1",
"ar": "CN",
"md": "BBB100-1",
"ba": "blackberry",
"sv": "V2.2.1",
"g": "abc@gmail.com",
"hw": "1620x1080",
"t": "1506047606608",
"nw": "WIFI",
"ln": 0
},
"et": [
{
display "ett": "1506047605364", //客户端事件产生时间
"en": "display", //事件名称
"kv": { //事件结果,以key-value形式自行定义
"goodsid": "236",
"action": "1",
"extend1": "1",
"place": "2",
"category": "75"
}
},{
"ett": "1552352626835",
"en": "active_background",
"kv": {
"active_source": "1"
}
}
]
}
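Each raw event line is therefore a server timestamp and a JSON payload joined by a '|'. As a minimal shell sketch (the sample line is shortened from the format above), this is the same split that the UDF in section VII performs:
line='1540934156385|{"ap":"gmall","cm":{"uid":"1234"},"et":[]}'
server_time=${line%%|*}   # part before the first '|'
json=${line#*|}           # part after the first '|'
echo "server_time = $server_time"
echo "json = $json"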
II. Building the ODS Layer
1. Create the start-up log table:
-- Drop the table first if it already exists
drop table if exists ods_start_log;
-- Create an external table; the single column holds the raw JSON line as a string
create external table ods_start_log (`line` string)
-- Partition by date
partitioned by (`dt` string)
-- Handle LZO compression (LZO --> text)
stored as
inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
-- Set the storage location
location '/warehouse/ceshi/ods/ods_start_log';
2. Create the event log table:
drop table if exists ods_event_log;
create external table ods_event_log (`line` string)
partitioned by (`dt` string)
stored as
inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/ceshi/ods/ods_event_log';
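If the .lzo files loaded into these tables exceed one HDFS block, they are only splittable once an LZO index is built next to them. A hedged sketch (the hadoop-lzo jar path and version are assumptions; adjust them to your installation):
hadoop jar /opt/module/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar \
  com.hadoop.compression.lzo.DistributedLzoIndexer \
  /warehouse/ceshi/ods/ods_start_log/dt=2020-05-20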
III. ODS Layer Data Loading Script
#!/bin/bash
APP=ceshi
hive=/opt/module/hive-1.2.1/bin/hive
# If a date is passed in, use it; otherwise default to yesterday
# [ -n "$var" ] tests whether the variable is non-empty
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
load data inpath 'hdfs数据路径/$do_date' into table "$APP".ods_start_log partition(dt='$do_date');
load data inpath 'hdfs数据路径/$do_date' into table "$APP".ods_event_log partition(dt='$do_date');
"$hive -e "$sql"
Grant execute permission on the script:
chmod 777 ods_log.sh
Run the script, passing the log date as the first argument:
./ods_log.sh 2020-05-20
Check the imported data:
select * from ods_start_log where dt='2020-05-20';
select * from ods_event_log where dt='2020-05-20';
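A per-partition row count is a quick way to confirm the load actually landed data (sketch, run from the shell):
hive -e "
select count(*) from ceshi.ods_start_log where dt='2020-05-20';
select count(*) from ceshi.ods_event_log where dt='2020-05-20';
"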
IV. Building the DWD Layer
DWD-layer start-up log table:
drop table if exists dwd_start_log;
create external table dwd_start_log (
`action` string,
`ar` string,
`ba` string,
`detail` string,
`en` string,
`entry` string
)
partitioned by(`dt` string)
location '/warehouse/ceshi/dwd/dwd_start_log/';
V. DWD-Layer Start-up Table Loading Script
#!/bin/bash
APP=ceshi # target database
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
insert overwrite table "$APP".dwd_start_log partition (dt='$do_date')
select
get_json_object(line,'$.action') action,
get_json_object(line,'$.ar') ar,
get_json_object(line,'$.ba') ba,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.en') en,
get_json_object(line,'$.entry') entry
from "$APP".ods_start_log where dt='$do_date';"
$hive -e "$sql"
Grant execute permission as above.
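After running the script, spot-check the parsed fields (sketch):
hive -e "select action, ar, ba, detail, en, entry from ceshi.dwd_start_log where dt='2020-05-20' limit 10;"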
VI. DWD-Layer Event Log Base Table
drop table if exists dwd_base_event_log;
create external table dwd_base_event_log (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string, -- event name
`event_json` string, -- the full event JSON object
`server_time` string -- server timestamp
)
partitioned by (`dt` string)
stored as parquet
location '/warehouse/ceshi/dwd/dwd_base_event_log/';
VII. Custom UDF (parses the common cm fields) and Custom UDTF (parses the individual event fields)
Create a Maven project: hive-function.
Create the packages com.study.udf and com.study.udtf.
Add the following to pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.study</groupId>
<artifactId>hivefunction</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hive.version>1.2.1</hive.version>
</properties>
<dependencies>
<!-- Hive dependency -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
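Package the project and copy the jar to where Hive can load it (sketch; the copy target matches the add jar path used by the script in section VIII). The plain jar is usually enough since hive-exec is already on Hive's classpath; if the org.json classes turn out to be missing at runtime, ship the jar-with-dependencies assembly configured above instead:
mvn clean package -DskipTests
cp target/hivefunction-1.0-SNAPSHOT.jar /opt/module/hive-1.2.1/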
1. In package com.study.udf, create class BaseFieldUDF to parse the common fields:
package com.study.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
public class BaseFieldUDF extends UDF {
public String evaluate(String line, String jsonKeysString) {
StringBuilder sb = new StringBuilder();
// Split jsonKeysString into individual keys
String[] keys = jsonKeysString.split(",");
// Split the line into server time and JSON payload
String[] logContents = line.split("\\|");
// Validity check: expect exactly two parts and a non-blank JSON payload
if (logContents.length != 2 || StringUtils.isBlank(logContents[1])) {
return "";
}
// Parse the JSON payload
try {
JSONObject jsonObject = new JSONObject(logContents[1]);
// Get the common fields under "cm"
JSONObject base = jsonObject.getJSONObject("cm");
// Extract each requested key in order, tab-separated
for (String key : keys) {
String fieldName = key.trim();
if (base.has(fieldName)) {
sb.append(base.getString(fieldName)).append("\t");
} else {
sb.append("\t");
}
}
// Append the event array ("et") as a string
sb.append(jsonObject.getString("et")).append("\t");
// Append the server time
sb.append(logContents[0]).append("\t");
} catch (JSONException e) {
e.printStackTrace();
}
return sb.toString();
}
}
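Once the jar is built, the UDF can be smoke-tested directly from the shell (sketch; the expected output is the requested cm values, the et array, and the server time, all tab-separated):
hive -e "
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.study.udf.BaseFieldUDF';
select base_analizer('1540934156385|{\"cm\":{\"uid\":\"1234\",\"os\":\"7.1.1\"},\"et\":[]}', 'uid,os,vc');
"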
2. In package com.study.udtf, create class EventJsonUDTF to parse the individual event fields:
package com.study.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import java.util.ArrayList;
public class EventJsonUDTF extends GenericUDTF {
// Declare the names and types of the output columns
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<>();
fieldNames.add("event_name");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("event_json");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}
// Consume one input record and emit zero or more rows
@Override
public void process(Object[] objects) throws HiveException {
// The first argument is the "et" JSON-array string
String input = objects[0].toString();
// Drop the record if the input is blank
if (StringUtils.isBlank(input)) {
return;
} else {
try {
// Parse the array to see how many events it contains
JSONArray ja = new JSONArray(input);
if (ja == null) {
return;
}
// Iterate over the events
for (int i = 0; i < ja.length(); i++) {
String[] result = new String[2];
try {
// Event name
result[0] = ja.getJSONObject(i).getString("en");
// The whole event object as a JSON string
result[1] = ja.getString(i);
} catch (JSONException e) {
continue;
}
// Emit one row per event
forward(result);
}
} catch (JSONException e) {
e.printStackTrace();
}
}
}
// Called once there are no more records; used for cleanup or extra output
@Override
public void close() throws HiveException {
}
}
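The UDTF can also be exercised on its own before wiring it into the full query (sketch); each element of the array comes back as one (event_name, event_json) row:
hive -e "
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function flat_analizer as 'com.study.udtf.EventJsonUDTF';
select flat_analizer('[{\"en\":\"display\",\"kv\":{\"goodsid\":\"236\"}},{\"en\":\"active_background\",\"kv\":{\"active_source\":\"1\"}}]');
"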
VIII. DWD-Layer Data Parsing Script
#!/bin/bash
APP=ceshi
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.study.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.study.udtf.EventJsonUDTF';
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dwd_base_event_log
partition (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time
from(
select
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time
from "$APP".ods_event_log where dt='$do_date' and base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"
$hive -e "$sql"
Grant execute permission as above.
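After the script runs, verify that the events were exploded into one row each (sketch):
hive -e "
select event_name, count(*) from ceshi.dwd_base_event_log where dt='2020-05-20' group by event_name;
"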
IX. DWD-Layer Event Tables (example: the display event)
Create the table:
drop table if exists dwd_display_log;
create external table dwd_display_log (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`action` string,
`goodsid` string,
`place` string,
`extend1` string,
`category` string,
`server_time` string
)
partitioned by (`dt` string)
location '/warehouse/ceshi/dwd/dwd_display_log/';
Note: mid_id through lat are the base log fields, action through category come from the event payload, and server_time is the server timestamp.
Insert the data:
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_display_log
partition(dt='2020-05-11')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json, '$.kv.action') action,
get_json_object(event_json, '$.kv.goodsid') goodsid,
get_json_object(event_json, '$.kv.place') place,
get_json_object(event_json, '$.kv.extend1') extend1,
get_json_object(event_json, '$.kv.category') category,
server_time
from dwd_base_event_log
where dt='2020-05-11' and event_name='display';
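Finally, spot-check the resulting display table (sketch):
hive -e "select mid_id, goodsid, action, server_time from ceshi.dwd_display_log where dt='2020-05-11' limit 10;"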