Step-by-Step Guide to Building a Layered Data Warehouse
This document is the hands-on workflow I compiled while learning data warehouse construction; beginners may find it useful as a reference.
I. Data Formats
1. Basic format of the start-up log:
{
"action":"1",
"ar":"MX",
"ba":"HTC",
"detail":"",
"en":"start",
"entry":"2",}
2. Basic format of the event log (tracking/instrumentation data):
1540934156385|{
"ap": "gmall",
"cm": {
"uid": "1234",
"vc": "2",
"vn": "1.0",
"la": "EN",
"sr": "",
"os": "7.1.1",
"ar": "CN",
"md": "BBB100-1",
"ba": "blackberry",
"sv": "V2.2.1",
"g": "abc@gmail.com",
"hw": "1620x1080",
"t": "1506047606608",
"nw": "WIFI",
"ln": 0
},
"et": [
{
display "ett": "1506047605364", //客户端事件产生时间
"en": "display", //事件名称
"kv": { //事件结果,以key-value形式自行定义
"goodsid": "236",
"action": "1",
"extend1": "1",
"place": "2",
"category": "75"
}
},{
"ett": "1552352626835",
"en": "active_background",
"kv": {
"active_source": "1"
}
}
]
}
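Each raw event line is therefore a server timestamp and a JSON payload joined by a '|'. As a minimal shell sketch (the sample line is shortened from the format above), this is the same split that the UDF in section VII performs:
line='1540934156385|{"ap":"gmall","cm":{"uid":"1234"},"et":[]}'
server_time=${line%%|*}   # part before the first '|'
json=${line#*|}           # part after the first '|'
echo "server_time = $server_time"
echo "json = $json"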
II. Building the ODS Layer
1. Create the start-up log table:
-- Drop the table first if it already exists
drop table if exists ods_start_log;
-- Create an external table; the single column holds the raw JSON line as a string
create external table ods_start_log (`line` string)
-- Partition by date
partitioned by (`dt` string)
-- Handle LZO compression (LZO --> text)
stored as
inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
-- Set the storage location
location '/warehouse/ceshi/ods/ods_start_log';
2. Create the event log table:
drop table if exists ods_event_log;
create external table ods_event_log (`line` string)
partitioned by (`dt` string)
stored as
inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/ceshi/ods/ods_event_log';
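If the .lzo files loaded into these tables exceed one HDFS block, they are only splittable once an LZO index is built next to them. A hedged sketch (the hadoop-lzo jar path and version are assumptions; adjust them to your installation):
hadoop jar /opt/module/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar \
  com.hadoop.compression.lzo.DistributedLzoIndexer \
  /warehouse/ceshi/ods/ods_start_log/dt=2020-05-20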
III. ODS Layer Data Loading Script
#!/bin/bash
APP=ceshi
hive=/opt/module/hive-1.2.1/bin/hive
# If a date is passed in, use it; otherwise default to yesterday
# [ -n "$var" ] tests whether the variable is non-empty
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
load data inpath 'hdfs数据路径/$do_date' into table "$APP".ods_start_log partition(dt='$do_date');
load data inpath 'hdfs数据路径/$do_date' into table "$APP".ods_event_log partition(dt='$do_date');
"$hive -e "$sql"
Grant execute permission on the script:
chmod 777 ods_log.sh
Run the script, passing the log date as the first argument:
./ods_log.sh 2020-05-20
Check the imported data:
select * from ods_start_log where dt='2020-05-20';
select * from ods_event_log where dt='2020-05-20';
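A per-partition row count is a quick way to confirm the load actually landed data (sketch, run from the shell):
hive -e "
select count(*) from ceshi.ods_start_log where dt='2020-05-20';
select count(*) from ceshi.ods_event_log where dt='2020-05-20';
"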
IV. Building the DWD Layer
DWD-layer start-up log table:
drop table if exists dwd_start_log;
create external table dwd_start_log (
`action` string,
`ar` string,
`ba` string,
`detail` string,
`en` string,
`entry` string
)
partitioned by(`dt` string)
location '/warehouse/ceshi/dwd/dwd_start_log/';
V. DWD-Layer Start-up Table Loading Script
#!/bin/bash
APP=ceshi # target database
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
insert overwrite table "$APP".dwd_start_log partition (dt='$do_date')
select
get_json_object(line,'$.action') action,
get_json_object(line,'$.ar') ar,
get_json_object(line,'$.ba') ba,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.en') en,
get_json_object(line,'$.entry') entry
from "$APP".ods_start_log where dt='$do_date';"
$hive -e "$sql"
Grant execute permission as above.
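After running the script, spot-check the parsed fields (sketch):
hive -e "select action, ar, ba, detail, en, entry from ceshi.dwd_start_log where dt='2020-05-20' limit 10;"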
VI. DWD-Layer Event Log Base Table
drop table if exists dwd_base_event_log;
create external table dwd_base_event_log (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string, -- event name
`event_json` string, -- the full event JSON object
`server_time` string -- server timestamp
)
partitioned by (`dt` string)
stored as parquet
location '/warehouse/ceshi/dwd/dwd_base_event_log/';
VII. Custom UDF (parses the common cm fields) and Custom UDTF (parses the individual event fields)
Create a Maven project: hive-function.
Create the packages com.study.udf and com.study.udtf.
Add the following to pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.study</groupId>
<artifactId>hivefunction</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hive.version>1.2.1</hive.version>
</properties>
<dependencies>
<!-- Hive dependency -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
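Package the project and copy the jar to where Hive can load it (sketch; the copy target matches the add jar path used by the script in section VIII). The plain jar is usually enough since hive-exec is already on Hive's classpath; if the org.json classes turn out to be missing at runtime, ship the jar-with-dependencies assembly configured above instead:
mvn clean package -DskipTests
cp target/hivefunction-1.0-SNAPSHOT.jar /opt/module/hive-1.2.1/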
1. In package com.study.udf, create class BaseFieldUDF to parse the common fields:
package com.study.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
public class BaseFieldUDF extends UDF {
public String evaluate(String line, String jsonKeysString) {
StringBuilder sb = new StringBuilder();
// Split jsonKeysString into individual keys
String[] keys = jsonKeysString.split(",");
// Split the line into server time and JSON payload
String[] logContents = line.split("\\|");
// Validity check: expect exactly two parts and a non-blank JSON payload
if (logContents.length != 2 || StringUtils.isBlank(logContents[1])) {
return "";
}
// Parse the JSON payload
try {
JSONObject jsonObject = new JSONObject(logContents[1]);
// Get the common fields under "cm"
JSONObject base = jsonObject.getJSONObject("cm");
// Extract each requested key in order, tab-separated
for (String key : keys) {
String fieldName = key.trim();
if (base.has(fieldName)) {
sb.append(base.getString(fieldName)).append("\t");
} else {
sb.append("\t");
}
}
// Append the event array ("et") as a string
sb.append(jsonObject.getString("et")).append("\t");
// Append the server time
sb.append(logContents[0]).append("\t");
} catch (JSONException e) {
e.printStackTrace();
}
return sb.toString();
}
}
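Once the jar is built, the UDF can be smoke-tested directly from the shell (sketch; the expected output is the requested cm values, the et array, and the server time, all tab-separated):
hive -e "
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.study.udf.BaseFieldUDF';
select base_analizer('1540934156385|{\"cm\":{\"uid\":\"1234\",\"os\":\"7.1.1\"},\"et\":[]}', 'uid,os,vc');
"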
2. In package com.study.udtf, create class EventJsonUDTF to parse the individual event fields:
package com.study.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import java.util.ArrayList;
public class EventJsonUDTF extends GenericUDTF {
// Declare the names and types of the output columns
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<>();
fieldNames.add("event_name");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("event_json");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}
// Consume one input record and emit zero or more rows
@Override
public void process(Object[] objects) throws HiveException {
// The first argument is the "et" JSON-array string
String input = objects[0].toString();
// Drop the record if the input is blank
if (StringUtils.isBlank(input)) {
return;
} else {
try {
// Parse the array to see how many events it contains
JSONArray ja = new JSONArray(input);
if (ja == null) {
return;
}
// Iterate over the events
for (int i = 0; i < ja.length(); i++) {
String[] result = new String[2];
try {
// Event name
result[0] = ja.getJSONObject(i).getString("en");
// The whole event object as a JSON string
result[1] = ja.getString(i);
} catch (JSONException e) {
continue;
}
// Emit one row per event
forward(result);
}
} catch (JSONException e) {
e.printStackTrace();
}
}
}
// Called once there are no more records; used for cleanup or extra output
@Override
public void close() throws HiveException {
}
}
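The UDTF can also be exercised on its own before wiring it into the full query (sketch); each element of the array comes back as one (event_name, event_json) row:
hive -e "
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function flat_analizer as 'com.study.udtf.EventJsonUDTF';
select flat_analizer('[{\"en\":\"display\",\"kv\":{\"goodsid\":\"236\"}},{\"en\":\"active_background\",\"kv\":{\"active_source\":\"1\"}}]');
"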
VIII. DWD-Layer Data Parsing Script
#!/bin/bash
APP=ceshi
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.study.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.study.udtf.EventJsonUDTF';
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dwd_base_event_log
partition (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time
from(
select
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time
from "$APP".ods_event_log where dt='$do_date' and base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"
$hive -e "$sql"
Grant execute permission as above.
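After the script runs, verify that the events were exploded into one row each (sketch):
hive -e "
select event_name, count(*) from ceshi.dwd_base_event_log where dt='2020-05-20' group by event_name;
"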
IX. DWD-Layer Event Tables (example: the display event)
Create the table:
drop table if exists dwd_display_log;
create external table dwd_display_log (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`action` string,
`goodsid` string,
`place` string,
`extend1` string,
`category` string,
`server_time` string
)
partitioned by (`dt` string)
location '/warehouse/ceshi/dwd/dwd_display_log/';
Note: mid_id through lat are the base log fields, action through category come from the event payload, and server_time is the server timestamp.
Insert the data:
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_display_log
partition(dt='2020-05-11')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json, '$.kv.action') action,
get_json_object(event_json, '$.kv.goodsid') goodsid,
get_json_object(event_json, '$.kv.place') place,
get_json_object(event_json, '$.kv.extend1') extend1,
get_json_object(event_json, '$.kv.category') category,
server_time
from dwd_base_event_log
where dt='2020-05-11' and event_name='display';
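Finally, spot-check the resulting display table (sketch):
hive -e "select mid_id, goodsid, action, server_time from ceshi.dwd_display_log where dt='2020-05-11' limit 10;"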