滴滴出行实训：构建数仓与预处理用户订单数据-CSDN博客

本文链接：https://blog.csdn.net/qq_43893755/article/details/120664677

企业实训课第九节

此节内容主要是自己完成滴滴出行项目。

打开集群，先构建数据仓库（数仓分层）

一、构建数据仓库——创建表

在创建表之前，首先需要明确需求：需要哪几张表？各层之间的关系如何？这些问题需要在构建数仓时考虑清楚。

-- Step 1: create the three databases used by this project.
-- Each statement is guarded by IF NOT EXISTS, so re-running the script is harmless.

-- 1.1 ods database
CREATE DATABASE IF NOT EXISTS ods_didi;

-- 1.2 dw database
CREATE DATABASE IF NOT EXISTS dw_didi;

-- 1.3 app database
CREATE DATABASE IF NOT EXISTS app_didi;

ods创建用户打车订单表

-- User ride-order table.
-- Comma-delimited text, partitioned by the string column dt.
-- Fix: order_time used to carry the same column comment as sub_time ('预约时间');
-- it is the order-placement time, so its comment now says '下单时间'.
create table if not exists ods_didi.t_user_order(
    orderId string comment '订单id',
    telephone string comment '打车用户手机',
    lng string comment '用户发起打车的经度',
    lat string comment '用户发起打车的纬度',
    province string comment '所在省份',
    city string comment '所在城市',
    es_money double comment '预估打车费用',
    gender string comment '用户信息 - 性别',
    profession string comment '用户信息 - 行业',
    age_range string comment '年龄段（70后、80后、...）',
    tip double comment '小费',
    subscribe integer comment '是否预约（0 - 非预约、1 - 预约）',
    sub_time string comment '预约时间',
    is_agent integer comment '是否代叫（0 - 本人、1 - 代叫）',
    agent_telephone string comment '预约人手机',
    order_time string comment '下单时间'
)
partitioned by (dt string comment '时间分区')
row format delimited fields terminated by ',';

ods创建取消订单表

-- Cancelled-order table.
-- Comma-delimited text, partitioned by the string column dt.
CREATE TABLE IF NOT EXISTS ods_didi.t_user_cancel_order (
    orderId         STRING   COMMENT '订单ID',
    cstm_telephone  STRING   COMMENT '客户联系电话',
    lng             STRING   COMMENT '取消订单的经度',
    lat             STRING   COMMENT '取消订单的纬度',
    province        STRING   COMMENT '所在省份',
    city            STRING   COMMENT '所在城市',
    es_distance     DOUBLE   COMMENT '预估距离',
    gender          STRING   COMMENT '性别',
    profession      STRING   COMMENT '行业',
    age_range       STRING   COMMENT '年龄段',
    -- reason is a numeric code; the code list is spelled out in the column comment.
    reason          INTEGER  COMMENT '取消订单原因（1 - 选择了其他交通方式、2 - 与司机达成一致，取消订单、3 - 投诉司机没来接我、4 - 已不需要用车、5 - 无理由取消订单）',
    cancel_time     STRING   COMMENT '取消时间'
)
PARTITIONED BY (dt STRING COMMENT '时间分区')
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

ods创建订单支付表

-- Order-payment table.
-- Comma-delimited text, partitioned by the string column dt.
-- NOTE(review): monetary amounts are declared DOUBLE; consider DECIMAL if exact
-- arithmetic on these columns matters downstream.
CREATE TABLE IF NOT EXISTS ods_didi.t_user_pay_order (
    id                          STRING   COMMENT '支付订单ID',
    orderId                     STRING   COMMENT '订单ID',
    lng                         STRING   COMMENT '目的地的经度（支付地址）',
    lat                         STRING   COMMENT '目的地的纬度（支付地址）',
    province                    STRING   COMMENT '省份',
    city                        STRING   COMMENT '城市',
    total_money                 DOUBLE   COMMENT '车费总价',
    real_pay_money              DOUBLE   COMMENT '实际支付总额',
    passenger_additional_money  DOUBLE   COMMENT '乘客额外加价',
    base_money                  DOUBLE   COMMENT '车费合计',
    has_coupon                  INTEGER  COMMENT '是否使用优惠券（0 - 不使用、1 - 使用）',
    coupon_total                DOUBLE   COMMENT '优惠券合计',
    -- pay_way is a numeric code; the code list is spelled out in the column comment.
    pay_way                     INTEGER  COMMENT '支付方式（0 - 微信支付、1 - 支付宝支付、3 - QQ钱包支付、4 - 一网通银行卡支付）',
    mileage                     DOUBLE   COMMENT '里程（单位公里）',
    pay_time                    STRING   COMMENT '支付时间'
)
PARTITIONED BY (dt STRING COMMENT '时间分区')
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

创建用户评价表

-- User-evaluation table.
-- Comma-delimited text, partitioned by the string column dt.
CREATE TABLE IF NOT EXISTS ods_didi.t_user_evaluate (
    id                   STRING   COMMENT '评价日志唯一ID',
    orderId              STRING   COMMENT '订单ID',
    passenger_telephone  STRING   COMMENT '用户电话',
    passenger_province   STRING   COMMENT '用户所在省份',
    passenger_city       STRING   COMMENT '用户所在城市',
    eva_level            INTEGER  COMMENT '评价等级（1 - 一颗星、... 5 - 五星）',
    eva_time             STRING   COMMENT '评价时间'
)
PARTITIONED BY (dt STRING COMMENT '时间分区')
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

二、构建数据仓库——表数据加载

分区：大规模数据处理，必须要构建分区。

采用T+1的日期分区
假设今天是2021-01-01，那么1月1日的分析结果在第二天才能看到，也就是2021-01-02查看到上一天的数据分析结果。此处，我们采用最常用的分区方式，使用日期来进行分区。

-- Step 3: load the source files into the ODS tables.
-- 3.1 Create the local directory and upload the source log files into it.
--     NOTE: the next line is a shell command; run it in a shell, not as a SQL statement.
mkdir -p /export/data/didi

-- 3.2 Load each file with LOAD DATA, targeting an explicit dt partition.
--     Tables are qualified with ods_didi so the statements do not depend on
--     whichever database happens to be current in the session.
--     NOTE(review): INTO TABLE (without OVERWRITE) is kept as in the original;
--     confirm whether re-running the load for the same dt should replace the data.
load data local inpath '/export/data/didi/order.csv' into table ods_didi.t_user_order partition (dt='2020-04-12');
load data local inpath '/export/data/didi/cancel_order.csv' into table ods_didi.t_user_cancel_order partition (dt='2020-04-12');
load data local inpath '/export/data/didi/pay.csv' into table ods_didi.t_user_pay_order partition (dt='2020-04-12');
load data local inpath '/export/data/didi/evaluate.csv' into table ods_didi.t_user_evaluate partition (dt='2020-04-12');

三、数据预处理

预处理需求
数据已经准备好了，需要对ods层中的数据进行预处理。

为什么要进行预处理？

目的主要是让预处理后的数据更容易进行数据分析，并且能够将一些非法的数据处理掉，避免影响实际的统计结果。

在机器学习、深度学习时，预处理是不可或缺的一步。

预处理前考虑以下需求：
1、过滤掉order_time长度不大于8的数据（对应SQL中的条件 length(order_time) > 8）：长度过短表示这条数据不合法，不应该参加统计。
2、**将一些0、1表示的字段，处理为更容易理解的字段。** 例如：subscribe字段，0表示非预约、1表示预约。我们需要添加一个额外的字段，用来展示非预约和预约，这样将来我们分析的时候，更容易看懂数据。
3、order_time字段的原始格式为 2020-4-12 1:15，为了将来更方便处理，我们统一补零并补上秒，转换为类似 2020-04-12 01:15:00 的格式，这样所有的order_time字段长度是一样的；同时将日期（yyyy-MM-dd）单独提取出来。
4、为了方便将来按照年、月、日、小时统计，我们需要新增这几个字段。
5、后续要分析一天内不同时段的订单量，我们需要在预处理过程中将订单对应的时间段提前计算出来。例如：小时数大于1且不超过5（即2点到5点）的订单记为凌晨。

3.1 数据预处理——用户订单处理

DW层创建宽表

-- Step 4: data pre-processing.
-- DW-layer wide table: the ODS order columns plus derived, analysis-friendly columns
-- (subscribe_name, is_agent_name, order_date/year/month/day/hour, order_time_range).
-- Comma-delimited text, partitioned by the string column dt.
-- Fix: order_date and order_time used to share the comment '预约时间' with sub_time;
-- they describe the order-placement date/time, so their comments now say so.
create table if not exists dw_didi.t_user_order_wide(
    orderId string comment '订单id',
    telephone string comment '打车用户手机',
    lng string comment '用户发起打车的经度',
    lat string comment '用户发起打车的纬度',
    province string comment '所在省份',
    city string comment '所在城市',
    es_money double comment '预估打车费用',
    gender string comment '用户信息 - 性别',
    profession string comment '用户信息 - 行业',
    age_range string comment '年龄段（70后、80后、...）',
    tip double comment '小费',
    subscribe integer comment '是否预约（0 - 非预约、1 - 预约）',
    subscribe_name string comment '是否预约名称',
    sub_time string comment '预约时间',
    is_agent integer comment '是否代叫（0 - 本人、1 - 代叫）',
    is_agent_name string comment '是否代叫名称',
    agent_telephone string comment '预约人手机',
    order_date string comment '下单日期，yyyy-MM-dd',
    order_year string comment '年',
    order_month string comment '月',
    order_day string comment '日',
    order_hour string comment '小时',
    order_time_range string comment '时间段',
    order_time string comment '下单时间'
)
partitioned by (dt string comment '时间分区')
row format delimited fields terminated by ',';

-- Preview query: reshape one partition of ods_didi.t_user_order into the
-- wide-table column layout (same columns, same order as dw_didi.t_user_order_wide).
--
-- The normalized timestamp and its hour are computed once in a CTE and reused,
-- instead of repeating date_format(concat(order_time, ':00'), ...) in every
-- derived column and every CASE branch.
with normalized_order as (
    select
        orderId,
        telephone,
        lng,
        lat,
        province,
        city,
        es_money,
        gender,
        profession,
        age_range,
        tip,
        subscribe,
        sub_time,
        is_agent,
        agent_telephone,
        -- Raw order_time has no seconds and no zero padding (e.g. '2020-4-12 1:15');
        -- append ':00' and reformat to a fixed-width 'yyyy-MM-dd HH:mm:ss' string.
        date_format(concat(order_time, ':00'), 'yyyy-MM-dd HH:mm:ss') as order_ts,
        hour(date_format(concat(order_time, ':00'), 'yyyy-MM-dd HH:mm:ss')) as order_hour
    from ods_didi.t_user_order
    where dt = '2020-04-12'
      -- Discard rows whose order_time is too short to be a usable timestamp.
      and length(order_time) > 8
)
select
    orderId,
    telephone,
    lng,
    lat,
    province,
    city,
    es_money,
    gender,
    profession,
    age_range,
    tip,
    subscribe,
    -- Human-readable label for the 0/1 flag; any other value yields NULL.
    case when subscribe = 0 then '非预约'
         when subscribe = 1 then '预约'
    end as subscribe_name,
    sub_time,
    is_agent,
    -- Human-readable label for the 0/1 flag; any other value yields NULL.
    case when is_agent = 0 then '本人'
         when is_agent = 1 then '代叫'
    end as is_agent_name,
    agent_telephone,
    date_format(order_ts, 'yyyy-MM-dd') as order_date,
    year(order_ts) as order_year,
    month(order_ts) as order_month,
    day(order_ts) as order_day,
    order_hour,
    -- Bucket the hour into a named period of the day. Boundaries are unchanged
    -- from the original query: lower bound exclusive, upper bound inclusive.
    -- NOTE(review): hour 1 lands in '深夜' and hour 5 in '凌晨'; confirm this is
    -- the intended reading of "1:00-5:00 is 凌晨".
    case when order_hour > 1  and order_hour <= 5  then '凌晨'
         when order_hour > 5  and order_hour <= 8  then '早上'
         when order_hour > 8  and order_hour <= 11 then '上午'
         when order_hour > 11 and order_hour <= 13 then '中午'
         when order_hour > 13 and order_hour <= 17 then '下午'
         when order_hour > 17 and order_hour <= 19 then '晚上'
         when order_hour > 19 and order_hour <= 20 then '半夜'
         when order_hour > 20 and order_hour <= 24 then '深夜'
         when order_hour >= 0 and order_hour <= 1  then '深夜'
         -- Reached only when order_hour is NULL (order_time could not be parsed).
         else 'N/A'
    end as order_time_range,
    order_ts as order_time
from normalized_order;

3.2 数据预处理——将数据加载到dw层宽表

-- Load the pre-processed orders into the DW-layer wide table.
-- INSERT OVERWRITE replaces the contents of the dt='2020-04-12' partition.
-- Columns are matched to the target table by position, so the final select list
-- must stay in the same order as the dw_didi.t_user_order_wide definition.
--
-- The normalized timestamp and its hour are computed once in a CTE and reused,
-- instead of repeating date_format(concat(order_time, ':00'), ...) in every
-- derived column and every CASE branch.
with normalized_order as (
    select
        orderId,
        telephone,
        lng,
        lat,
        province,
        city,
        es_money,
        gender,
        profession,
        age_range,
        tip,
        subscribe,
        sub_time,
        is_agent,
        agent_telephone,
        -- Raw order_time has no seconds and no zero padding (e.g. '2020-4-12 1:15');
        -- append ':00' and reformat to a fixed-width 'yyyy-MM-dd HH:mm:ss' string.
        date_format(concat(order_time, ':00'), 'yyyy-MM-dd HH:mm:ss') as order_ts,
        hour(date_format(concat(order_time, ':00'), 'yyyy-MM-dd HH:mm:ss')) as order_hour
    from ods_didi.t_user_order
    where dt = '2020-04-12'
      -- Discard rows whose order_time is too short to be a usable timestamp.
      and length(order_time) > 8
)
insert overwrite table dw_didi.t_user_order_wide partition(dt='2020-04-12')
select
    orderId,
    telephone,
    lng,
    lat,
    province,
    city,
    es_money,
    gender,
    profession,
    age_range,
    tip,
    subscribe,
    -- Human-readable label for the 0/1 flag; any other value yields NULL.
    case when subscribe = 0 then '非预约'
         when subscribe = 1 then '预约'
    end as subscribe_name,
    sub_time,
    is_agent,
    -- Human-readable label for the 0/1 flag; any other value yields NULL.
    case when is_agent = 0 then '本人'
         when is_agent = 1 then '代叫'
    end as is_agent_name,
    agent_telephone,
    date_format(order_ts, 'yyyy-MM-dd') as order_date,
    year(order_ts) as order_year,
    month(order_ts) as order_month,
    day(order_ts) as order_day,
    order_hour,
    -- Bucket the hour into a named period of the day. Boundaries are unchanged
    -- from the original query: lower bound exclusive, upper bound inclusive.
    -- NOTE(review): hour 1 lands in '深夜' and hour 5 in '凌晨'; confirm this is
    -- the intended reading of "1:00-5:00 is 凌晨".
    case when order_hour > 1  and order_hour <= 5  then '凌晨'
         when order_hour > 5  and order_hour <= 8  then '早上'
         when order_hour > 8  and order_hour <= 11 then '上午'
         when order_hour > 11 and order_hour <= 13 then '中午'
         when order_hour > 13 and order_hour <= 17 then '下午'
         when order_hour > 17 and order_hour <= 19 then '晚上'
         when order_hour > 19 and order_hour <= 20 then '半夜'
         when order_hour > 20 and order_hour <= 24 then '深夜'
         when order_hour >= 0 and order_hour <= 1  then '深夜'
         -- Reached only when order_hour is NULL (order_time could not be parsed).
         else 'N/A'
    end as order_time_range,
    order_ts as order_time
from normalized_order
;