Chapter 12 Lakehouse: The DWS Layer
Design notes:
1) The DWS layer design follows the metric system defined for the project.
2) DWS table names follow the convention dws_<data domain>_<statistical granularity>_<business process>_<statistical period (1d/nd/td)>.
Note: 1d means the most recent 1 day, nd the most recent n days, and td history to date.
3) Flink's sum() aggregate function supports retraction, which is what allows the aggregated results below to be updated (retracted and re-emitted) as new data arrives.
Load the core and Hive modules first, so that the Hive built-in functions used below (nvl, if, date_add, ...) are available:
use modules core,hive;
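A minimal sketch of the retraction behavior that design note 3 refers to (the source table and its columns are hypothetical, generated with the datagen connector): in streaming mode the GROUP BY query emits +I for the first row of each key and -U/+U pairs whenever a key's running sum changes, and it is this changelog that the Hudi tables in this chapter persist (changelog.enabled = 'true').
-- Hypothetical unbounded demo source.
CREATE TEMPORARY TABLE demo_orders (
user_id BIGINT,
amount  DOUBLE
) WITH ('connector' = 'datagen', 'rows-per-second' = '1');
-- Emits an updating (retract) stream: -U/+U pairs whenever a user's sum changes.
select user_id, sum(amount) as amount_sum
from demo_orders
group by user_id;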
12.1 Last-1-Day Summary Tables
12.1.1 Trade Domain User-SKU-Granularity Order Last-1-Day Summary Table dws_trade_user_sku_order_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_sku_order_1d
(
`user_id` BIGINT,
`sku_id` BIGINT,
`sku_name` VARCHAR(200),
`category1_id` BIGINT,
`category1_name` VARCHAR(10),
`category2_id` BIGINT,
`category2_name` VARCHAR(200),
`category3_id` BIGINT,
`category3_name` VARCHAR(200),
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`order_count_1d` BIGINT,
`order_num_1d` BIGINT,
`order_original_amount_1d` DECIMAL(16, 2),
`activity_reduce_amount_1d` DECIMAL(16, 2),
`coupon_reduce_amount_1d` DECIMAL(16, 2),
`order_total_amount_1d` DECIMAL(16, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_sku_order_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id,sku_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_sku_order_1d'
);
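Because the table enables read.streaming.enabled and changelog.enabled, downstream jobs can consume it incrementally. A hedged sketch of such a consumer using a Flink dynamic-table options hint (the start commit 'earliest' replays history first; a concrete commit instant can be used instead):
select user_id, sku_id, order_total_amount_1d
from hudi_dws.dws_trade_user_sku_order_1d
/*+ OPTIONS('read.start-commit' = 'earliest') */;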
2) Data loading
insert into hudi_dws.dws_trade_user_sku_order_1d
select
user_id,
od.sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
order_count_1d,
order_num_1d,
order_original_amount_1d,
activity_reduce_amount_1d,
coupon_reduce_amount_1d,
order_total_amount_1d,
LOCALTIMESTAMP as ts,
dt
from
(
select
dt,
user_id,
sku_id,
count(*) order_count_1d,
sum(sku_num) order_num_1d,
sum(split_original_amount) order_original_amount_1d,
sum(nvl(split_activity_amount,0.0)) activity_reduce_amount_1d,
sum(nvl(split_coupon_amount,0.0)) coupon_reduce_amount_1d,
sum(split_total_amount) order_total_amount_1d
from hudi_dwd.dwd_trade_order_detail
group by dt,user_id,sku_id
)od
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from hudi_dim.dim_sku
)sku
on od.sku_id=sku.id;
12.1.2 Trade Domain User-Granularity Order Last-1-Day Summary Table dws_trade_user_order_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_order_1d
(
`user_id` BIGINT,
`order_count_1d` BIGINT,
`order_num_1d` BIGINT,
`order_original_amount_1d` DECIMAL(16, 2),
`activity_reduce_amount_1d` DECIMAL(16, 2),
`coupon_reduce_amount_1d` DECIMAL(16, 2),
`order_total_amount_1d` DECIMAL(16, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_order_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_order_1d'
);
2) Data loading
insert into hudi_dws.dws_trade_user_order_1d
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_original_amount),
sum(nvl(split_activity_amount,0)),
sum(nvl(split_coupon_amount,0)),
sum(split_total_amount),
LOCALTIMESTAMP as ts,
dt
from hudi_dwd.dwd_trade_order_detail
group by user_id,dt;
12.1.3 Trade Domain User-Granularity Cart-Add Last-1-Day Summary Table dws_trade_user_cart_add_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_cart_add_1d
(
`user_id` VARCHAR(200),
`cart_add_count_1d` BIGINT,
`cart_add_num_1d` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_cart_add_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_cart_add_1d'
);
2) Data loading
insert into hudi_dws.dws_trade_user_cart_add_1d
select
user_id,
count(*),
sum(sku_num),
LOCALTIMESTAMP as ts,
dt
from hudi_dwd.dwd_trade_cart_add
group by user_id,dt;
12.1.4 Trade Domain User-Granularity Payment Last-1-Day Summary Table dws_trade_user_payment_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_payment_1d
(
`user_id` BIGINT,
`payment_count_1d` BIGINT,
`payment_num_1d` BIGINT,
`payment_amount_1d` DECIMAL(16, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_payment_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_payment_1d'
);
2) Data loading
insert into hudi_dws.dws_trade_user_payment_1d
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_payment_amount),
LOCALTIMESTAMP as ts,
dt
from hudi_dwd.dwd_trade_pay_detail_suc
group by user_id,dt;
12.1.5 Trade Domain Province-Granularity Order Last-1-Day Summary Table dws_trade_province_order_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_province_order_1d
(
`province_id` INT,
`province_name` varchar(20),
`area_code` varchar(20),
`iso_code` varchar(20),
`iso_3166_2` varchar(20),
`order_count_1d` BIGINT,
`order_original_amount_1d` DECIMAL(16, 2),
`activity_reduce_amount_1d` DECIMAL(16, 2),
`coupon_reduce_amount_1d` DECIMAL(16, 2),
`order_total_amount_1d` DECIMAL(16, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_province_order_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'province_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_province_order_1d'
);
2) Data loading
insert into hudi_dws.dws_trade_province_order_1d
select
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
order_count_1d,
order_original_amount_1d,
activity_reduce_amount_1d,
coupon_reduce_amount_1d,
order_total_amount_1d,
LOCALTIMESTAMP as ts,
dt
from
(
select
province_id,
count(distinct(order_id)) order_count_1d,
sum(split_original_amount) order_original_amount_1d,
sum(nvl(split_activity_amount,0)) activity_reduce_amount_1d,
sum(nvl(split_coupon_amount,0)) coupon_reduce_amount_1d,
sum(split_total_amount) order_total_amount_1d,
dt
from hudi_dwd.dwd_trade_order_detail
group by province_id,dt
)o
left join
(
select
id,
province_name,
area_code,
iso_code,
iso_3166_2
from hudi_dim.dim_province
)p
on o.province_id=p.id;
12.1.6 Tool Domain User-Coupon-Granularity Coupon Usage (Payment) Last-1-Day Summary Table dws_tool_user_coupon_coupon_used_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_tool_user_coupon_coupon_used_1d
(
`user_id` BIGINT,
`coupon_id` BIGINT,
`coupon_name` VARCHAR(100),
`coupon_type_code` VARCHAR(10),
`coupon_type_name` VARCHAR(100),
`benefit_rule` STRING,
`used_count_1d` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_tool_user_coupon_coupon_used_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id,coupon_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_tool_user_coupon_coupon_used_1d'
);
2) Data loading
insert into hudi_dws.dws_tool_user_coupon_coupon_used_1d
select
user_id,
coupon_id,
coupon_name,
coupon_type_code,
coupon_type_name,
benefit_rule,
used_count,
LOCALTIMESTAMP as ts,
dt
from
(
select
dt,
user_id,
coupon_id,
count(*) used_count
from hudi_dwd.dwd_tool_coupon_used
group by dt,user_id,coupon_id
)t1
left join
(
select
id,
coupon_name,
coupon_type_code,
coupon_type_name,
benefit_rule
from hudi_dim.dim_coupon
)t2
on t1.coupon_id=t2.id;
12.1.7 Interaction Domain SKU-Granularity Favorite-Add Last-1-Day Summary Table dws_interaction_sku_favor_add_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_interaction_sku_favor_add_1d
(
`sku_id` BIGINT,
`sku_name` VARCHAR(200),
`category1_id` BIGINT,
`category1_name` VARCHAR(10),
`category2_id` BIGINT,
`category2_name` VARCHAR(200),
`category3_id` BIGINT,
`category3_name` VARCHAR(200),
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`favor_add_count_1d` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_interaction_sku_favor_add_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'sku_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_interaction_sku_favor_add_1d'
);
2) Data loading
insert into hudi_dws.dws_interaction_sku_favor_add_1d
select
sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
favor_add_count,
LOCALTIMESTAMP as ts,
dt
from
(
select
dt,
sku_id,
count(*) favor_add_count
from hudi_dwd.dwd_interaction_favor_add
group by dt,sku_id
)favor
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from hudi_dim.dim_sku
)sku
on favor.sku_id=sku.id;
12.1.8 Traffic Domain Session-Granularity Page View Last-1-Day Summary Table dws_traffic_session_page_view_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_traffic_session_page_view_1d
(
`session_id` STRING,
`mid_id` STRING,
`brand` STRING,
`model` STRING,
`operate_system` STRING,
`version_code` STRING,
`channel` STRING,
`during_time_1d` BIGINT,
`page_count_1d` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_traffic_session_page_view_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'session_id,mid_id,brand,model,operate_system,version_code,channel',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_traffic_session_page_view_1d'
);
2) Data loading
insert into hudi_dws.dws_traffic_session_page_view_1d
select
session_id,
mid_id,
brand,
model,
operate_system,
version_code,
channel,
sum(cast(during_time as bigint)),
count(*),
LOCALTIMESTAMP as ts,
dt
from hudi_dwd.dwd_traffic_page_view
group by dt,session_id,mid_id,brand,model,operate_system,version_code,channel;
12.1.9 Traffic Domain Visitor-Page-Granularity Page View Last-1-Day Summary Table dws_traffic_page_visitor_page_view_1d
1) Table creation statement
CREATE TABLE hudi_dws.dws_traffic_page_visitor_page_view_1d
(
`mid_id` STRING,
`brand` STRING,
`model` STRING,
`operate_system` STRING,
`page_id` STRING,
`during_time_1d` BIGINT,
`view_count_1d` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_traffic_page_visitor_page_view_1d',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'mid_id,brand,model,operate_system,page_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_traffic_page_visitor_page_view_1d'
);
2) Data loading
insert into hudi_dws.dws_traffic_page_visitor_page_view_1d
select
mid_id,
brand,
model,
operate_system,
page_id,
sum(cast(during_time as bigint)),
count(*),
LOCALTIMESTAMP as ts,
dt
from hudi_dwd.dwd_traffic_page_view
group by dt,mid_id,brand,model,operate_system,page_id;
12.2 Last-n-Day Summary Tables
12.2.1 Trade Domain User-SKU-Granularity Order Last-n-Day Summary Table dws_trade_user_sku_order_nd
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_sku_order_nd
(
`user_id` BIGINT,
`sku_id` BIGINT,
`sku_name` VARCHAR(200),
`category1_id` BIGINT,
`category1_name` VARCHAR(10),
`category2_id` BIGINT,
`category2_name` VARCHAR(200),
`category3_id` BIGINT,
`category3_name` VARCHAR(200),
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`order_count_7d` BIGINT,
`order_num_7d` BIGINT,
`order_original_amount_7d` DECIMAL(38, 2),
`activity_reduce_amount_7d` DECIMAL(38, 2),
`coupon_reduce_amount_7d` DECIMAL(38, 2),
`order_total_amount_7d` DECIMAL(38, 2),
`order_count_30d` BIGINT,
`order_num_30d` BIGINT,
`order_original_amount_30d` DECIMAL(38, 2),
`activity_reduce_amount_30d` DECIMAL(38, 2),
`coupon_reduce_amount_30d` DECIMAL(38, 2),
`order_total_amount_30d` DECIMAL(38, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_sku_order_nd',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id,sku_id,sku_name,category1_id,category1_name,category2_id,category2_name,category3_id,category3_name,tm_id,tm_name',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_sku_order_nd'
);
2) Data loading
insert into hudi_dws.dws_trade_user_sku_order_nd
select
user_id,
sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
sum(if(dt>=date_add(CURRENT_DATE,-6),order_count_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),order_num_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),order_original_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),activity_reduce_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),coupon_reduce_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),order_total_amount_1d,0)),
sum(order_count_1d),
sum(order_num_1d),
sum(order_original_amount_1d),
sum(activity_reduce_amount_1d),
sum(coupon_reduce_amount_1d),
sum(order_total_amount_1d),
LOCALTIMESTAMP as ts,
cast(CURRENT_DATE as STRING) dt
from hudi_dws.dws_trade_user_sku_order_1d
where dt >= date_add(CURRENT_DATE,-29)
group by user_id,sku_id,sku_name,category1_id,category1_name,category2_id,category2_name,category3_id,category3_name,tm_id,tm_name;
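The load scans only the last 30 one-day partitions (the where clause) and derives the 7-day columns from the same scan: rows older than 7 days contribute 0 through if(...). A self-contained sketch of the trick, with hypothetical dates and a fixed reference date in place of CURRENT_DATE:
-- Only the first row lies within the 7-day window ending 2020-06-14,
-- so cnt_7d = 3 while cnt_30d = 8.
select
sum(if(dt >= date_add('2020-06-14', -6), cnt, 0)) as cnt_7d,
sum(cnt) as cnt_30d
from (values ('2020-06-13', 3), ('2020-06-01', 5)) as v(dt, cnt);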
12.2.2 Trade Domain Province-Granularity Order Last-n-Day Summary Table dws_trade_province_order_nd
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_province_order_nd
(
`province_id` INT,
`province_name` varchar(20),
`area_code` varchar(20),
`iso_code` varchar(20),
`iso_3166_2` varchar(20),
`order_count_7d` BIGINT,
`order_original_amount_7d` DECIMAL(38, 2),
`activity_reduce_amount_7d` DECIMAL(38, 2),
`coupon_reduce_amount_7d` DECIMAL(38, 2),
`order_total_amount_7d` DECIMAL(38, 2),
`order_count_30d` BIGINT,
`order_original_amount_30d` DECIMAL(38, 2),
`activity_reduce_amount_30d` DECIMAL(38, 2),
`coupon_reduce_amount_30d` DECIMAL(38, 2),
`order_total_amount_30d` DECIMAL(38, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_province_order_nd',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'province_id,province_name,area_code,iso_code,iso_3166_2',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_province_order_nd'
);
2) Data loading
insert into hudi_dws.dws_trade_province_order_nd
select
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
sum(if(dt>=date_add(CURRENT_DATE,-6),order_count_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),order_original_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),activity_reduce_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),coupon_reduce_amount_1d,0)),
sum(if(dt>=date_add(CURRENT_DATE,-6),order_total_amount_1d,0)),
sum(order_count_1d),
sum(order_original_amount_1d),
sum(activity_reduce_amount_1d),
sum(coupon_reduce_amount_1d),
sum(order_total_amount_1d),
LOCALTIMESTAMP as ts,
cast(CURRENT_DATE as STRING) dt
from hudi_dws.dws_trade_province_order_1d
where dt>=date_add(CURRENT_DATE,-29)
and dt<=CURRENT_DATE
group by province_id,province_name,area_code,iso_code,iso_3166_2;
12.3 History-to-Date Summary Tables
12.3.1 Trade Domain User-Granularity Order History-to-Date Summary Table dws_trade_user_order_td
1) Table creation statement
CREATE TABLE hudi_dws.dws_trade_user_order_td
(
`user_id` BIGINT,
`order_date_first` STRING,
`order_date_last` STRING,
`order_count_td` BIGINT,
`order_num_td` BIGINT,
`original_amount_td` DECIMAL(38, 2),
`activity_reduce_amount_td` DECIMAL(38, 2),
`coupon_reduce_amount_td` DECIMAL(38, 2),
`total_amount_td` DECIMAL(38, 2),
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_trade_user_order_td',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_trade_user_order_td'
);
2) Data loading
insert into hudi_dws.dws_trade_user_order_td
select
user_id,
min(dt) order_date_first,
max(dt) order_date_last,
sum(order_count_1d) order_count,
sum(order_num_1d) order_num,
sum(order_original_amount_1d) original_amount,
sum(activity_reduce_amount_1d) activity_reduce_amount,
sum(coupon_reduce_amount_1d) coupon_reduce_amount,
sum(order_total_amount_1d) total_amount,
LOCALTIMESTAMP as ts,
cast(CURRENT_DATE as STRING) dt
from hudi_dws.dws_trade_user_order_1d
group by user_id;
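Because user_id is the record key and dt the partition field, every run of this load upserts one row per user into the current day's partition, while older partitions keep their historical snapshots. A quick sanity check:
-- One row per user per snapshot date.
select dt, count(*) as user_rows
from hudi_dws.dws_trade_user_order_td
group by dt;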
12.3.2 User Domain User-Granularity Login History-to-Date Summary Table dws_user_user_login_td
1) Table creation statement
CREATE TABLE hudi_dws.dws_user_user_login_td
(
`user_id` bigint,
`login_date_last` STRING,
`login_count_td` BIGINT,
`ts` TIMESTAMP(3),
`dt` STRING
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/dws/dws_user_user_login_td',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'user_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_dws',
'hive_sync.table'='dws_user_user_login_td'
);
2) Data loading
insert into hudi_dws.dws_user_user_login_td
select
u.id,
nvl(login_date_last,u.dt),
nvl(login_count_td,1),
LOCALTIMESTAMP as ts,
cast(CURRENT_DATE as STRING) dt
from
(
select
id,
dt
from hudi_dim.dim_user_info
)u
left join
(
select
user_id,
max(dt) login_date_last,
count(*) login_count_td
from hudi_dwd.dwd_user_login
group by user_id
)l
on cast(u.id as string)=l.user_id;
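The LEFT JOIN keeps users who registered but have never logged in, and nvl() backfills the registration date as their last login date and 1 as their cumulative login count (the registration itself counts as a login). A minimal sketch of the pattern with hypothetical values:
-- User 2 has no login facts: the registration day 2020-06-14 becomes
-- login_date_last and the cumulative count defaults to 1.
select
u.id,
nvl(l.login_date_last, u.dt) as login_date_last,
nvl(l.login_count_td, 1) as login_count_td
from (values (1, '2020-06-10'), (2, '2020-06-14')) as u(id, dt)
left join (values (1, '2020-06-13', 5)) as l(user_id, login_date_last, login_count_td)
on u.id = l.user_id;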
Chapter 13 Lakehouse: The ADS Layer
13.1 Traffic Topic
13.1.1 Traffic Statistics by Channel ads_traffic_stats_by_channel
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 1/7/30 days | Channel | Visitor count | Number of visitors |
Last 1/7/30 days | Channel | Average session duration | Average dwell time per session |
Last 1/7/30 days | Channel | Average pages per session | Average number of pages viewed per session |
Last 1/7/30 days | Channel | Session count | Total number of sessions |
Last 1/7/30 days | Channel | Bounce rate | Share of sessions that viewed only one page |
1) Table creation statement
CREATE TABLE hudi_ads.ads_traffic_stats_by_channel
(
`dt` STRING,
`recent_days` INT,
`channel` STRING,
`uv_count` BIGINT,
`avg_duration_sec` BIGINT,
`avg_page_count` BIGINT,
`sv_count` BIGINT,
`bounce_rate` DECIMAL(16, 2),
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_traffic_stats_by_channel',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days,channel',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_traffic_stats_by_channel'
);
2) Data loading
insert into hudi_ads.ads_traffic_stats_by_channel
select
cast(CURRENT_DATE as STRING) dt,
recent_days,
channel,
cast(count(distinct(mid_id)) as bigint) uv_count,
cast(avg(during_time_1d)/1000 as bigint) avg_duration_sec,
cast(avg(page_count_1d) as bigint) avg_page_count,
cast(count(*) as bigint) sv_count,
cast(sum(if(page_count_1d=1,1,0))/count(*) as decimal(16,2)) bounce_rate,
LOCALTIMESTAMP as ts
from
(
select
dt,
channel,
mid_id,
during_time_1d,
page_count_1d,
Array[1,7,30] days
from hudi_dws.dws_traffic_session_page_view_1d
) t
CROSS JOIN UNNEST(days) tmp(recent_days)
where dt>=date_add(CURRENT_DATE,-recent_days+1)
group by recent_days,channel;
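The Array[1,7,30] column combined with CROSS JOIN UNNEST fans every source row out into one copy per statistics window, so a single scan serves all three recent_days values. A minimal sketch with hypothetical ids:
-- Each of the two ids is emitted three times, tagged 1, 7 and 30.
select id, recent_days
from (
select id, Array[1, 7, 30] as days
from (values (100), (200)) as v(id)
) t
CROSS JOIN UNNEST(days) tmp(recent_days);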
13.2 User Topic
13.2.1 User Change Statistics ads_user_change
This requirement covers two metrics, the churned-user count and the returning-user count, explained below.
Metric | Description |
Churned-user count | A previously active user who has not been active recently is a churned user. Here we count users who were active exactly 7 days ago (that day only) but have not been active during the last 7 days. |
Returning-user count | A previously active user who was inactive for a while (churned) and is active again today is a returning user. Here we count all returning users. |
1) Table creation statement
CREATE TABLE hudi_ads.ads_user_change
(
`dt` STRING,
`user_churn_count` BIGINT,
`user_back_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_user_change',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_user_change'
);
2) Data loading
insert into hudi_ads.ads_user_change
select
churn.dt,
user_churn_count,
user_back_count,
LOCALTIMESTAMP as ts
from
(
select
dt,
count(*) user_churn_count
from hudi_dws.dws_user_user_login_td
where login_date_last=date_add(dt,-7)
group by dt
)churn
join
(
select
dt,
count(*) user_back_count
from
(
select
dt,
user_id,
login_date_last
from hudi_dws.dws_user_user_login_td
where dt=CURRENT_DATE
and login_date_last=dt
)t1
join
(
select
user_id,
login_date_last login_date_previous
from hudi_dws.dws_user_user_login_td
where dt=date_add(CURRENT_DATE,-1)
)t2
on t1.user_id=t2.user_id
where datediff(login_date_last,login_date_previous)>=8
group by dt
)back
on churn.dt=back.dt;
13.2.2 User Retention Rate ads_user_retention
Retention analysis generally covers new-user retention and active-user retention.
New-user retention analysis asks how many of the users added on a given day were active afterwards; active-user retention analysis asks the same for the users active on a given day.
Retention is an important measure of how much value a product delivers to its users.
Here we compute the new-user retention rate, i.e. the ratio of retained users to new users. For example, if 100 users were added on 2020-06-14 and 80 of them were active one day later (on 2020-06-15), then the 1-day retention count for 2020-06-14 is 80 and the 1-day retention rate is 80%.
The 1-day to 7-day retention rates are computed for every day.
1) Table creation statement
CREATE TABLE hudi_ads.ads_user_retention
(
`dt` STRING,
`create_date` STRING,
`retention_day` INT,
`retention_count` BIGINT,
`new_user_count` BIGINT,
`retention_rate` DECIMAL(16, 2),
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_user_retention',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,create_date',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_user_retention'
);
2) Data loading
insert into hudi_ads.ads_user_retention
select
t2.dt,
login_date_first create_date,
datediff(t2.dt,login_date_first) retention_day,
sum(if(login_date_last=t2.dt,1,0)) retention_count,
count(*) new_user_count,
cast(sum(if(login_date_last=t2.dt,1,0))/count(*)*100 as decimal(16,2)) retention_rate,
LOCALTIMESTAMP as ts
from
(
select
dt,
user_id,
date_id login_date_first
from hudi_dwd.dwd_user_register
where dt>=date_add(CURRENT_DATE,-7)
and dt<CURRENT_DATE
)t1
join
(
select
dt,
user_id,
login_date_last
from hudi_dws.dws_user_user_login_td
where dt=CURRENT_DATE
)t2
on t1.user_id=t2.user_id
group by t2.dt,login_date_first;
13.2.3 New and Active User Statistics ads_user_stats
The requirements are as follows:
Statistical period | Metric | Description |
Last 1, 7, 30 days | New user count | Omitted |
Last 1, 7, 30 days | Active user count | Omitted |
1) Table creation statement
CREATE TABLE hudi_ads.ads_user_stats
(
`dt` STRING,
`recent_days` BIGINT,
`new_user_count` BIGINT,
`active_user_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_user_stats',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_user_stats'
);
2) Data loading
insert into hudi_ads.ads_user_stats
select
t2.dt,
t2.recent_days,
new_user_count,
active_user_count,
LOCALTIMESTAMP as ts
from
(
select
t1.dt dt,
recent_days,
sum(if(login_date_last>=date_add(t1.dt,-recent_days+1),1,0)) active_user_count
from
(
select
dt,
login_date_last,
Array[1,7,30] days
from hudi_dws.dws_user_user_login_td
)t1
CROSS JOIN UNNEST(days) tmp(recent_days)
group by t1.dt,recent_days
)t2
join
(
select
t3.dt,
recent_days,
sum(if(date_id>=date_add(t3.dt,-recent_days+1),1,0)) new_user_count
from
(
select
dt,
date_id,
Array[1,7,30] days
from hudi_dwd.dwd_user_register
)t3
CROSS JOIN UNNEST(days) tmp(recent_days)
group by t3.dt,recent_days
)t4
on t2.recent_days=t4.recent_days
and t2.dt=t4.dt;
13.2.4 User Behavior Funnel Analysis ads_user_action
Funnel analysis is a data-analysis model that shows, stage by stage, how users convert along a business flow from its starting point to its end. Because all stages are laid out together, a problematic stage stands out at a glance.
This requirement counts the number of users at each stage of a complete purchase flow, as specified below:
Statistical period | Metric | Description |
Last 1 day | Home page visitors | Omitted |
Last 1 day | Product detail page visitors | Omitted |
Last 1 day | Cart-add users | Omitted |
Last 1 day | Ordering users | Omitted |
Last 1 day | Paying users | Users whose payment succeeded |
1) Table creation statement
CREATE TABLE hudi_ads.ads_user_action
(
`dt` STRING,
`home_count` BIGINT,
`good_detail_count` BIGINT,
`cart_count` BIGINT,
`order_count` BIGINT,
`payment_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_user_action',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_user_action'
);
2) Data loading
insert into hudi_ads.ads_user_action
select
page.dt,
home_count,
good_detail_count,
cart_count,
order_count,
payment_count,
LOCALTIMESTAMP as ts
from
(
select
dt,
1 recent_days,
sum(if(page_id='home',1,0)) home_count,
sum(if(page_id='good_detail',1,0)) good_detail_count
from hudi_dws.dws_traffic_page_visitor_page_view_1d
where page_id in ('home','good_detail')
group by dt
)page
join
(
select
dt,
1 recent_days,
count(*) cart_count
from hudi_dws.dws_trade_user_cart_add_1d
group by dt
)cart
on page.recent_days=cart.recent_days and page.dt=cart.dt
join
(
select
dt,
1 recent_days,
count(*) order_count
from hudi_dws.dws_trade_user_order_1d
group by dt
)ord
on page.recent_days=ord.recent_days and page.dt=ord.dt
join
(
select
dt,
1 recent_days,
count(*) payment_count
from hudi_dws.dws_trade_user_payment_1d
group by dt
)pay
on page.recent_days=pay.recent_days and page.dt=pay.dt;
13.2.5 New Ordering User Statistics ads_new_order_user_stats
The requirements are as follows:
Statistical period | Metric | Description |
Last 1, 7, 30 days | New ordering user count | Omitted |
1) Table creation statement
CREATE TABLE hudi_ads.ads_new_order_user_stats
(
`dt` STRING,
`recent_days` BIGINT,
`new_order_user_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_new_order_user_stats',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_new_order_user_stats'
);
2) Data loading
insert into hudi_ads.ads_new_order_user_stats
select
dt,
recent_days,
sum(if(order_date_first>=date_add(dt,-recent_days+1),1,0)) new_order_user_count,
LOCALTIMESTAMP as ts
from
(
select
dt,
order_date_first,
Array[1,7,30] days
from hudi_dws.dws_trade_user_order_td
) t
CROSS JOIN UNNEST(days) tmp(recent_days)
group by dt,recent_days;
13.3 Product Topic
13.3.1 Repurchase Rate by Brand over the Last 30 Days ads_repeat_purchase_by_tm
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 30 days | Brand | Repurchase rate | Share of buyers who purchased more than once |
1) Table creation statement
CREATE TABLE hudi_ads.ads_repeat_purchase_by_tm
(
`dt` STRING,
`recent_days` INT,
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`order_repeat_rate` DECIMAL(16, 2),
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_repeat_purchase_by_tm',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days,tm_id,tm_name',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_repeat_purchase_by_tm'
);
2) Data loading
insert into hudi_ads.ads_repeat_purchase_by_tm
select
dt,
30,
tm_id,
tm_name,
cast(sum(if(order_count>=2,1,0))/sum(if(order_count>=1,1,0)) as decimal(16,2)),
LOCALTIMESTAMP as ts
from
(
select
dt,
user_id,
tm_id,
tm_name,
sum(order_count_30d) order_count
from hudi_dws.dws_trade_user_sku_order_nd
group by dt,user_id, tm_id,tm_name
)t1
group by dt,tm_id,tm_name;
13.3.2 Order Statistics by Brand ads_order_stats_by_tm
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 1, 7, 30 days | Brand | Order count | Omitted |
Last 1, 7, 30 days | Brand | Ordering user count | Omitted |
1) Table creation statement
CREATE TABLE hudi_ads.ads_order_stats_by_tm
(
`dt` STRING,
`recent_days` BIGINT,
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`order_count` BIGINT,
`order_user_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_order_stats_by_tm',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days,tm_id,tm_name',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_order_stats_by_tm'
);
2) Data loading
insert into hudi_ads.ads_order_stats_by_tm
select
dt,
recent_days,
tm_id,
tm_name,
order_count,
order_user_count,
LOCALTIMESTAMP as ts
from
(
select
dt,
1 recent_days,
tm_id,
tm_name,
sum(order_count_1d) order_count,
count(distinct(user_id)) order_user_count
from hudi_dws.dws_trade_user_sku_order_1d
group by dt,tm_id,tm_name
union all
select
dt,
recent_days,
tm_id,
tm_name,
sum(order_count),
count(distinct(if(order_count>0,user_id,cast(null as bigint))))
from
(
select
dt,
recent_days,
user_id,
tm_id,
tm_name,
case recent_days
when 7 then order_count_7d
when 30 then order_count_30d
end order_count
from
(
select
dt,
user_id,
tm_id,
tm_name,
order_count_7d,
order_count_30d,
Array[7,30] days
from hudi_dws.dws_trade_user_sku_order_nd
)t1
CROSS JOIN UNNEST(days) tmp(recent_days)
)t2
group by dt,recent_days,tm_id,tm_name
)odr;
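After the fan-out, the case recent_days when ... end expression lets each copy of a row read the pre-aggregated column that matches its window tag. A compact sketch with hypothetical counts:
-- The 7-tagged copy reads cnt_7d = 4, the 30-tagged copy cnt_30d = 9.
select recent_days,
case recent_days when 7 then cnt_7d when 30 then cnt_30d end as cnt
from (
select cnt_7d, cnt_30d, Array[7, 30] as days
from (values (4, 9)) as v(cnt_7d, cnt_30d)
) t
CROSS JOIN UNNEST(days) tmp(recent_days);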
13.3.3 Order Statistics by Category ads_order_stats_by_cate
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 1, 7, 30 days | Category | Order count | Omitted |
Last 1, 7, 30 days | Category | Ordering user count | Omitted |
1) Table creation statement
CREATE TABLE hudi_ads.ads_order_stats_by_cate
(
`dt` STRING,
`recent_days` INT,
`category1_id` BIGINT,
`category1_name` VARCHAR(10),
`category2_id` BIGINT,
`category2_name` VARCHAR(200),
`category3_id` BIGINT,
`category3_name` VARCHAR(200),
`order_count` BIGINT,
`order_user_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_order_stats_by_cate',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days,category1_id,category1_name,category2_id,category2_name,category3_id,category3_name',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_order_stats_by_cate'
);
2) Data loading
insert into hudi_ads.ads_order_stats_by_cate
select
dt,
recent_days,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
order_count,
order_user_count,
LOCALTIMESTAMP as ts
from
(
select
dt,
1 recent_days,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
sum(order_count_1d) order_count,
count(distinct(user_id)) order_user_count
from hudi_dws.dws_trade_user_sku_order_1d
group by dt,category1_id,category1_name,category2_id,category2_name,category3_id,category3_name
union all
select
dt,
recent_days,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
sum(order_count),
count(distinct(if(order_count>0,user_id,cast(null as bigint))))
from
(
select
dt,
recent_days,
user_id,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
case recent_days
when 7 then order_count_7d
when 30 then order_count_30d
end order_count
from
(
select
dt,
user_id,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
order_count_7d,
order_count_30d,
Array[7,30] days
from hudi_dws.dws_trade_user_sku_order_nd
)
CROSS JOIN UNNEST(days) tmp(recent_days)
)t1
group by dt,recent_days,category1_id,category1_name,category2_id,category2_name,category3_id,category3_name
)odr;
13.3.4 Top 3 SKUs by Cart Stock per Category ads_sku_cart_num_top3_by_cate
1) Table creation statement
CREATE TABLE hudi_ads.ads_sku_cart_num_top3_by_cate
(
`dt` STRING,
`category1_id` BIGINT,
`category1_name` VARCHAR(10),
`category2_id` BIGINT,
`category2_name` VARCHAR(200),
`category3_id` BIGINT,
`category3_name` VARCHAR(200),
`sku_id` BIGINT,
`sku_name` VARCHAR(200),
`cart_num` INT,
`rk` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_sku_cart_num_top3_by_cate',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,sku_id',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_sku_cart_num_top3_by_cate'
);
2) Data loading
insert into hudi_ads.ads_sku_cart_num_top3_by_cate
select
dt,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
sku_id,
sku_name,
cart_num,
rk,
LOCALTIMESTAMP as ts
from
(
select
cart.dt,
sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
cart_num,
ROW_NUMBER() over (partition by category1_id,category2_id,category3_id order by cart_num desc) rk
from
(
select
dt,
sku_id,
sum(sku_num) cart_num
from hudi_dwd.dwd_trade_cart_add
group by dt,sku_id
)cart
left join
(
select
dt,
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name
from hudi_dim.dim_sku
)sku
on cart.sku_id=sku.id and cart.dt=sku.dt
)t1
where rk<=3;
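A window rank cannot be filtered in the query block that computes it, hence the outer query with rk<=3; this outer-filter shape is also what Flink recognizes as its Top-N pattern. A minimal sketch with hypothetical rows:
-- Keeps the top 2 SKUs per category by cart quantity.
select * from (
select cat, sku, cart_num,
ROW_NUMBER() over (partition by cat order by cart_num desc) rk
from (values ('a', 's1', 9), ('a', 's2', 7), ('a', 's3', 5)) as v(cat, sku, cart_num)
) t1
where rk <= 2;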
13.3.5 Top 3 Most-Favorited SKUs per Brand ads_sku_favor_count_top3_by_tm
1) Table creation statement
CREATE TABLE hudi_ads.ads_sku_favor_count_top3_by_tm
(
`dt` STRING,
`tm_id` BIGINT,
`tm_name` VARCHAR(100),
`sku_id` BIGINT,
`sku_name` VARCHAR(200),
`favor_count` BIGINT,
`rk` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_sku_favor_count_top3_by_tm',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,tm_id,tm_name,sku_id,sku_name,favor_count',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_sku_favor_count_top3_by_tm'
);
2) Data loading
insert into hudi_ads.ads_sku_favor_count_top3_by_tm
select
dt,
tm_id,
tm_name,
sku_id,
sku_name,
favor_add_count_1d,
rk,
LOCALTIMESTAMP as ts
from
(
select
dt,
tm_id,
tm_name,
sku_id,
sku_name,
favor_add_count_1d,
ROW_NUMBER() over (partition by tm_id order by favor_add_count_1d desc) rk
from hudi_dws.dws_interaction_sku_favor_add_1d
)t1
where rk<=3;
13.4 Trade Topic
13.4.1 Average Order-to-Payment Interval ads_order_to_pay_interval_avg
Requirement: the average interval between order time and payment time for orders whose payment completed within the last 1 day.
1) Table creation statement
CREATE TABLE hudi_ads.ads_order_to_pay_interval_avg
(
`dt` STRING,
`order_to_pay_interval_avg` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_order_to_pay_interval_avg',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_order_to_pay_interval_avg'
);
2) Data loading
insert into hudi_ads.ads_order_to_pay_interval_avg
select
dt,
cast(avg(to_unix_timestamp(payment_time)-to_unix_timestamp(order_time)) as bigint),
LOCALTIMESTAMP as ts
from hudi_dwd.dwd_trade_trade_flow_acc
where payment_date_id is not null
group by dt;
13.4.2 Trade Statistics by Province ads_order_by_province
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 1, 7, 30 days | Province | Order count | Omitted |
Last 1, 7, 30 days | Province | Order amount | Omitted |
1) Table creation statement
CREATE TABLE hudi_ads.ads_order_by_province
(
`dt` STRING,
`recent_days` INT,
`province_id` INT,
`province_name` VARCHAR(20),
`area_code` VARCHAR(20),
`iso_code` VARCHAR(20),
`iso_code_3166_2` VARCHAR(20),
`order_count` BIGINT,
`order_total_amount` DECIMAL(38, 2),
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_order_by_province',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,recent_days,province_id,province_name,area_code,iso_code,iso_code_3166_2',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_order_by_province'
);
2) Data loading
insert into hudi_ads.ads_order_by_province
select
dt,
1 recent_days,
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
order_count_1d,
order_total_amount_1d,
LOCALTIMESTAMP as ts
from hudi_dws.dws_trade_province_order_1d
union all
select
dt,
recent_days,
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
case recent_days
when 7 then order_count_7d
when 30 then order_count_30d
end order_count,
case recent_days
when 7 then order_total_amount_7d
when 30 then order_total_amount_30d
end order_total_amount,
LOCALTIMESTAMP as ts
from
(
select
dt,
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
order_count_7d,
order_count_30d,
order_total_amount_7d,
order_total_amount_30d,
Array[7,30] days
from hudi_dws.dws_trade_province_order_nd
) t1
CROSS JOIN UNNEST(days) tmp(recent_days);
13.5 Coupon Topic
13.5.1 Coupon Usage Statistics ads_coupon_stats
The requirements are as follows:
Statistical period | Granularity | Metric | Description |
Last 1 day | Coupon | Usage count | Counted only once paid |
Last 1 day | Coupon | User count | Counted only once paid |
1) Table creation statement
CREATE TABLE hudi_ads.ads_coupon_stats
(
`dt` STRING,
`coupon_id` BIGINT,
`coupon_name` VARCHAR(100),
`used_count` BIGINT,
`used_user_count` BIGINT,
`ts` TIMESTAMP(3)
) WITH (
'connector'='hudi',
'path' ='hdfs://hadoop102:8020/user/hudi/warehouse/ads/ads_coupon_stats',
'table.type'='MERGE_ON_READ',
'hoodie.datasource.write.recordkey.field' = 'dt,coupon_id,coupon_name',
'hoodie.datasource.write.precombine.field' = 'ts',
'write.bucket_assign.tasks'='1',
'write.tasks' = '4',
'compaction.tasks' = '1',
'compaction.async.enabled' = 'true',
'compaction.schedule.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_commits' = '5',
'read.streaming.enabled' = 'true',
'changelog.enabled' = 'true',
'read.streaming.skip_compaction' = 'true',
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop102:9083',
'hive_sync.db'='hive_ads',
'hive_sync.table'='ads_coupon_stats'
);
2) Data loading
insert into hudi_ads.ads_coupon_stats
select
dt,
coupon_id,
coupon_name,
cast(sum(used_count_1d) as bigint),
cast(count(*) as bigint),
LOCALTIMESTAMP as ts
from hudi_dws.dws_tool_user_coupon_coupon_used_1d
group by dt,coupon_id,coupon_name;
Chapter 14 Report Data Export
To make the data easy for reporting applications to consume, the statistics of each ADS metric need to be exported to a MySQL database.
14.1 Creating the MySQL Database and Tables
14.1.1 Create the database
CREATE DATABASE IF NOT EXISTS gmall_report DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
14.1.2 Create the tables
1) Traffic statistics by channel
DROP TABLE IF EXISTS `ads_traffic_stats_by_channel`;
CREATE TABLE `ads_traffic_stats_by_channel` (
`dt` date NOT NULL COMMENT 'statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'recent days: 1 = last 1 day, 7 = last 7 days, 30 = last 30 days',
`channel` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'channel',
`uv_count` bigint(20) NULL DEFAULT NULL COMMENT 'visitor count',
`avg_duration_sec` bigint(20) NULL DEFAULT NULL COMMENT 'average session duration in seconds',
`avg_page_count` bigint(20) NULL DEFAULT NULL COMMENT 'average pages viewed per session',
`sv_count` bigint(20) NULL DEFAULT NULL COMMENT 'session count',
`bounce_rate` decimal(16, 2) NULL DEFAULT NULL COMMENT 'bounce rate',
PRIMARY KEY (`dt`, `recent_days`, `channel`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'traffic statistics by channel' ROW_FORMAT = DYNAMIC;
2) Page path analysis
DROP TABLE IF EXISTS `ads_page_path`;
CREATE TABLE `ads_page_path` (
`dt` date NOT NULL COMMENT 'statistics date',
`source` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'source page ID of the jump',
`target` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'target page ID of the jump',
`path_count` bigint(20) NULL DEFAULT NULL COMMENT 'jump count',
PRIMARY KEY (`dt`, `source`, `target`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'page view path analysis' ROW_FORMAT = DYNAMIC;
3) User change statistics
DROP TABLE IF EXISTS `ads_user_change`;
CREATE TABLE `ads_user_change` (
`dt` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'statistics date',
`user_churn_count` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'churned user count',
`user_back_count` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'returning user count',
PRIMARY KEY (`dt`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'user change statistics' ROW_FORMAT = DYNAMIC;
4) User retention rate
DROP TABLE IF EXISTS `ads_user_retention`;
CREATE TABLE `ads_user_retention` (
`dt` date NOT NULL COMMENT 'statistics date',
`create_date` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'user signup date',
`retention_day` int(20) NOT NULL COMMENT 'days retained as of the statistics date',
`retention_count` bigint(20) NULL DEFAULT NULL COMMENT 'retained user count',
`new_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'new user count',
`retention_rate` decimal(16, 2) NULL DEFAULT NULL COMMENT 'retention rate',
PRIMARY KEY (`dt`, `create_date`, `retention_day`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'retention rate' ROW_FORMAT = DYNAMIC;
5) New and active user statistics
DROP TABLE IF EXISTS `ads_user_stats`;
CREATE TABLE `ads_user_stats` (
`dt` date NOT NULL COMMENT 'statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'recent days: 1 = last 1 day, 7 = last 7 days, 30 = last 30 days',
`new_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'new user count',
`active_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'active user count',
PRIMARY KEY (`dt`, `recent_days`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'new and active user statistics' ROW_FORMAT = DYNAMIC;
6) User behavior funnel analysis
DROP TABLE IF EXISTS `ads_user_action`;
CREATE TABLE `ads_user_action` (
`dt` date NOT NULL COMMENT 'statistics date',
`home_count` bigint(20) NULL DEFAULT NULL COMMENT 'home page visitor count',
`good_detail_count` bigint(20) NULL DEFAULT NULL COMMENT 'product detail page visitor count',
`cart_count` bigint(20) NULL DEFAULT NULL COMMENT 'cart-add user count',
`order_count` bigint(20) NULL DEFAULT NULL COMMENT 'ordering user count',
`payment_count` bigint(20) NULL DEFAULT NULL COMMENT 'paying user count',
PRIMARY KEY (`dt`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'funnel analysis' ROW_FORMAT = DYNAMIC;
7) New ordering user statistics
DROP TABLE IF EXISTS `ads_new_order_user_stats`;
CREATE TABLE `ads_new_order_user_stats` (
`dt` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`recent_days` bigint(20) NOT NULL,
`new_order_user_count` bigint(20) NULL DEFAULT NULL,
PRIMARY KEY (`recent_days`, `dt`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
8) Users ordering on 3 consecutive days within the last 7 days
DROP TABLE IF EXISTS `ads_order_continuously_user_count`;
CREATE TABLE `ads_order_continuously_user_count` (
`dt` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`recent_days` bigint(20) NOT NULL,
`order_continuously_user_count` bigint(20) NULL DEFAULT NULL,
PRIMARY KEY (`dt`, `recent_days`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
9) Repurchase rate by brand over the last 30 days
DROP TABLE IF EXISTS `ads_repeat_purchase_by_tm`;
CREATE TABLE `ads_repeat_purchase_by_tm` (
`dt` date NOT NULL COMMENT 'statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'recent days, 30 = last 30 days',
`tm_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'brand ID',
`tm_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'brand name',
`order_repeat_rate` decimal(16, 2) NULL DEFAULT NULL COMMENT 'repurchase rate',
PRIMARY KEY (`dt`, `recent_days`, `tm_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'repurchase rate by brand' ROW_FORMAT = DYNAMIC;
10) Order statistics by brand
DROP TABLE IF EXISTS `ads_order_stats_by_tm`;
CREATE TABLE `ads_order_stats_by_tm` (
`dt` date NOT NULL COMMENT 'statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'recent days: 1 = last 1 day, 7 = last 7 days, 30 = last 30 days',
`tm_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'brand ID',
`tm_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'brand name',
`order_count` bigint(20) NULL DEFAULT NULL COMMENT 'order count',
`order_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'ordering user count',
PRIMARY KEY (`dt`, `recent_days`, `tm_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'trade statistics by brand' ROW_FORMAT = DYNAMIC;
11) Order statistics by category
DROP TABLE IF EXISTS `ads_order_stats_by_cate`;
CREATE TABLE `ads_order_stats_by_cate` (
`dt` date NOT NULL COMMENT 'statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'recent days: 1 = last 1 day, 7 = last 7 days, 30 = last 30 days',
`category1_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-1 category ID',
`category1_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-1 category name',
`category2_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-2 category ID',
`category2_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-2 category name',
`category3_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-3 category ID',
`category3_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-3 category name',
`order_count` bigint(20) NULL DEFAULT NULL COMMENT 'order count',
`order_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'ordering user count',
PRIMARY KEY (`dt`, `recent_days`, `category1_id`, `category2_id`, `category3_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'trade statistics by category' ROW_FORMAT = DYNAMIC;
12) Top 3 SKUs by cart stock per category
DROP TABLE IF EXISTS `ads_sku_cart_num_top3_by_cate`;
CREATE TABLE `ads_sku_cart_num_top3_by_cate` (
`dt` date NOT NULL COMMENT 'statistics date',
`category1_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-1 category ID',
`category1_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-1 category name',
`category2_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-2 category ID',
`category2_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-2 category name',
`category3_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'level-3 category ID',
`category3_name` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'level-3 category name',
`sku_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'SKU ID',
`sku_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'SKU name',
`cart_num` bigint(20) NULL DEFAULT NULL COMMENT 'quantity of the SKU in carts',
`rk` bigint(20) NULL DEFAULT NULL COMMENT 'rank',
PRIMARY KEY (`dt`, `sku_id`, `category1_id`, `category2_id`, `category3_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'top 3 SKUs by cart stock per category' ROW_FORMAT = DYNAMIC;
13) Top 3 most-favorited SKUs per brand
DROP TABLE IF EXISTS `ads_sku_favor_count_top3_by_tm`;
CREATE TABLE `ads_sku_favor_count_top3_by_tm` (
`dt` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`tm_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`tm_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`sku_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`sku_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`favor_count` bigint(20) NULL DEFAULT NULL,
`rk` bigint(20) NULL DEFAULT NULL,
PRIMARY KEY (`dt`, `tm_id`, `sku_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
14) Average order-to-payment interval
DROP TABLE IF EXISTS `ads_order_to_pay_interval_avg`;
CREATE TABLE `ads_order_to_pay_interval_avg` (
`dt` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`order_to_pay_interval_avg` bigint(20) NULL DEFAULT NULL,
PRIMARY KEY (`dt`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
15) Order statistics by province
DROP TABLE IF EXISTS `ads_order_by_province`;
CREATE TABLE `ads_order_by_province` (
`dt` date NOT NULL COMMENT 'Statistics date',
`recent_days` bigint(20) NOT NULL COMMENT 'Recent days: 1 = last 1 day, 7 = last 7 days, 30 = last 30 days',
`province_id` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'Province ID',
`province_name` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'Province name',
`area_code` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'Area code',
`iso_code` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'ISO region code',
`iso_code_3166_2` varchar(16) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'ISO-3166-2 region code',
`order_count` bigint(20) NULL DEFAULT NULL COMMENT 'Order count',
`order_total_amount` decimal(16, 2) NULL DEFAULT NULL COMMENT 'Total order amount',
PRIMARY KEY (`dt`, `recent_days`, `province_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'Order statistics by province' ROW_FORMAT = DYNAMIC;
16) Coupon usage statistics
DROP TABLE IF EXISTS `ads_coupon_stats`;
CREATE TABLE `ads_coupon_stats` (
`dt` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'Statistics date',
`coupon_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'Coupon ID',
`coupon_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'Coupon name',
`used_count` bigint(20) NULL DEFAULT NULL COMMENT 'Usage count',
`used_user_count` bigint(20) NULL DEFAULT NULL COMMENT 'Number of distinct users who used the coupon',
PRIMARY KEY (`dt`, `coupon_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'Coupon usage statistics' ROW_FORMAT = DYNAMIC;
14.2 Data Export
DataX is used as the export tool, with HDFSReader as the reader and MySQLWriter as the writer.
14.2.1 Writing DataX Configuration Files
A DataX configuration file is required for each table. Taking ads_traffic_stats_by_channel as an example, the configuration file content is as follows:
{
"job": {
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"column": [
"*"
],
"defaultFS": "hdfs://hadoop102:8020",
"encoding": "UTF-8",
"fieldDelimiter": "\t",
"fileType": "text",
"nullFormat": "\\N",
"path": "${exportdir}"
}
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"column": [
"dt",
"recent_days",
"channel",
"uv_count",
"avg_duration_sec",
"avg_page_count",
"sv_count",
"bounce_rate"
],
"connection": [
{
"jdbcUrl": "jdbc:mysql://hadoop102:3306/gmall_report?useUnicode=true&characterEncoding=utf-8",
"table": [
"ads_traffic_stats_by_channel"
]
}
],
"password": "000000",
"username": "root",
"writeMode": "replace"
}
}
}
],
"setting": {
"errorLimit": {
"percentage": 0.02,
"record": 0
},
"speed": {
"channel": 3
}
}
}
}
Note: the reader's path parameter is deliberately not hard-coded; it must be passed in dynamically when the job is submitted, via a parameter named exportdir, as sketched below.
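The value reaches ${exportdir} through DataX's -p "-Dkey=value" substitution mechanism. A generic invocation sketch (both paths here are placeholders, not real project paths):
python /opt/module/datax/bin/datax.py -p"-Dexportdir=/hdfs/source/path" /path/to/config.json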
14.2.2 DataX Configuration File Generator Script
For convenience, a script is provided here to batch-generate the DataX configuration files. Its content and usage are described below.
1) Create the gen_export_config.py script in the ~/bin directory
[atguigu@hadoop102 bin]$ vim ~/bin/gen_export_config.py
The script content is as follows
# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb
# MySQL connection settings; adjust to your environment
mysql_host = "hadoop102"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "000000"
# HDFS NameNode settings; adjust to your environment
hdfs_nn_host = "hadoop102"
hdfs_nn_port = "8020"
# Output directory for the generated configuration files; adjust as needed
output_path = "/opt/module/datax/job/export"
def get_connection():
    # Open a connection to MySQL for reading table metadata
    return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)

def get_mysql_meta(database, table):
    # Fetch column names and types from information_schema, in table definition order
    connection = get_connection()
    cursor = connection.cursor()
    sql = "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION"
    cursor.execute(sql, [database, table])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall

def get_mysql_columns(database, table):
    # Keep only the column names; list() keeps the result JSON-serializable on Python 3 as well
    return list(map(lambda x: x[0], get_mysql_meta(database, table)))

def generate_json(target_database, target_table):
    # Assemble the DataX job: HDFSReader with a parameterized path, MySQLWriter with the table's columns
    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "hdfsreader",
                    "parameter": {
                        "path": "${exportdir}",
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "column": ["*"],
                        "fileType": "text",
                        "encoding": "UTF-8",
                        "fieldDelimiter": "\t",
                        "nullFormat": "\\N"
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "parameter": {
                        "writeMode": "replace",
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": get_mysql_columns(target_database, target_table),
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + target_database + "?useUnicode=true&characterEncoding=utf-8",
                                "table": [target_table]
                            }
                        ]
                    }
                }
            }]
        }
    }
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open(os.path.join(output_path, ".".join([target_database, target_table, "json"])), "w") as f:
        json.dump(job, f)

def main(args):
    target_database = ""
    target_table = ""
    options, arguments = getopt.getopt(args, 'd:t:', ['targetdb=', 'targettbl='])
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--targetdb'):
            target_database = opt_value
        if opt_name in ('-t', '--targettbl'):
            target_table = opt_value
    generate_json(target_database, target_table)

if __name__ == '__main__':
    main(sys.argv[1:])
Notes:
(1) Install the Python MySQL driver
Since the script accesses the MySQL database from Python, the driver must be installed first:
[atguigu@hadoop102 bin]$ sudo yum install -y MySQL-python
(2) Script usage
python gen_export_config.py -d database -t table
Pass the MySQL database name via -d and the MySQL table name via -t; running the command above generates the DataX export configuration file for that table, as in the example below.
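For example, a one-off run for the ads_coupon_stats table (with the defaults configured above) writes /opt/module/datax/job/export/gmall_report.ads_coupon_stats.json:
[atguigu@hadoop102 bin]$ python ~/bin/gen_export_config.py -d gmall_report -t ads_coupon_stats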
2) Create the gen_export_config.sh script in the ~/bin directory
[atguigu@hadoop102 bin]$ vim ~/bin/gen_export_config.sh
The script content is as follows
#!/bin/bash
python ~/bin/gen_export_config.py -d gmall_report -t ads_coupon_stats
python ~/bin/gen_export_config.py -d gmall_report -t ads_new_order_user_stats
python ~/bin/gen_export_config.py -d gmall_report -t ads_order_by_province
python ~/bin/gen_export_config.py -d gmall_report -t ads_order_continuously_user_count
python ~/bin/gen_export_config.py -d gmall_report -t ads_order_stats_by_cate
python ~/bin/gen_export_config.py -d gmall_report -t ads_order_stats_by_tm
python ~/bin/gen_export_config.py -d gmall_report -t ads_order_to_pay_interval_avg
python ~/bin/gen_export_config.py -d gmall_report -t ads_page_path
python ~/bin/gen_export_config.py -d gmall_report -t ads_repeat_purchase_by_tm
python ~/bin/gen_export_config.py -d gmall_report -t ads_sku_cart_num_top3_by_cate
python ~/bin/gen_export_config.py -d gmall_report -t ads_sku_favor_count_top3_by_tm
python ~/bin/gen_export_config.py -d gmall_report -t ads_traffic_stats_by_channel
python ~/bin/gen_export_config.py -d gmall_report -t ads_user_action
python ~/bin/gen_export_config.py -d gmall_report -t ads_user_change
python ~/bin/gen_export_config.py -d gmall_report -t ads_user_retention
python ~/bin/gen_export_config.py -d gmall_report -t ads_user_stats
3) Grant execute permission to the gen_export_config.sh script
[atguigu@hadoop102 bin]$ chmod +x ~/bin/gen_export_config.sh
4) Run the gen_export_config.sh script to generate the configuration files
[atguigu@hadoop102 bin]$ gen_export_config.sh
5) Inspect the generated configuration files
[atguigu@hadoop102 bin]$ ls /opt/module/datax/job/export/
gmall_report.ads_coupon_stats.json                   gmall_report.ads_repeat_purchase_by_tm.json
gmall_report.ads_new_order_user_stats.json           gmall_report.ads_sku_cart_num_top3_by_cate.json
gmall_report.ads_order_by_province.json              gmall_report.ads_sku_favor_count_top3_by_tm.json
gmall_report.ads_order_continuously_user_count.json  gmall_report.ads_traffic_stats_by_channel.json
gmall_report.ads_order_stats_by_cate.json            gmall_report.ads_user_action.json
gmall_report.ads_order_stats_by_tm.json              gmall_report.ads_user_change.json
gmall_report.ads_order_to_pay_interval_avg.json      gmall_report.ads_user_retention.json
gmall_report.ads_page_path.json                      gmall_report.ads_user_stats.json
14.2.3 Testing a Generated DataX Configuration File
Taking ads_traffic_stats_by_channel as an example, test whether the script-generated configuration file is usable.
1) Run the DataX export job
[atguigu@hadoop102 bin]$ python /opt/module/datax/bin/datax.py -p"-Dexportdir=/warehouse/gmall/ads/ads_traffic_stats_by_channel" /opt/module/datax/job/export/gmall_report.ads_traffic_stats_by_channel.json
2) Check the result
Check whether data has appeared in the MySQL target table; a spot check is sketched below.
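A minimal spot check from the shell, assuming the MySQL credentials configured earlier (root / 000000):
[atguigu@hadoop102 bin]$ mysql -hhadoop102 -uroot -p000000 gmall_report -e "SELECT COUNT(*) FROM ads_traffic_stats_by_channel;"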
14.2.4 Writing the Daily Export Script
(1) Create hdfs_to_mysql.sh in the /home/atguigu/bin directory on hadoop102
[atguigu@hadoop102 bin]$ vim hdfs_to_mysql.sh
(2) Write the following content
#! /bin/bash
DATAX_HOME=/opt/module/datax
# A DataX export path must not contain empty files; this function cleans them up
handle_export_path(){
for i in `hadoop fs -ls -R $1 | awk '{print $8}'`; do
hadoop fs -test -z $i
if [[ $? -eq 0 ]]; then
echo "$i文件大小为0,正在删除"
hadoop fs -rm -r -f $i
fi
done
}
# Export data for a single table
export_data() {
datax_config=$1
export_dir=$2
handle_export_path $export_dir
$DATAX_HOME/bin/datax.py -p"-Dexportdir=$export_dir" $datax_config
}
case $1 in
"ads_coupon_stats")
export_data /opt/module/datax/job/export/gmall_report.ads_coupon_stats.json /warehouse/gmall/ads/ads_coupon_stats
;;
"ads_new_order_user_stats")
export_data /opt/module/datax/job/export/gmall_report.ads_new_order_user_stats.json /warehouse/gmall/ads/ads_new_order_user_stats
;;
"ads_order_by_province")
export_data /opt/module/datax/job/export/gmall_report.ads_order_by_province.json /warehouse/gmall/ads/ads_order_by_province
;;
"ads_order_continuously_user_count")
export_data /opt/module/datax/job/export/gmall_report.ads_order_continuously_user_count.json /warehouse/gmall/ads/ads_order_continuously_user_count
;;
"ads_order_stats_by_cate")
export_data /opt/module/datax/job/export/gmall_report.ads_order_stats_by_cate.json /warehouse/gmall/ads/ads_order_stats_by_cate
;;
"ads_order_stats_by_tm")
export_data /opt/module/datax/job/export/gmall_report.ads_order_stats_by_tm.json /warehouse/gmall/ads/ads_order_stats_by_tm
;;
"ads_order_to_pay_interval_avg")
export_data /opt/module/datax/job/export/gmall_report.ads_order_to_pay_interval_avg.json /warehouse/gmall/ads/ads_order_to_pay_interval_avg
;;
"ads_page_path")
export_data /opt/module/datax/job/export/gmall_report.ads_page_path.json /warehouse/gmall/ads/ads_page_path
;;
"ads_repeat_purchase_by_tm")
export_data /opt/module/datax/job/export/gmall_report.ads_repeat_purchase_by_tm.json /warehouse/gmall/ads/ads_repeat_purchase_by_tm
;;
"ads_sku_cart_num_top3_by_cate")
export_data /opt/module/datax/job/export/gmall_report.ads_sku_cart_num_top3_by_cate.json /warehouse/gmall/ads/ads_sku_cart_num_top3_by_cate
;;
"ads_sku_favor_count_top3_by_tm")
export_data /opt/module/datax/job/export/gmall_report.ads_sku_favor_count_top3_by_tm.json /warehouse/gmall/ads/ads_sku_favor_count_top3_by_tm
;;
"ads_traffic_stats_by_channel")
export_data /opt/module/datax/job/export/gmall_report.ads_traffic_stats_by_channel.json /warehouse/gmall/ads/ads_traffic_stats_by_channel
;;
"ads_user_action")
export_data /opt/module/datax/job/export/gmall_report.ads_user_action.json /warehouse/gmall/ads/ads_user_action
;;
"ads_user_change")
export_data /opt/module/datax/job/export/gmall_report.ads_user_change.json /warehouse/gmall/ads/ads_user_change
;;
"ads_user_retention")
export_data /opt/module/datax/job/export/gmall_report.ads_user_retention.json /warehouse/gmall/ads/ads_user_retention
;;
"ads_user_stats")
export_data /opt/module/datax/job/export/gmall_report.ads_user_stats.json /warehouse/gmall/ads/ads_user_stats
;;
"all")
export_data /opt/module/datax/job/export/gmall_report.ads_coupon_stats.json /warehouse/gmall/ads/ads_coupon_stats
export_data /opt/module/datax/job/export/gmall_report.ads_new_order_user_stats.json /warehouse/gmall/ads/ads_new_order_user_stats
export_data /opt/module/datax/job/export/gmall_report.ads_order_by_province.json /warehouse/gmall/ads/ads_order_by_province
export_data /opt/module/datax/job/export/gmall_report.ads_order_continuously_user_count.json /warehouse/gmall/ads/ads_order_continuously_user_count
export_data /opt/module/datax/job/export/gmall_report.ads_order_stats_by_cate.json /warehouse/gmall/ads/ads_order_stats_by_cate
export_data /opt/module/datax/job/export/gmall_report.ads_order_stats_by_tm.json /warehouse/gmall/ads/ads_order_stats_by_tm
export_data /opt/module/datax/job/export/gmall_report.ads_order_to_pay_interval_avg.json /warehouse/gmall/ads/ads_order_to_pay_interval_avg
export_data /opt/module/datax/job/export/gmall_report.ads_page_path.json /warehouse/gmall/ads/ads_page_path
export_data /opt/module/datax/job/export/gmall_report.ads_repeat_purchase_by_tm.json /warehouse/gmall/ads/ads_repeat_purchase_by_tm
export_data /opt/module/datax/job/export/gmall_report.ads_sku_cart_num_top3_by_cate.json /warehouse/gmall/ads/ads_sku_cart_num_top3_by_cate
export_data /opt/module/datax/job/export/gmall_report.ads_sku_favor_count_top3_by_tm.json /warehouse/gmall/ads/ads_sku_favor_count_top3_by_tm
export_data /opt/module/datax/job/export/gmall_report.ads_traffic_stats_by_channel.json /warehouse/gmall/ads/ads_traffic_stats_by_channel
export_data /opt/module/datax/job/export/gmall_report.ads_user_action.json /warehouse/gmall/ads/ads_user_action
export_data /opt/module/datax/job/export/gmall_report.ads_user_change.json /warehouse/gmall/ads/ads_user_change
export_data /opt/module/datax/job/export/gmall_report.ads_user_retention.json /warehouse/gmall/ads/ads_user_retention
export_data /opt/module/datax/job/export/gmall_report.ads_user_stats.json /warehouse/gmall/ads/ads_user_stats
;;
esac
(3) Grant the script execute permission
[atguigu@hadoop102 bin]$ chmod +x hdfs_to_mysql.sh
(4) Script usage
[atguigu@hadoop102 bin]$ hdfs_to_mysql.sh all
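To export a single table instead of all of them, pass its name as the argument; any of the case branches above works, for example:
[atguigu@hadoop102 bin]$ hdfs_to_mysql.sh ads_order_by_province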