数仓dwd层事件日志dwd_base_event_log解析生成若干表

需要创建的表

dwd_display_log 商品点击表
dwd_newsdetail_log 商品详情页表
dwd_loading_log 商品列表页表
dwd_ad_log 广告表
dwd_notification_log 消息通知表
dwd_active_background_log 用户后台活跃表
dwd_comment_log 评论表
dwd_favorites_log 收藏表
dwd_praise_log 点赞表
dwd_error_log 错误日志表

商品详情页表

#创建表dwd_newsdetail_log
drop table if exists dwd_newsdetail_log; 
create external table dwd_newsdetail_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`entry` string, 
`action` string, 
`goodsid` string, 
`showtype` string, 
`news_staytime` string,
`loading_time` string, 
`type1` string, 
`category` string, 
`server_time` string
) 
partitioned by (`dt` string) stored as parquet 
location '/warehouse/gmall/dwd/dwd_newsdetail_log/'; 

#解析并导入数据
insert overwrite table dwd_newsdetail_log
partition(`dt`='2020-02-01')
select 
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.entry') entry, get_json_object(event_json,'$.kv.action') action, get_json_object(event_json,'$.kv.goodsid') goodsid, get_json_object(event_json,'$.kv.showtype') showtype, get_json_object(event_json,'$.kv.news_staytime') news_staytime, get_json_object(event_json,'$.kv.loading_time') loading_time, get_json_object(event_json,'$.kv.type1') type1, get_json_object(event_json,'$.kv.category') category, 
server_time 
from dwd_base_event_log
where `dt`='2020-02-01' and event_name='newsdetail';

#查询导入结果
select * from dwd_newsdetail_log limit 2;

商品列表页表

drop table if exists dwd_loading_log;
create external table dwd_loading_log(
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string,
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`action` string, 
`loading_time` string, 
`loading_way` string, 
`extend1` string, 
`extend2` string, 
`type` string, 
`type1` string, 
`server_time` string
) 
partitioned by (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_loading_log';

#导入数据
insert overwrite table dwd_loading_log
partition(`dt`='2020-02-01')
select
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.action') action, get_json_object(event_json,'$.kv.loading_time') loading_time, get_json_object(event_json,'$.kv.loading_way') loading_way, get_json_object(event_json,'$.kv.extend1') extend1, get_json_object(event_json,'$.kv.extend2') extend2, get_json_object(event_json,'$.kv.type') type, get_json_object(event_json,'$.kv.type1') type1, 
server_time 
from dwd_base_event_log
where `dt`='2020-02-01' and event_name='loading';

广告表

drop table if exists  dwd_ad_log;
create external table  dwd_ad_log(
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`entry` string, 
`action` string, 
`contentType` string, 
`displayMills` string, 
`itemId` string, 
`activityId` string, 
`server_time` string
) 
partitioned by (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_ad_log';

#导入数据
insert overwrite table dwd_ad_log
partition (`dt`='2020-02-01')
select
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.entry') entry, get_json_object(event_json,'$.kv.action') action, get_json_object(event_json,'$.kv.contentType') contentType, get_json_object(event_json,'$.kv.displayMills') displayMills, get_json_object(event_json,'$.kv.itemId') itemId, get_json_object(event_json,'$.kv.activityId') activityId, 
server_time
from dwd_base_event_log
where `dt`='2020-02-01' and event_name='ad';

消息通知表

#创建表dwd_notification_log
drop table if exists dwd_notification_log; 
create external table dwd_notification_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, `
app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`action` string, 
`noti_type` string, 
`ap_time` string, 
`content` string, 
`server_time` string 
) 
partitioned by (dt string) stored as parquet 
location '/warehouse/gmall/dwd/dwd_notification_log/';

#导入数据
insert overwrite table dwd_notification_log 
partition(dt='2020-02-01') 
select mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network,
lng, lat, 
get_json_object(event_json,'$.kv.action') action, get_json_object(event_json,'$.kv.noti_type') noti_type, get_json_object(event_json,'$.kv.ap_time') ap_time, get_json_object(event_json,'$.kv.content') content, 
server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='notification'; 

用户后台活跃表

#创建表
drop table if exists dwd_active_background_log; 
create external table dwd_active_background_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`active_source` string, 
`server_time` string 
) 
partitioned by (`dt` string) 
stored as parquet 
location '/warehouse/gmall/dwd/dwd_background_log/';
 
#导入数据
insert overwrite table dwd_active_background_log 
partition (dt='2020-02-01') 
select 
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, model,
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.active_source') active_source, server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='active_background'; 

评论表

#创建表
drop table if exists dwd_comment_log; 
create external table dwd_comment_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`comment_id` int, 
`userid` int, 
`p_comment_id` int, 
`content` string, 
`addtime` string, 
`other_id` int, 
`praise_count` int, 
`reply_count` int, 
`server_time` string 
) 
partitioned by (`dt` string) 
stored as parquet 
location '/warehouse/gmall/dwd/dwd_comment_log/'; 

#导入数据
insert overwrite table dwd_comment_log 
partition (dt='2020-02-01') 
select
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.comment_id') comment_id, get_json_object(event_json,'$.kv.userid') userid, get_json_object(event_json,'$.kv.p_comment_id') p_comment_id, get_json_object(event_json,'$.kv.content') content, get_json_object(event_json,'$.kv.addtime') addtime, get_json_object(event_json,'$.kv.other_id') other_id, get_json_object(event_json,'$.kv.praise_count') praise_count, get_json_object(event_json,'$.kv.reply_count') reply_count, 
server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='comment'; 

收藏表

#创建表
drop table if exists dwd_favorites_log; 
CREATE EXTERNAL TABLE dwd_favorites_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`id` int, 
`course_id` int, 
`userid` int, 
`add_time` string,
`server_time` string
) 
partitioned by (`dt` string) 
stored as parquet 
location '/warehouse/gmall/dwd/dwd_favorites_log/';
 
#导入数据
insert overwrite table dwd_favorites_log 
partition (`dt`='2020-02-01') 
select 
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area,
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.id') id, get_json_object(event_json,'$.kv.course_id') course_id, get_json_object(event_json,'$.kv.userid') userid, get_json_object(event_json,'$.kv.add_time') add_time, 
server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='favorites'; 

点赞表

#创建表
drop table if exists dwd_praise_log; 
CREATE EXTERNAL TABLE dwd_praise_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string,
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`id` string, 
`userid` string, 
`target_id` string, 
`type` string, 
`add_time` string, 
`server_time` string
) 
partitioned by(`dt` string) 
stored as parquet 
location '/warehouse/gmall/dwd/dwd_praise_log/'; 

#导入数据
insert overwrite table dwd_praise_log 
PARTITION (dt='2020-02-01') 
select 
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.id') id, get_json_object(event_json,'$.kv.userid') userid, get_json_object(event_json,'$.kv.target_id') target_id, get_json_object(event_json,'$.kv.type') type, get_json_object(event_json,'$.kv.add_time') add_time, 
server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='praise'; 

错误日志表

#创建表
drop table if exists dwd_error_log; 
CREATE EXTERNAL TABLE dwd_error_log( 
`mid_id` string, 
`user_id` string, 
`version_code` string, 
`version_name` string,
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string, 
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`errorBrief` string, 
`errorDetail` string, 
`server_time` string
) 
partitioned by (`dt` string) 
stored as parquet 
location '/warehouse/gmall/dwd/dwd_error_log/';
 
#导入数据
insert overwrite table dwd_error_log 
partition(`dt`='2020-02-01') 
select 
mid_id, 
user_id, 
version_code, 
version_name, 
lang, 
source, 
os, 
area, 
model, 
brand, 
sdk_version, 
gmail, 
height_width, 
app_time, 
network, 
lng, 
lat, 
get_json_object(event_json,'$.kv.errorBrief') errorBrief, get_json_object(event_json,'$.kv.errorDetail') errorDetail, 
server_time 
from dwd_base_event_log 
where dt='2020-02-01' and event_name='error'; 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
一、课程简介随着技术的飞速发展,经过多年的数据积累,各互联网公司已保存了海量的原始数据和各种业务数据,所以数据仓库技术是各大公司目前都需要着重发展投入的技术领域。数据仓库是面向分析的集成化数据环境,为企业所有决策制定过程,提供系统数据支持的战略集合。通过对数据仓库中数据的分析,可以帮助企业改进业务流程、控制成本、提高产品质量等。二、课程内容本次精心打造的数仓项目的课程,从项目架构的搭建,到数据采集模块的设计、数仓架构的设计、实战需求实现、即席查询的实现,我们针对国内目前广泛使用的Apache原生框架和CDH版本框架进行了分别介绍,Apache原生框架介绍中涉及到的技术框架包括Flume、Kafka、Sqoop、MySql、HDFS、Hive、Tez、Spark、Presto、Druid等,CDH版本框架讲解包括CM的安装部署、Hadoop、Zookeeper、Hive、Flume、Kafka、Oozie、Impala、HUE、Kudu、Spark的安装配置,透彻了解不同版本框架的区别联系,将大数据全生态系统前沿技术一网打尽。在过程中对大数据生态体系进行了系统的讲解,对实际企业数仓项目中可能涉及到的技术点都进行了深入的讲解和探讨。同时穿插了大量数仓基础理论知识,让你在掌握实战经验的同时能够打下坚实的理论基础。三、课程目标本课程以国内电商巨头实际业务应用场景为依托,对电商数仓的常见实战指标以及难点实战指标进行了详尽讲解,具体指标包括:每日、周、月活跃设备明细,留存用户比例,沉默用户、回流用户、流失用户统计,最近连续3周活跃用户统计,最近7天内连续3天活跃用户统计,GMV成交总额分析,转化率及漏斗分析,品牌复购率分析、订单拉链的设计等,让学生拥有更直观全面的实战经验。通过对本课程的学习,对数仓项目可以建立起清晰明确的概念,系统全面的掌握各项数仓项目技术,轻松应对各种数仓难题。四、课程亮点本课程结合国内多家企业实际项目经验,特别加入了项目架构模块,从集群规模的确定到框架版本选型以及服务器选型,手把手教你从零开始搭建大数据集群。并且总结大量项目实战中会遇到的问题,针对各个技术框架,均有调优实战经验,具体包括:常用Linux运维命令、Hadoop集群调优、Flume组件选型及性能优化、Kafka集群规模确认及关键参数调优。通过这部分学习,助学生迅速成长,获取前沿技术经验,从容解决实战问题。
本教程为授权出品 一、课程简介数据仓库(Data Warehouse,可简写为DW或DWH),是面向分析的集成化数据环境,为企业决策制定过程,提供系统数据支持的战略集合,是国内外各大公司正在重点投入的战略级技术领域。 二、课程内容《大数据电商数仓项目实战》视频教程,从项目架构的搭建,到数据采集模块的设计、数仓架构的设计、实战需求实现、即席查询的实现,我们针对国内目前广泛使用的Apache原生框架和CDH版本框架进行了分别介绍,Apache原生框架介绍中涉及到的技术框架包括Flume、Kafka、Sqoop、MySql、HDFS、Hive、Tez、Spark、Presto、Druid等,CDH版本框架讲解包括CM的安装部署、Hadoop、Zookeeper、Hive、Flume、Kafka、Oozie、Impala、HUE、Kudu、Spark的安装配置,透彻了解不同版本框架的区别联系,将大数据全生态系统前沿技术一网打尽。在过程中对大数据生态体系进行了系统的讲解,对实际企业数仓项目中可能涉及到的技术点都进行了深入的讲解和探讨。同时穿插了大量数仓基础理论知识,让你在掌握实战经验的同时能够打下坚实的理论基础。 三、课程目标本课程以国内电商巨头实际业务应用场景为依托,对电商数仓的常见实战指标以及难点实战指标进行了详尽讲解,具体指标包括:每日、周、月活跃设备明细,留存用户比例,沉默用户、回流用户、流失用户统计,最近连续3周活跃用户统计,最近7天内连续3天活跃用户统计,GMV成交总额分析,转化率及漏斗分析,品牌复购率分析、订单拉链的设计等,让学生拥有更直观全面的实战经验。通过对本课程的学习,对数仓项目可以建立起清晰明确的概念,系统全面的掌握各项数仓项目技术,轻松应对各种数仓难题。 四、课程亮点本课程结合国内多家企业实际项目经验,特别加入了项目架构模块,从集群规模的确定到框架版本选型以及服务器选型,手把手教你从零开始搭建大数据集群。并且总结大量项目实战中会遇到的问题,针对各个技术框架,均有调优实战经验,具体包括:常用Linux运维命令、Hadoop集群调优、Flume组件选型及性能优化、Kafka集群规模确认及关键参数调优。通过这部分学习,助学生迅速成长,获取前沿技术经验,从容解决实战问题。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值