数据ETL综合案例

一、建表,保存原始数据

create table ori_data(
 cid_sn string,
 mobile_data_type string,
 os_ver string,
 mac string,
 resolution string,
 commit_time string,
 sdk_ver string,
 device_id_type string,
 city string,
 android_id string,
 device_model string,
 carrier string,
 promotion_channel string,
 app_ver_name string,
 imei string,
 app_ver_code string,
 pid string,
 net_type string,
 device_id string,
 app_device_id string,
 release_channel string,
 country string,
 time_zone string,
 os_name string,
 manufacture string,
 commit_id string,
 app_token string,
 account string,
 app_id string,
 build_num string,
 language string,
 uid string
)
partitioned by (dt string)
row format delimited fields terminated by ',';

load data local inpath '/root/etlWork/20170101' into table ori_data partition (dt='20170101');
load data local inpath '/root/etlWork/20170102' into table ori_data partition (dt='20170102');

//测试
select * from ori_data where dt="20170101" limit 10;

二、每天的活跃用户
1、思路:使用函数row_number按用户(uid)分组事件标记排序(commit_id),获得rank,去除重复的id(获取rank=1的值),获得的用户就是日活跃用户
2、建立用户存储活跃用户的表

create table act_user(
uid string,
commit_time string,
city string,
release_channel string,
app_ver_name string
)
partitioned by (dt string);

3、向活跃用户表添加值

insert into table act_user partition (dt="20170101")
select tmp.uid,tmp.commit_time,tmp.city,tmp.release_channel,tmp.app_ver_name from
(select uid,commit_time,city,release_channel,app_ver_name,
row_number() over(partition by uid order by commit_id desc) as rank
from ori_data where dt="20170101") tmp where tmp.rank=1; 

insert into table act_user partition (dt="20170102")
select tmp.uid,tmp.commit_time,tmp.city,tmp.release_channel,tmp.app_ver_name from
(select uid,commit_time,city,release_channel,app_ver_name,
row_number() over(partition by uid order by commit_id desc) as rank
from ori_data where dt="20170102") tmp where tmp.rank=1;

4、分维度进行统计每天的活跃用户

create table act_user_dim(
city string,
release_channel string,
app_ver_name string,
active_user_cnt int
)
partitioned by (dt string,flag string);

from act_user
insert into table act_user_dim partition (dt="20170101",flag='000')
select "all","all","all",count(1) where dt="20170101"
insert into table act_user_dim partition (dt="20170101",flag="100")
select city,"all","all",count(1) where dt="20170101" group by city
insert into table act_user_dim partition (dt="20170101",flag="110")
select city,release_channel,"all",count(1) where dt="20170101" group by city,release_channel
insert into table act_user_dim partition (dt="20170101",flag="101")
select city,"all",app_ver_name,count(1) where dt="20170101" group by city,app_ver_name
insert into table act_user_dim partition (dt="20170101",flag="111")
select city,release_channel,app_ver_name,count(1) where dt="20170101" group by city,release_channel,app_ver_name
insert into table act_user_dim partition (dt="20170101",flag="010")
select "all",release_channel,"all",count(1) where dt="20170101" group by release_channel
insert into table act_user_dim partition (dt="20170101",flag="011")
select "all",release_channel,app_ver_name,count(1) where dt="20170101" group by release_channel,app_ver_name
insert into table act_user_dim partition (dt="20170101",flag="001")
select "all","all",app_ver_name,count(1) where dt="20170101" group by app_ver_name;


from act_user
insert into table act_user_dim partition (dt="20170102",flag='000')
select "all","all","all",count(1) where dt="20170102"
insert into table act_user_dim partition (dt="20170102",flag="100")
select city,"all","all",count(1) where dt="20170102" group by city
insert into table act_user_dim partition (dt="20170102",flag="110")
select city,release_channel,"all",count(1) where dt="20170102" group by city,release_channel
insert into table act_user_dim partition (dt="20170102",flag="101")
select city,"all",app_ver_name,count(1) where dt="20170102" group by city,app_ver_name
insert into table act_user_dim partition (dt="20170102",flag="111")
select city,release_channel,app_ver_name,count(1) where dt="20170102" group by city,release_channel,app_ver_name
insert into table act_user_dim partition (dt="20170102",flag="010")
select "all",release_channel,"all",count(1) where dt="20170102" group by release_channel
insert into table act_user_dim partition (dt="20170102",flag="011")
select "all",release_channel,app_ver_name,count(1) where dt="20170102" group by release_channel,app_ver_name
insert into table act_user_dim partition (dt="20170102",flag="001")
select "all","all",app_ver_name,count(1) where dt="20170102" group by app_ver_name;

注:多重插入(读一次数据,把多个sql逻辑执行)不能同时往一个分区内写数据。

三、新增用户
思路:获取ori_data原始数据表内容,与历史用户表进行左外连接,获取历史表中uid为空的数据存入新增用户表中,将新增用户存入历史用户中
1、历史用户表

create table his_user(
uid string
);

2、每日新增用户表

create table new_user(
uid string,
commit_time string,
city string,
release_channel string,
app_ver_name string
)
partitioned by (dt string);

3、获取ori_data原始数据表内容,与历史用户表进行左外连接,获取历史表中uid为空的数据存入新增用户表中,将新增用户存入历史用户中

insert into table new_user partition (dt="20170101") 
select a.uid,a.commit_time,a.city,a.release_channel,a.app_ver_name 
from ori_data a left join his_user b on a.uid=b.uid where dt="20170101" and b.uid is null;

insert into table new_user partition (dt="20170102") 
select a.uid,a.commit_time,a.city,a.release_channel,a.app_ver_name 
from ori_data a left join his_user b on a.uid=b.uid where dt="20170102" and b.uid is null;

4、将新增用户信息添加至历史用户表中

insert into table his_user
select uid from new_user where dt="20170101";

insert into table his_user
select uid from new_user where dt="20170102";

5、分维度进行统计
创建一个根据维度统计的新增用户表

create table new_user_dim(
city string,
release_channel string,
app_ver_name string,
new_user_count int
)
partitioned by (dt string,flag string);


from new_user
insert into table new_user_dim partition(dt="20170101",flag="000")
select "all","all","all",count(1)
insert into table new_user_dim partition(dt="20170101",flag="100")
select city,"all","all",count(1) group by city
insert into table new_user_dim partition(dt="20170101",flag="110")
select city,release_channel,"all",count(1) group by city,release_channel
insert into table new_user_dim partition(dt="20170101",flag="101")
select city,"all",app_ver_name,count(1) group by city,app_ver_name
insert into table new_user_dim partition(dt="20170101",flag="111")
select city,release_channel,app_ver_name,count(1) group by city,app_ver_name,release_channel
insert into table new_user_dim partition(dt="20170101",flag="010")
select "all",release_channel,"all",count(1) group by release_channel
insert into table new_user_dim partition(dt="20170101",flag="011")
select "all",release_channel,app_ver_name,count(1) group by release_channel,app_ver_name
insert into table new_user_dim partition(dt="20170101",flag="001")
select "all","all",app_ver_name,count(1) group by app_ver_name;
  • 3
    点赞
  • 32
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值