MapReduce数据清洗案例

最新推荐文章于 2023-10-17 21:22:38 发布

weixin_44090237

最新推荐文章于 2023-10-17 21:22:38 发布

阅读量2.9k

点赞数

本文链接：https://blog.csdn.net/weixin_44090237/article/details/89312825

版权

原始建标语句：

create table user_log(
userId string,
country string,
device_model string,
city string,
app_token string,
pid string,
language string,
commit_time string,
device_id_type string,
resolution string,
mac string,
app_ver_code string,
release_channel string,
cid_sn string,
app_id string,
os_ver string,
device_id string,
net_type string,
promotion_channel string,
app_device_id string,
time_zone string,
manufacture string,
carrier string,
build_num string,
imei string,
os_name string ,
mobile_data_type string,
sdk_ver string,
android_id string,
commit_id string,
app_ver_name string,
account string)
partitioned by(dt string,os string)
row format delimited
fields terminated by ‘,’;

导入数据

load data inpath’/android1/log2’ into table user_log partition (dt=‘20190414’,os=‘android’);
load data inpath’/ios1/log3’ into table user_log partition (dt=‘20190414’,os=‘ios’);

创建日活跃用户表

create table ods_app_active1_log(
userId string,
country string,
device_model string,
city string,
app_token string,
pid string,
language string,
commit_time string,
device_id_type string,
resolution string,
mac string,
app_ver_code string,
release_channel string,
cid_sn string,
app_id string,
os_ver string,
device_id string,
net_type string,
promotion_channel string,
app_device_id string,
time_zone string,
manufacture string,
carrier string,
build_num string,
imei string,
os_name string ,
mobile_data_type string,
sdk_ver string,
android_id string,
commit_id string,
app_ver_name string,
account string)
partitioned by(dt string)
row format delimited
fields terminated by ‘,’;

给日活用户添加数据

insert into table ods_app_active_log partition(dt = ‘20190414’) select
userId,
country,
device_model,
city,
app_token,
pid,
language,
commit_time,
device_id_type,
resolution,
mac,
app_ver_code,
release_channel,
cid_sn,
app_id,
os_ver,
device_id,
net_type,
promotion_channel,
app_device_id,
time_zone,
manufacture,
carrier,
build_num,
imei,
os_name,
mobile_data_type,
sdk_ver,
android_id ,
commit_id ,
app_ver_name ,
account
from (select * ,row_number() over (partition by userId order by commit_time desc) as rank from user_log where dt = ‘20190414’)tmp where rank=1;

思路： a、应该建立一个历史用户表（只存user_id）

   b、将当日的活跃用户去 比对  历史用户表， 就知道哪些人是今天新出现的用户 --> 当日新增用户
   
   c、将当日新增用户追加到历史用户表

创建新增用户表

create table userid_new_add (userId string) partitioned by (dt string);

创建一个存放所有用户ID的表

create table userid_history (userId string) partitioned by (dt string);

从第一天的数据里取出userId放进存放所有用户ID表里
insert into table userid_history partition(dt=‘20190413’) select distinct(userid) from ods_app_active_log where dt=‘20190413’;

查出增加用户的id

insert into userid_new_add partition (dt = ‘20190414’)

select a.userid from ods_app_active_log a left join userid_history b on a.userid = b.userid where a.dt = ‘20190414’ and b.userid is null;

– 1 日新维度统计报表–数据建模
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

– 2 日新维度统计报表sql开发(利用多重插入语法)
from ods_app_active_log
insert into table dim_user_new_day partition(dt=‘2019-04-14’,dim=‘0000’)
select ‘all’,‘all’,‘all’,‘all’,count(1)
where dt=‘2019-04-14’

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0001’)
select ‘all’,‘all’,‘all’,app_ver_name,count(1)
where day=‘2017-09-21’
group by app_ver_name

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0010’)
select ‘all’,‘all’,release_channel,‘all’,count(1)
where day=‘2017-09-21’
group by release_channel

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0011’)
select ‘all’,‘all’,release_channel,app_ver_name,count(1)
where day=‘2017-09-21’
group by release_channel,app_ver_name

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0100’)
select ‘all’,city,‘all’,‘all’,count(1)
where day=‘2017-09-21’
group by city；

次日留存用户：第一天的活跃用户与第二天的活跃用户相比多出来的用户叫新增用户，新增用户与第三天的活跃用户相比，用 join ,还存在的叫次日留存用户。

创建表：
create table userid_liucun(userId string) partitioned by (dt string);

统计实现

insert into table userid_liucun partition (dt=‘20190416’)
select b.userid from userid_new_add a join ods_app_active_log b on a.userid = b.userid where b.dt = ‘20190416’ and a.dt = ‘20190414’;

/* 用左半连接效率略高************************************** */
insert into table etl_user_keepalive_nextday partition(day=‘2017-09-22’)
select
sdk_ver
,time_zone
,commit_id
,commit_time
,pid
,app_token
,app_id
,device_id
,device_id_type
,release_channel
,app_ver_name
,app_ver_code
,os_name
,os_ver
,language
,country
,manufacture
,device_model
,resolution
,net_type
,account
,app_device_id
,mac
,android_id
,imei
,cid_sn
,build_num
,mobile_data_type
,promotion_channel
,carrier
,city
,user_id
from etl_user_new_day a left semi join etl_user_active_day b
on a.user_id = b.user_id and a.day=‘2017-09-21’ and b.day=‘2017-09-22’;