MapReduce数据清洗案例

原始建标语句:

create table user_log(
userId string,
country string,
device_model string,
city string,
app_token string,
pid string,
language string,
commit_time string,
device_id_type string,
resolution string,
mac string,
app_ver_code string,
release_channel string,
cid_sn string,
app_id string,
os_ver string,
device_id string,
net_type string,
promotion_channel string,
app_device_id string,
time_zone string,
manufacture string,
carrier string,
build_num string,
imei string,
os_name string ,
mobile_data_type string,
sdk_ver string,
android_id string,
commit_id string,
app_ver_name string,
account string)
partitioned by(dt string,os string)
row format delimited
fields terminated by ‘,’;

导入数据

load data inpath’/android1/log2’ into table user_log partition (dt=‘20190414’,os=‘android’);
load data inpath’/ios1/log3’ into table user_log partition (dt=‘20190414’,os=‘ios’);

创建日活跃用户表

create table ods_app_active1_log(
userId string,
country string,
device_model string,
city string,
app_token string,
pid string,
language string,
commit_time string,
device_id_type string,
resolution string,
mac string,
app_ver_code string,
release_channel string,
cid_sn string,
app_id string,
os_ver string,
device_id string,
net_type string,
promotion_channel string,
app_device_id string,
time_zone string,
manufacture string,
carrier string,
build_num string,
imei string,
os_name string ,
mobile_data_type string,
sdk_ver string,
android_id string,
commit_id string,
app_ver_name string,
account string)
partitioned by(dt string)
row format delimited
fields terminated by ‘,’;

给日活用户添加数据

insert into table ods_app_active_log partition(dt = ‘20190414’) select
userId,
country,
device_model,
city,
app_token,
pid,
language,
commit_time,
device_id_type,
resolution,
mac,
app_ver_code,
release_channel,
cid_sn,
app_id,
os_ver,
device_id,
net_type,
promotion_channel,
app_device_id,
time_zone,
manufacture,
carrier,
build_num,
imei,
os_name,
mobile_data_type,
sdk_ver,
android_id ,
commit_id ,
app_ver_name ,
account
from (select * ,row_number() over (partition by userId order by commit_time desc) as rank from user_log where dt = ‘20190414’)tmp where rank=1;

思路: a、应该建立一个历史用户表(只存user_id)

   b、将当日的活跃用户去 比对  历史用户表, 就知道哪些人是今天新出现的用户 --> 当日新增用户
   
   c、将当日新增用户追加到历史用户表

创建新增用户表

create table userid_new_add (userId string) partitioned by (dt string);

创建一个存放所有用户ID的表

create table userid_history (userId string) partitioned by (dt string);

从第一天的数据里取出userId放进存放所有用户ID表里
insert into table userid_history partition(dt=‘20190413’) select distinct(userid) from ods_app_active_log where dt=‘20190413’;

查出增加用户的id

insert into userid_new_add partition (dt = ‘20190414’)

select a.userid from ods_app_active_log a left join userid_history b on a.userid = b.userid where a.dt = ‘20190414’ and b.userid is null;

– 1 日新维度统计报表–数据建模
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

– 2 日新维度统计报表sql开发(利用多重插入语法)
from ods_app_active_log
insert into table dim_user_new_day partition(dt=‘2019-04-14’,dim=‘0000’)
select ‘all’,‘all’,‘all’,‘all’,count(1)
where dt=‘2019-04-14’

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0001’)
select ‘all’,‘all’,‘all’,app_ver_name,count(1)
where day=‘2017-09-21’
group by app_ver_name

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0010’)
select ‘all’,‘all’,release_channel,‘all’,count(1)
where day=‘2017-09-21’
group by release_channel

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0011’)
select ‘all’,‘all’,release_channel,app_ver_name,count(1)
where day=‘2017-09-21’
group by release_channel,app_ver_name

insert into table dim_user_new_day partition(day=‘2017-09-21’,dim=‘0100’)
select ‘all’,city,‘all’,‘all’,count(1)
where day=‘2017-09-21’
group by city;

次日留存用户:第一天的活跃用户与第二天的活跃用户相比多出来的用户叫新增用户,新增用户与第三天的活跃用户相比,用 join ,还存在的叫次日留存用户。

创建表:
create table userid_liucun(userId string) partitioned by (dt string);

统计实现

insert into table userid_liucun partition (dt=‘20190416’)
select b.userid from userid_new_add a join ods_app_active_log b on a.userid = b.userid where b.dt = ‘20190416’ and a.dt = ‘20190414’;

/* 用左半连接效率略高************************************** */
insert into table etl_user_keepalive_nextday partition(day=‘2017-09-22’)
select
sdk_ver
,time_zone
,commit_id
,commit_time
,pid
,app_token
,app_id
,device_id
,device_id_type
,release_channel
,app_ver_name
,app_ver_code
,os_name
,os_ver
,language
,country
,manufacture
,device_model
,resolution
,net_type
,account
,app_device_id
,mac
,android_id
,imei
,cid_sn
,build_num
,mobile_data_type
,promotion_channel
,carrier
,city
,user_id
from etl_user_new_day a left semi join etl_user_active_day b
on a.user_id = b.user_id and a.day=‘2017-09-21’ and b.day=‘2017-09-22’;

where a.day=‘2017-09-21’ and b.day=‘2017-09-22’; // 注意:left semi join中,右表的引用不能出现在where条件中

以下是一个基于Java的MapReduce数据清洗案例: 假设有一个数据集,包含多行记录,每行记录由三个字段组成:name, age和gender。其中,name字段可能会包含特殊字符或空格,age字段可能会包含非数字字符或负数,而gender字段可能会包含空格或大小写不一致的字母。 我们想要清洗这个数据集,将name字段中的特殊字符和空格去除,将age字段中的非数字字符和负数去除,将gender字段中的空格去除并将字母转换为小写。 下面是实现这个数据清洗过程的Java代码: Mapper类: public class DataCleanMapper extends Mapper<LongWritable, Text, Text, Text>{ private Text outputKey = new Text(); private Text outputValue = new Text(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] fields = line.split(","); String name = fields[0].replaceAll("[^a-zA-Z0-9]", "").trim(); String ageStr = fields[1].replaceAll("[^0-9]", "").trim(); int age = Integer.parseInt(ageStr); if(age <= 0){ return; } String gender = fields[2].replaceAll("\\s+", "").toLowerCase(); outputKey.set(name); outputValue.set(age + "\t" + gender); context.write(outputKey, outputValue); } } Reducer类: public class DataCleanReducer extends Reducer<Text, Text, Text, Text>{ private Text outputValue = new Text(); @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for(Text value : values){ outputValue.set(value); context.write(key, outputValue); break; } } } Driver类: public class DataCleanDriver { public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "data clean"); job.setJarByClass(DataCleanDriver.class); job.setMapperClass(DataCleanMapper.class); job.setReducerClass(DataCleanReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } } 在这个案例中,Mapper类用于数据清洗,它通过正则表达式去除name、age和gender字段中的非法字符,并将age字段转换为整数类型,并过滤掉非正数值。Reducer类用于去除重复数据。Driver类用于设置MapReduce作业的各种参数,包括输入路径、输出路径、Mapper类、Reducer类等。最终,执行Driver类的main方法即可启动MapReduce作业,完成数据清洗
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值