Hadoop_数据清洗
示例(去除空行、开头为空格的数据):
- 原始数据:D:\data\testdata.txt
zhangsan 500 450 jan
zhangsan 550 450 feb
 lisi 210 150 jan
 lisi 200 150 feb
zhangsan 400 150 march
zhangsan 600 500 april
 lisi 190 150 april
 800 100 jan
BLU 2000 200 feb
lisi 110 10 may
- DataCleanMapper
package com.blu.dataclean;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class DataCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /**
     * Passes a data line through unchanged when it is valid; drops empty lines
     * and lines that begin with whitespace (i.e. whose first field is blank).
     *
     * <p>Bug fixed: the original implementation split the line on {@code " "}
     * and read {@code vals[0]}, but a line consisting solely of spaces splits
     * into an EMPTY array ({@code "   ".split(" ")} yields {@code []}), so
     * indexing element 0 threw {@link ArrayIndexOutOfBoundsException} and
     * failed the task. Checking the first character directly is both safe and
     * avoids the commons-lang3 dependency.
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   the raw input line
     * @param context sink for the cleaned line; emitted as the key with a
     *                {@link NullWritable} value (map-only job)
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Empty line, or first character is whitespace -> the record has no
        // first (name) field, so it is not data we want; skip it.
        if (line.isEmpty() || Character.isWhitespace(line.charAt(0))) {
            return;
        }
        context.write(value, NullWritable.get());
    }
}
- DataCleanJob
package com.blu.dataclean;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DataCleanJob {

    /**
     * Configures and runs the map-only data-cleaning job.
     *
     * <p>Exit code 0 on success, 1 on job failure, 2 on bad usage.
     *
     * @param args args[0] = input path, args[1] = output path (must not exist)
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of letting args[0]/args[1]
        // throw a raw ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: DataCleanJob <input path> <output path>");
            System.exit(2);
        }
        Job job = Job.getInstance();
        job.setJarByClass(DataCleanJob.class);
        job.setMapperClass(DataCleanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-only job: with zero REDUCE tasks the mapper's output is written
        // directly as the final result, preserving the cleaned lines as-is.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}
- 运行参数:
D:\data\testdata.txt D:\data\output
- 运行结果:
zhangsan 500 450 jan
zhangsan 550 450 feb
zhangsan 400 150 march
zhangsan 600 500 april
BLU 2000 200 feb
lisi 110 10 may