MapReduce简单实例2——开发MR应用程序来统计清洗数据并去重。
对 HDFS 上文件中的数据按要求进行清洗并去重输出(数据文件为 mr_score,每行若干个以空白分隔的字段)。
具体要求如下:
1.清洗要求:字段间使用一个空格分隔;若某行字段总数不等于2,那么抛弃该行数据
2.去重:对清洗后的数据进行去重
1.1. 启动Hadoop后上传数据文件并查看
start-all.sh
hdfs dfs -put /home/hadoop01/test/mr_score /input
hdfs dfs -ls /input/mr_score
1.2. 导入所需要的包
package mr.demo;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
1.3. Mapper 类(注意:以下 Map、Reduce 两个静态内部类和 main 方法均定义在外层类 public class Dedup { ... } 之内)
public static class Map extends Mapper<LongWritable, Text, Text, NullWritable> {
// 实现map函数
public void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
//去除values首尾空格,字符串中的空格留一个
value.set(value.toString().trim().replaceAll("\\s{1,}"," "));
//解析文件,返回bool值
String line = value.toString();
boolean result = parseFiles(line);
// 3.判断字符串不合法退出,true就写入
if (!result){
return;
}
context.write(value, NullWritable.get());
}
//解析文件方法
private boolean parseFiles(String line) {
// 字符串切割
String[] fields = line.split(" ");
// 判断字符串长度等于2为合法
if (fields.length == 2) {
return true;
}else {
return false;
}
}
}
1.4. Reducer 类
// reduce函数将输入中的key复制到输出数据的key上,并直接输出
public static class Reduce extends Reducer<Text,NullWritable, Text, NullWritable> {
// 实现reduce函数
public void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
1.5. Driver 驱动类
/**
 * Configures and submits the clean-and-deduplicate job.
 *
 * Usage: hadoop jar mr_demo.jar mr.demo.Dedup [inputPath] [outputPath]
 * Paths default to the ones used in this tutorial when not supplied.
 * Exits with 0 on success, 1 on failure.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Build the job.
    Job job = Job.getInstance(conf, "My Data clean and Deduplication");
    job.setJarByClass(Dedup.class);
    // Mapper cleans/filters lines; Reduce dedupes and doubles as a combiner
    // (safe because it is idempotent: it just re-emits distinct keys).
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    // Output types for both map and reduce phases.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    // Allow paths on the command line; defaults match the upload step,
    // which puts the data at /input/mr_score (the original hard-coded
    // /input/mr_dedup, which was never created above).
    String input = args.length > 0 ? args[0] : "/input/mr_score";
    String output = args.length > 1 ? args[1] : "/output/mr_dedup";
    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
1.6. 导出jar包
1.7. 上传jar包
将导出的 mr_demo.jar 复制到集群节点的本地目录 ~/eclipse-workspace/jars/ 下(例如使用 scp;注意 hadoop jar 从本地文件系统读取 jar 包,无需上传到 HDFS)
1.8. 运行jar包
hadoop jar ~/eclipse-workspace/jars/mr_demo.jar mr.demo.Dedup
1.9 查看结果
hdfs dfs -cat /output/mr_dedup/*
课堂小作业,还请指教!