数据去重示例
源数据(日期 字母):
2012-3-1 a
2012-3-2 b
2012-3-3 c
2012-3-4 d
2012-3-5 a
2012-3-6 b
2012-3-7 c
2012-3-3 c
2012-3-1 b
2012-3-2 a
2012-3-3 b
2012-3-4 d
2012-3-5 a
2012-3-6 c
2012-3-7 d
2012-3-3 c
最终结果:
2012-3-1 a
2012-3-1 b
2012-3-2 a
2012-3-2 b
2012-3-3 b
2012-3-3 c
2012-3-4 d
2012-3-5 a
2012-3-6 b
2012-3-6 c
2012-3-7 c
2012-3-7 d
思路:MapReduce 在 shuffle 阶段会按 key 自动分组,相同的 key 只会进入同一次 reduce 调用;因此把整行作为 key、在 reduce 阶段对每个 key 只输出一次,即可达到去重的目的。
package practice1;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Data-deduplication MapReduce job.
 *
 * <p>Each input line (e.g. {@code "2012-3-1 a"}) is emitted by the mapper as a
 * {@code Text} key with a {@code NullWritable} value. The shuffle phase groups
 * identical keys, so the reducer writes each distinct record exactly once,
 * which removes duplicates.
 *
 * @author potter
 */
public class Practice3 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs:potter2:9000"); // use the cluster file system
        // System.setProperty("HADOOP_USER_NAME", "potter"); // run as the cluster user
        FileSystem fs = FileSystem.get(conf); // defaults to the local file system

        // FIX: pass `conf` to getInstance(); the original called Job.getInstance()
        // with no arguments, so none of the settings on `conf` (e.g. fs.defaultFS)
        // would ever reach the job.
        Job job = Job.getInstance(conf);
        job.setJarByClass(Practice3.class);
        job.setMapperClass(Practice3Mapper.class);
        job.setReducerClass(Practice3Reducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // String inputpath = args[0]; // first command-line argument
        // String outpath = args[1];   // second command-line argument
        Path input = new Path("D:\\practice\\input3\\work3.txt");
        Path output = new Path("D:\\practice\\input3\\output1");
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        // Remove a stale output directory first; the job fails if it exists.
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        boolean isdone = job.waitForCompletion(true);
        System.exit(isdone ? 0 : 1);
    }

    /**
     * Maps each input line to a {@code "date\tletter"} key with a
     * {@code NullWritable} value so that duplicate lines collapse into a
     * single key during the shuffle.
     */
    public static class Practice3Mapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        // Reused across map() calls to avoid per-record allocation.
        private final Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // FIX: split on any run of whitespace (the original split(" ") broke
            // on tabs or double spaces), and skip malformed/blank lines instead
            // of throwing ArrayIndexOutOfBoundsException on split[1].
            String[] split = value.toString().trim().split("\\s+");
            if (split.length < 2) {
                return;
            }
            String kk = split[0] + "\t" + split[1];
            text.set(kk);
            context.write(text, NullWritable.get());
        }
    }

    /**
     * Writes each distinct key once, discarding the (null) values; the
     * grouping done by the framework performs the actual deduplication.
     */
    public static class Practice3Reducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
}