题目要求:
对于两个输入的文件A和B,请编写MapReduce程序,对两个文件进行合并,并删除其中重复的内容,得到一个新的输出文件C
文件A
20150101 x
20150102 y
20150103 x
20150104 y
20150105 z
20150106 x
文件B
20150101 y
20150102 y
20150103 x
20150104 z
20150105 y
根据输入文件A和B合并得到的输出文件C
20150101 x
20150101 y
20150102 y
20150103 x
20150104 y
20150104 z
20150105 y
20150105 z
20150106 x
环境 :Linux下Hadoop系统
工具:对 HDFS 中的数据用 MapReduce 进行合并去重
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class CrossTest{
public static void main(String[] args)throws Exception{
Configuration conf = new Configuration();
String[] otherArgs = (new GenericOptionsParser(conf,args)).getRemainingArgs();
if(otherArgs.length<2){
System.err.println("Usage:CrossTest <in> [..<in>] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf,"cross");
job.setJarByClass(CrossTest.class);
job.setJar("CrossTest.jar");
job.setMapperClass(CrossTest.crossMapper.class);
job.setReducerClass(CrossTest.crossReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
for(int i = 0; i <otherArgs.length - 1;i++){
FileInputFormat.addInputPath(job,new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,new Path(otherArgs[otherArgs.length -1]));
System.exit(job.waitForCompletion(true)?0:1);
}
public static class crossMapper extends Mapper<Object,Text,Text,Text>{
private Text va = new Text("");
protected void map(Object key,Text value,Context context) throws IOException,InterruptedException{
//之间把取出来的值当做键,取一个空字符串作为value,利用MapReduce框架自带的去重和整合功能
context.write(value,va);
}
}
public static class crossReducer extends Reducer<Text,Text,Text,Text>{
protected void reduce(Text key,Iterable<Text> values,Context context)throws IOException,InterruptedException{
//此处自带去重功能
context.write(key,new Text(""));
}
}
}