mapreduce程序reduce输出控制

1,在hadoop中,reduce支持多个输出,输出的文件名也是可控的:继承MultipleTextOutputFormat类,重写generateFileNameForKeyValue方法即可

public class LzoHandleLogMr extends Configured implements Tool {

	 static class LzoHandleLogMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
       
	  
		public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
	    	try {
	    	    String[] sp = value.toString().split(",");
	    		output.collect(new Text(sp[0]), value);
	    	}catch (Exception e) {
			   e.printStackTrace();
		    }    	
		}


	}
	static class LzoHandleLogReducer  extends MapReduceBase implements Reducer<Text, Text, Text, NullWritable> {
        


		@Override
		public void reduce(Text key, Iterator<Text> values,
				OutputCollector<Text, NullWritable> output, Reporter reporter)
				throws IOException {
			while (values.hasNext()) {
		   		  output.collect(values.next(), NullWritable.get());   
		   	   }
			
		}	
	}
	
	public static class LogNameMultipleTextOutputFormat extends MultipleTextOutputFormat<Text, NullWritable> 
	   {


		@Override
		protected String generateFileNameForKeyValue(Text key,
				NullWritable value, String name) {
			String sp[] = key.toString().split(",");
			String filename = sp[0];
			if(sp[0].contains(".")) filename="000000000000";
			return filename;
		}
		
	}
    


	@Override
	public int run(String[] args) throws Exception {
		 
		    JobConf jobconf = new JobConf(LzoHandleLogMr.class);
		    jobconf.setMapperClass(LzoHandleLogMapper.class);
		    jobconf.setReducerClass(LzoHandleLogReducer.class);
		    jobconf.setOutputFormat(LogNameMultipleTextOutputFormat.class);
		    jobconf.setOutputKeyClass(Text.class);
		    jobconf.setNumReduceTasks(12);
		    
		    
		 FileInputFormat.setInputPaths(jobconf,new Path(args[0]));
	    	FileOutputFormat.setOutputPath(jobconf,new Path(args[1]));
	    	FileOutputFormat.setCompressOutput(jobconf, true);
	    	FileOutputFormat.setOutputCompressorClass(jobconf, LzopCodec.class);  
	    	
	    	JobClient.runJob(jobconf);
	      return 0;
			
	}
}


在新版本的hadoop API中是通过Job类来设置各种参数的,但是我调用 Job.setOutputFormatClass()来使用MultipleTextOutputFormat的时候,竟然报错,原因是传入的类必须继承自org.apache.hadoop.mapreduce.OutputFormat(而MultipleTextOutputFormat属于旧的mapred包)。这是0.20.2比较致命的其中一个缺陷,升级到0.21能解决


2, 如果同一行数据,需要同时输出至多个文件的话,我们可以使用MultipleOutputs类:

  1. publicclassMultiFileextendsConfiguredimplementsTool{
  2. publicstaticclassMapClassextendsMapReduceBase
  3. implementsMapper<LongWritable,Text,NullWritable,Text>{
  4. privateMultipleOutputsmos;
  5. privateOutputCollector<NullWritable,Text>collector;
  6. publicvoidconfigure(JobConfconf){
  7. mos=newMultipleOutputs(conf);
  8. }
  9. publicvoidmap(LongWritablekey,Textvalue,
  10. OutputCollector<NullWritable,Text>output,
  11. Reporterreporter)throwsIOException{
  12. String[]arr=value.toString().split(",",-1);
  13. Stringchrono=arr[0]+","+arr[1]+","+arr[2];
  14. Stringgeo=arr[0]+","+arr[4]+","+arr[5];
  15. collector=mos.getCollector("chrono",reporter);
  16. collector.collect(NullWritable.get(),newText(chrono));
  17. collector=mos.getCollector("geo",reporter);
  18. collector.collect(NullWritable.get(),newText(geo));
  19. }
  20. publicvoidclose()throwsIOException{
  21. mos.close();
  22. }
  23. }
  24. publicintrun(String[]args)throwsException{
  25. Configurationconf=getConf();
  26. JobConfjob=newJobConf(conf,MultiFile.class);
  27. Pathin=newPath(args[0]);
  28. Pathout=newPath(args[1]);
  29. FileInputFormat.setInputPaths(job,in);
  30. FileOutputFormat.setOutputPath(job,out);
  31. job.setJobName("MultiFile");
  32. job.setMapperClass(MapClass.class);
  33. job.setInputFormat(TextInputFormat.class);
  34. job.setOutputKeyClass(NullWritable.class);
  35. job.setOutputValueClass(Text.class);
  36. job.setNumReduceTasks(0);
  37. MultipleOutputs.addNamedOutput(job,
  38. "chrono",
  39. TextOutputFormat.class,
  40. NullWritable.class,
  41. Text.class);
  42. MultipleOutputs.addNamedOutput(job,
  43. "geo",
  44. TextOutputFormat.class,
  45. NullWritable.class,
  46. Text.class);
  47. JobClient.runJob(job);
  48. return0;
  49. }
  50. }

这个类维护了一个<name, OutputCollector>的map。我们先在job配置里用addNamedOutput注册输出名,然后在map(或reduce)方法中,通过getCollector取得对应的collector并调用collector.collect即可。


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值