hadoop的reducer输出多个文件

hadoop的reducer输出多个文件

关键字: hadoop, mapreduce

有时候我们需要这样的功能: reducer能根据key(或value)值来输出多个文件,同一key(或value)处于同一个文件中。在hadoop 0.17.x版本中,重写MultipleOutputFormat的generateFileNameForKeyValue方法就可以实现此功能。

比如:
Java代码 复制代码
  1. package org.apache.hadoop.mapred.lib;   
  2.   
  3. import java.io.IOException;   
  4.   
  5. import org.apache.hadoop.fs.FileSystem;   
  6. import org.apache.hadoop.io.Writable;   
  7. import org.apache.hadoop.io.WritableComparable;   
  8. import org.apache.hadoop.mapred.JobConf;   
  9. import org.apache.hadoop.mapred.RecordWriter;   
  10. import org.apache.hadoop.mapred.TextOutputFormat;   
  11. import org.apache.hadoop.util.Progressable;   
  12.   
  13. public class MultipleTextOutputFormat<K extends WritableComparable, V extends Writable>   
  14.     extends MultipleOutputFormat<K, V> {   
  15.   
  16.   private TextOutputFormat<K, V> theTextOutputFormat = null;   
  17.   
  18.   @Override  
  19.   protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,   
  20.       String name, Progressable arg3) throws IOException {   
  21.     if (theTextOutputFormat == null) {   
  22.       theTextOutputFormat = new TextOutputFormat<K, V>();   
  23.     }   
  24.     return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);   
  25.   }   
  26.   
  27.     @Override  
  28.     protected String generateFileNameForKeyValue(K key, V value, String name) {   
  29.         return name + "_" + value.toString();   
  30.     }   
  31.      
  32.      
  33. }  
package org.apache.hadoop.mapred.lib;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Progressable;

/**
 * An output format that routes each record to a per-value output file.
 *
 * <p>The target file name is {@code <baseName>_<value>}, so all records
 * sharing the same value land in the same file. Actual record writing is
 * delegated to a plain {@link TextOutputFormat}.
 */
public class MultipleTextOutputFormat<K extends WritableComparable, V extends Writable>
    extends MultipleOutputFormat<K, V> {

  // Delegate that performs the real text writing; created lazily on first use.
  private TextOutputFormat<K, V> delegate = null;

  /**
   * Returns the underlying {@link RecordWriter}, instantiating the
   * {@link TextOutputFormat} delegate on the first call and reusing it
   * afterwards.
   */
  @Override
  protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
      String name, Progressable progress) throws IOException {
    if (delegate == null) {
      delegate = new TextOutputFormat<K, V>();
    }
    return delegate.getRecordWriter(fs, job, name, progress);
  }

  /**
   * Derives the output file name from the record's value:
   * {@code name + "_" + value}. Records with equal values therefore share
   * one output file.
   */
  @Override
  protected String generateFileNameForKeyValue(K key, V value, String name) {
    return name + "_" + value.toString();
  }
}


试一下wordcount这个例子,把WordCount.java的run函数加上一行
conf.setOutputFormat(org.apache.hadoop.mapred.lib.MultipleTextOutputFormat.class);

Java代码 复制代码
  1. public int run(String[] args) throws Exception {   
  2.     JobConf conf = new JobConf(getConf(), WordCount.class);   
  3.     conf.setJobName("wordcount");   
  4.     
  5.     // the keys are words (strings)   
  6.     conf.setOutputKeyClass(Text.class);   
  7.     // the values are counts (ints)   
  8.     conf.setOutputValueClass(IntWritable.class);   
  9.        
  10.     conf.setMapperClass(MapClass.class);           
  11.     conf.setCombinerClass(Reduce.class);   
  12.     conf.setReducerClass(Reduce.class);   
  13.        
  14.     conf.setOutputFormat(org.apache.hadoop.mapred.lib.MultipleTextOutputFormat.class);   
  15.        
  16.     List<String> other_args = new ArrayList<String>();   
  17.     for(int i=0; i < args.length; ++i) {   
  18.       try {   
  19.         if ("-m".equals(args[i])) {   
  20.           conf.setNumMapTasks(Integer.parseInt(args[++i]));   
  21.         } else if ("-r".equals(args[i])) {   
  22.           conf.setNumReduceTasks(Integer.parseInt(args[++i]));   
  23.         } else {   
  24.           other_args.add(args[i]);   
  25.         }   
  26.       } catch (NumberFormatException except) {   
  27.         System.out.println("ERROR: Integer expected instead of " + args[i]);   
  28.         return printUsage();   
  29.       } catch (ArrayIndexOutOfBoundsException except) {   
  30.         System.out.println("ERROR: Required parameter missing from " +   
  31.                            args[i-1]);   
  32.         return printUsage();   
  33.       }   
  34.     }   
  35.     // Make sure there are exactly 2 parameters left.   
  36.     if (other_args.size() != 2) {   
  37.       System.out.println("ERROR: Wrong number of parameters: " +   
  38.                          other_args.size() + " instead of 2.");   
  39.       return printUsage();   
  40.     }   
  41.     FileInputFormat.setInputPaths(conf, other_args.get(0));   
  42.     FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));   
  43.            
  44.     JobClient.runJob(conf);   
  45.     return 0;   
  46.   }  
/**
 * Configures and submits the word-count job.
 *
 * <p>Recognized flags: {@code -m <n>} sets the number of map tasks,
 * {@code -r <n>} the number of reduce tasks. Exactly two positional
 * arguments must remain: the input path and the output path.
 *
 * @param args command-line arguments (flags plus input/output paths)
 * @return 0 on success, or the value of {@code printUsage()} on bad arguments
 * @throws Exception if job submission fails
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");
 
    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);
    
    conf.setMapperClass(MapClass.class);        
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    
    // Route reducer output through MultipleTextOutputFormat so that records
    // with the same value are grouped into the same "part-XXXXX_<value>" file.
    conf.setOutputFormat(org.apache.hadoop.mapred.lib.MultipleTextOutputFormat.class);
    
    List<String> other_args = new ArrayList<String>();
    for(int i=0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          // args[++i] may throw ArrayIndexOutOfBoundsException when the flag's
          // value is missing; that case is handled by the catch below.
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          // Anything that is not a recognized flag is a positional argument.
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        // Flag value was present but not an integer.
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        // A -m/-r flag appeared as the last argument with no value after it.
        System.out.println("ERROR: Required parameter missing from " +
                           args[i-1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
                         other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
        
    // Submit the job and block until it completes.
    JobClient.runJob(conf);
    return 0;
  }


则使用
bin/hadoop jar build/hadoop-*-examples.jar wordcount conf  wordcount_output
可输出一个目录wordcount_output
Java代码 复制代码
  1. $ls wordcount_output/   
  2. part-00000_1    part-00000_13   part-00000_16  part-00000_214  part-00000_28  part-00000_38  part-00000_5   part-00000_8   
  3. part-00000_10   part-00000_14   part-00000_17  part-00000_22   part-00000_29  part-00000_4   part-00000_6   part-00000_9   
  4. part-00000_102  part-00000_141  part-00000_19  part-00000_23   part-00000_3   part-00000_42  part-00000_62   
  5. part-00000_11   part-00000_143  part-00000_2   part-00000_24   part-00000_31  part-00000_44  part-00000_63   
  6. part-00000_117  part-00000_15   part-00000_20  part-00000_25   part-00000_35  part-00000_46  part-00000_7   
  7. part-00000_12   part-00000_152  part-00000_21  part-00000_26   part-00000_36  part-00000_47  part-00000_70  
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值