Implementing custom multi-file output in MapReduce

    An ordinary MapReduce job has a map phase and a reduce phase. With no extra configuration, the results are written out as multiple part-000* files, one per reducer, and the format of their content cannot be customized. This makes downstream processing of the results awkward.

    In Hadoop, a reduce task can write to multiple outputs with controllable file names: extend the MultipleTextOutputFormat class and override its generateFileNameForKeyValue method. If all you need is control over the output file names, writing your own LogNameMultipleTextOutputFormat and calling jobconf.setOutputFormat(LogNameMultipleTextOutputFormat.class); is enough, but this approach only works with the old Hadoop API. If you want to use the new API, customize the format of the output content, or meet other requirements, you have to rewrite some of the Hadoop classes yourself.
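
For reference, here is a minimal sketch of that old-API route. The class name LogNameMultipleTextOutputFormat comes from the text above, while the Text types, the first-field naming rule, and the driver class name are only illustrative assumptions:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

public class LogNameMultipleTextOutputFormat extends MultipleTextOutputFormat<Text, Text> {

    @Override
    protected String generateFileNameForKeyValue(Text key, Text value, String name) {
        // Route each record to a file named after the first comma-separated field of its key;
        // "name" is the default leaf name (e.g. part-00000) and is ignored here.
        return key.toString().split(",")[0];
    }
}

// In the old-API driver (MyOldApiJob is a placeholder):
// JobConf jobconf = new JobConf(MyOldApiJob.class);
// jobconf.setOutputFormat(LogNameMultipleTextOutputFormat.class);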

    First, build your own MultipleOutputFormat class extending FileOutputFormat (note: the FileOutputFormat from the org.apache.hadoop.mapreduce.lib.output package).

 


import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;


/**
 * This abstract class extends FileOutputFormat, allowing the output data to be
 * written to different output files, with the file name chosen per record.
 * Created on 2012-07-08
 * @author zhoulongliu
 * @param <K>
 * @param <V>
 */
public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable> extends
        FileOutputFormat<K, V> {


    // Subclasses must implement generateFileNameForKeyValue to supply the output file name
    private MultiRecordWriter writer = null;


    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }


    /**
     * get task output path
     * @param conf
     * @return
     * @throws IOException
     */
    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }


    /**
     * Determine the output file name (including extension) from the key, value
     * and configuration. Subclasses decide how (or whether) the file name
     * depends on the key and value.
     * 
     * @param key the key of the output data
     * @param value the value of the output data
     * @param conf the configuration object
     * @return generated file name
     */
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);


   /**
    * Inner RecordWriter implementation that caches one underlying writer per
    * generated output file name and fans records out accordingly.
    * @author zhoulongliu
    *
    */
    public class MultiRecordWriter extends RecordWriter<K, V> {
        /** Cache of RecordWriters, keyed by output file name */
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        /** Output directory */
        private Path workPath = null;


        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }


        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }


        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            // Determine the output file name for this record
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            // If no writer exists for this file name yet, create one; otherwise reuse it.
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }


        // ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension}
        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName) throws IOException,
                InterruptedException {
            Configuration conf = job.getConfiguration();
            // Check whether output compression is enabled
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                // Use the custom LineRecordWriter (defined below) on the compressed stream
                recordWriter = new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                        keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                // Use the custom LineRecordWriter (defined below) on the uncompressed stream
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }


}

  Next, you also need to define your own LineRecordWriter, an implementation of the RecordWriter class, to control the output format.

 

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * 
 * A reimplemented line-oriented RecordWriter that writes key/value pairs in a custom text format.
 * Created on 2012-07-08
 * @author zhoulongliu
 * @param <K>
 * @param <V>
 */
public class LineRecordWriter<K, V> extends RecordWriter<K, V> {

    private static final String utf8 = "UTF-8"; // character encoding used for all output
    private static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(utf8); // record (line) separator
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }
    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    // Constructor: takes the output stream and the key/value separator
    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }
   
    /**
     * Write the MapReduce key and value to the output stream in the custom format.
     */
    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }

}

  Next, implement the abstract generateFileNameForKeyValue method declared in MultipleOutputFormat to return the output file name you want. Here I split the key on commas and use the second field (sp[1]) as the output file name, falling back to "000000000000" when that field is not numeric, so every record whose key shares that field ends up in the same file, named after the field's value.

 

public static class VVLogNameMultipleTextOutputFormat extends MultipleOutputFormat<Text, NullWritable> {

    @Override
    protected String generateFileNameForKeyValue(Text key, NullWritable value, Configuration conf) {
        String sp[] = key.toString().split(",");
        // Use the second comma-separated field of the key as the output file name.
        String filename = sp[1];
        try {
            Long.parseLong(sp[1]);
        } catch (NumberFormatException e) {
            // Fall back to a fixed name if the field is not numeric.
            filename = "000000000000";
        }
        return filename;
    }

}
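
The file-name routing above only works if the reducer emits keys in that comma-separated shape. The EtlReducer used in the driver below is not shown in this post, so the reducer that follows is purely a hypothetical sketch of the expected contract (Text keys, NullWritable values, and a comma-separated key whose second field drives the file name):

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer sketch; not the actual EtlReducer referenced in the driver below.
public class ExampleEtlReducer extends Reducer<Text, Text, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            // Each emitted key is a full comma-separated record, e.g. "ts,videoId,...";
            // all records that share the same second field land in one output file,
            // which VVLogNameMultipleTextOutputFormat names after that field.
            context.write(new Text(value.toString()), NullWritable.get());
        }
    }
}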

   Finally, set everything up when configuring the job:

        Configuration conf = getConf();
        Job job = new Job(conf);
        job.setNumReduceTasks(12);
        ......
        job.setMapperClass(VVEtlMapper.class);
        job.setReducerClass(EtlReducer.class);
        job.setOutputFormatClass(VVLogNameMultipleTextOutputFormat.class); // use the custom multi-file output format
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileOutputFormat.setCompressOutput(job, true); // compress the job output
        FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class); // compress with LZO (LzopCodec)
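
Note that LzopCodec is not bundled with stock Apache Hadoop; it typically comes from the separately installed hadoop-lzo (GPL compression) library, so the native LZO libraries must be present on the cluster nodes. GzipCodec is a reasonable fallback when LZO is not available.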

   OK, with that you have a complete custom multi-file output for MapReduce, written against the new Hadoop API.

