Hadoop: character-set conversion for MapReduce output

If you have looked at how Hadoop writes text files, you know that TextOutputFormat hard-codes the output character set to UTF-8. The source (with inline comments) is shown below:

  
package org.apache.hadoop.mapreduce.lib.output;

import java.io.DataOutputStream;  
import java.io.IOException;  
import java.io.UnsupportedEncodingException;  
  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.fs.FSDataOutputStream;  
  
import org.apache.hadoop.io.NullWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.io.compress.CompressionCodec;  
import org.apache.hadoop.io.compress.GzipCodec;  
import org.apache.hadoop.mapreduce.OutputFormat;  
import org.apache.hadoop.mapreduce.RecordWriter;  
import org.apache.hadoop.mapreduce.TaskAttemptContext;  
import org.apache.hadoop.util.ReflectionUtils;  
  
/** An {@link OutputFormat} that writes plain text files. */  
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {  // TextOutputFormat is the default output format for text files
  protected static class LineRecordWriter<K, V>  // the default RecordWriter
    extends RecordWriter<K, V> {  
    private static final String utf8 = "UTF-8";  // the output character set is hard-coded to UTF-8 here
    private static final byte[] newline;  // the line terminator
    static {  
      try {  
        newline = "\n".getBytes(utf8);  
      } catch (UnsupportedEncodingException uee) {  
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");  
      }  
    }  
  
    protected DataOutputStream out;  
    private final byte[] keyValueSeparator;  // separator between key and value; the default is a tab

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {  // constructor: initializes the output stream and the separator
      this.out = out;  
      try {  
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);  
      } catch (UnsupportedEncodingException uee) {  
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");  
      }  
    }  
  
    public LineRecordWriter(DataOutputStream out) {  // uses the default separator, a tab
      this(out, "\t");  
    }  
  
    /** 
     * Write the object to the byte stream, handling Text as a special 
     * case. 
     * @param o the object to print 
     * @throws IOException if the write throws, we pass it on 
     */  
    private void writeObject(Object o) throws IOException {  // records are written one per line: key, separator, value, newline
      if (o instanceof Text) {  // if o is a Text instance
        Text to = (Text) o;  
        out.write(to.getBytes(), 0, to.getLength());  // write the Text's raw (UTF-8) bytes directly
      } else {  
        out.write(o.toString().getBytes(utf8));  
      }  
    }  
  
    public synchronized void write(K key, V value)  // synchronized: concurrent writes are mutually exclusive
      throws IOException {  
      // determine whether the key and the value are null (or NullWritable)
      boolean nullKey = key == null || key instanceof NullWritable;  
      boolean nullValue = value == null || value instanceof NullWritable;  
      if (nullKey && nullValue) {  
        return;  
      }  
      if (!nullKey) {  
        writeObject(key);  
      }  
      if (!(nullKey || nullValue)) {  
        out.write(keyValueSeparator);  
      }  
      if (!nullValue) {  
        writeObject(value);  
      }  
      out.write(newline);  
    }  
  
    public synchronized   
    void close(TaskAttemptContext context) throws IOException {  
      out.close();  
    }  
  }  
  
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job  // obtain the writer instance
                         ) throws IOException, InterruptedException {  
    Configuration conf = job.getConfiguration();  
    boolean isCompressed = getCompressOutput(job);  
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator",  
                                        "\t");  
    CompressionCodec codec = null;  // compression codec, used only when output compression is enabled
    String extension = "";  
    if (isCompressed) {  
      Class<? extends CompressionCodec> codecClass =   
        getOutputCompressorClass(job, GzipCodec.class);  
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);  
      extension = codec.getDefaultExtension();  
    }  
    Path file = getDefaultWorkFile(job, extension);  // gets the default work file path and name; implemented in FileOutputFormat
    FileSystem fs = file.getFileSystem(conf);  
    if (!isCompressed) {  
      FSDataOutputStream fileOut = fs.create(file, false);  
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);  
    } else {  
      FSDataOutputStream fileOut = fs.create(file, false);  
      return new LineRecordWriter<K, V>(new DataOutputStream  
                                        (codec.createOutputStream(fileOut)),  
                                        keyValueSeparator);  
    }  
  }  
}  
In production, however, the input and output character sets are not always UTF-8. The processed text may have to be written as GBK, Big5, or some other encoding so it can serve as the input to the next program; in banking in particular, log files are usually GBK. You could simply replace the UTF-8 in the source above with GBK, but there are many character sets, and a big-data platform product aimed at customers all over the world cannot reasonably hard-code one of them: that approach is not automated and serves only a narrow audience. It would fit the messy requirements of production much better if we could specify the character set in the MR job's own settings. (The short sketch below shows why the encoding matters at the byte level.)
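As a quick standalone illustration (not part of the Hadoop source; the class name and sample string are made up for this example), the same characters produce different byte sequences under UTF-8 and GBK, so a downstream consumer that expects GBK cannot read the UTF-8 bytes that the stock TextOutputFormat always emits:

import java.nio.charset.Charset;
import java.util.Arrays;

public class CharsetDemo {
    public static void main(String[] args) {
        String s = "中文";                                    // two CJK characters
        byte[] utf8 = s.getBytes(Charset.forName("UTF-8"));  // 6 bytes: what TextOutputFormat writes
        byte[] gbk  = s.getBytes(Charset.forName("GBK"));    // 4 bytes: what a GBK consumer expects
        System.out.println(Arrays.toString(utf8));           // [-28, -72, -83, -26, -106, -121]
        System.out.println(Arrays.toString(gbk));            // [-42, -48, -50, -60]
    }
}

Hence the source is modified as follows, replacing the hard-coded UTF-8 with a charset taken from the job configuration: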

package com.huateng.hadoop.mapred.transcoding.format;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class EncodingOutputFormat<K, V> extends FileOutputFormat<K, V>{
	  
	  public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
	  protected static class LineRecordWriter<K, V>
	    extends RecordWriter<K, V>  {
	    private String charset;  // the manually specified output character set; any JDK-supported charset can be used here
	    private   byte[] newline;
	   
	    protected DataOutputStream out;
	    private final byte[] keyValueSeparator;
	    // The charset passed in here is used below when each output record is written. How does it get here?
	    // It has to take effect while the MR job is running, so it is supplied through the constructor.
	    public LineRecordWriter(DataOutputStream out, String keyValueSeparator,String dsc_charset) {
	      this.out = out;
	      charset=dsc_charset; 
	      try {
	    	  	newline = "\n".getBytes(charset);
	    	  	this.keyValueSeparator = keyValueSeparator.getBytes(charset);
	      } catch (UnsupportedEncodingException uee) {
	    	  	throw new IllegalArgumentException("can't find " + charset + " encoding");
	      }
	    }
	    /**
	     * Write the object to the byte stream, handling Text as a special case.
	     * @param o the object to print
	     * @throws IOException if the write throws, we pass it on
	     */
	    private void writeObject(Object o) throws IOException {
	      // Unlike the stock TextOutputFormat, never write a Text's raw UTF-8 bytes directly;
	      // always go through toString() so every record is re-encoded in the target charset.
	      out.write(o.toString().getBytes(charset));
	    }

	    public synchronized void write(K key, V value)
	    		throws IOException {

	      boolean nullKey = key == null || key instanceof NullWritable;
	      boolean nullValue = value == null || value instanceof NullWritable;
	      if (nullKey && nullValue) {
	        return;
	      }
	      if (!nullKey) {
	        writeObject(key);
	      }
	      if (!(nullKey || nullValue)) {
	        out.write(keyValueSeparator);
	      }
	      if (!nullValue) {
	        writeObject(value);
	      }
	      out.write(newline);  // newline was already encoded with the target charset in the constructor
	    }

	    public synchronized 
	    		void close(TaskAttemptContext context) throws IOException {
	      out.close();
	    }
	  }

	  public RecordWriter<K, V> 
	         getRecordWriter(
	        		 TaskAttemptContext job
	                         ) throws IOException, InterruptedException {
	    Configuration conf = job.getConfiguration();
	    // The target charset comes in here: it is read from the MR job's configuration,
	    // where the driver sets "ark.dsccodec" when it builds the Job instance.
	    String dst_charset = job.getConfiguration().get("ark.dsccodec");
	    boolean isCompressed = getCompressOutput(job);
	    String keyValueSeparator = conf.get(SEPERATOR, "\t");
	    CompressionCodec codec = null;
	    String extension = "";
	    if (isCompressed) {
	      Class<? extends CompressionCodec> codecClass = 
	        getOutputCompressorClass(job, GzipCodec.class);
	      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
	      extension = codec.getDefaultExtension();
	    }
//	    setOutputName(job,"transform");
	    Path file = getDefaultWorkFile(job, extension);
	    FileSystem fs = file.getFileSystem(conf);
	    if (!isCompressed) {
	      FSDataOutputStream fileOut = fs.create(file, false);
	      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator,dst_charset);
	    } else {
	      FSDataOutputStream fileOut = fs.create(file, false);
	      return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
	                                				keyValueSeparator,dst_charset);
	    }
	  }
	}
To recap, the key changes are the following. LineRecordWriter gains a charset field and takes the target charset as an extra constructor argument:

	    private String charset;  // the manually specified output character set

	    public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) { ... }

That charset is written into every output record, and since it has to take effect while the MR job is running, it must be handed to the RecordWriter (LineRecordWriter extends RecordWriter) through this constructor. It reaches the constructor from getRecordWriter(), which reads it out of the job configuration:

	    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
	        throws IOException, InterruptedException {
	      Configuration conf = job.getConfiguration();
	      // the charset parameter comes in here, taken from the MR job's configuration
	      String dst_charset = job.getConfiguration().get("ark.dsccodec");
	      ...
	    }

The "ark.dsccodec" entry in the Configuration is in turn set when the Job instance is created, which completes the chain. I have tested this in my production environment (a Huawei FusionInsight cluster): every character set supported by the JDK can be converted. The driver's main setup looks like this:
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.google.common.base.Preconditions;
import com.huateng.hadoop.mapred.MapRedAdapter;
import com.huateng.hadoop.mapred.transcoding.format.EncodingOutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GB2312OutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GBKOutputFormat;
import com.huateng.hdfs.common.HDFSClient;
import com.huateng.util.common.StringUtils;

/*
 * @author canMao
 */
public class TranscodingJob 
{
	String other_code=null;
	private Job internalJob;
	public TranscodingJob(String in_path,String src_charset,
			String out_path,String dst_charset)throws Exception{
		Preconditions.checkArgument(!StringUtils.hasNullOrEmpty(new String[]{src_charset, dst_charset}),
				"source_encoding and destination_encoding must both be non-empty");
		Job job = MapRedAdapter.createJob();
		job.getConfiguration().set("ark.codec", src_charset);
		job.getConfiguration().set("ark.dsccodec", dst_charset);
		job.setJarByClass(TranscodingJob.class);
		job.setMapperClass(TranscodingMapper.class);
		job.setNumReduceTasks(0);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		if (dst_charset.equals("UTF-8")) {
			job.setOutputFormatClass(TextOutputFormat.class);
		}else{
			job.setOutputFormatClass(EncodingOutputFormat.class);
		}
		FileInputFormat.setInputPaths(job, new Path(in_path));
		if (HDFSClient.getFileSystem().exists(new Path(out_path))) {
			HDFSClient.getFileSystem().delete(new Path(out_path),true);
		}
		FileOutputFormat.setOutputPath(job, new Path(out_path));
		internalJob = job;
	}
	
	public boolean submit() throws ClassNotFoundException, IOException, InterruptedException
	{
		// block until the job finishes and report whether it succeeded
		return internalJob.waitForCompletion(true);
	}
	
	
}
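The TranscodingMapper referenced by the driver is not shown above. As a rough idea of what it might do (a hedged sketch, on the assumption that the mapper simply re-decodes each input line with the source charset stored under "ark.codec" and emits it unchanged; the class body below is mine, not the original), it could look like this:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch of a pass-through mapper: LineRecordReader stores each line's raw file bytes in
// the Text value, so we decode those bytes with the source charset ("ark.codec") and emit
// a Text whose toString() yields the right characters for EncodingOutputFormat to
// re-encode with the target charset ("ark.dsccodec").
public class TranscodingMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private String srcCharset;

    @Override
    protected void setup(Context context) {
        srcCharset = context.getConfiguration().get("ark.codec", "UTF-8");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = new String(value.getBytes(), 0, value.getLength(), srcCharset);
        context.write(new Text(line), NullWritable.get());
    }
}

With that in place, running a conversion boils down to a single call from wherever the job is launched, for example new TranscodingJob("/data/in", "GBK", "/data/out", "UTF-8").submit(); (the paths here are placeholders).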