If you have ever looked at how Hadoop encodes its text output, you know that TextOutputFormat hardcodes the output encoding to UTF-8. The source is as follows:
package org.apache.hadoop.mapreduce.lib.output;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
/** An {@link OutputFormat} that writes plain text files. */
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> { // TextOutputFormat is the default output file format
  protected static class LineRecordWriter<K, V> // the default RecordWriter
      extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8"; // the output charset is hardcoded here
    private static final byte[] newline; // the line terminator
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }
    protected DataOutputStream out;
    private final byte[] keyValueSeparator; // separator between key and value; defaults to a tab
    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) { // initializes the output stream and the separator
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }
    public LineRecordWriter(DataOutputStream out) { // uses the default separator
      this(out, "\t");
    }
    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException { // each record is written as: key keyValueSeparator value \n
      if (o instanceof Text) { // Text already holds UTF-8 bytes, so they are written out directly
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }
    public synchronized void write(K key, V value) // synchronized: concurrent writes are mutually exclusive
        throws IOException {
      // determine whether the key and the value are empty
      boolean nullKey = key == null || key instanceof NullWritable; // an elegant check
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }
    public synchronized
    void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job // builds the writer instance
                                            ) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator",
                                        "\t");
    CompressionCodec codec = null; // the compression codec, if output compression is enabled
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension); // the default work file path and name, implemented in FileOutputFormat
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(new DataOutputStream
                                        (codec.createOutputStream(fileOut)),
                                        keyValueSeparator);
    }
  }
}
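To see why the hardcoding bites, note that Text stores its contents as UTF-8 bytes internally, so the writeObject above always emits UTF-8 regardless of what the downstream consumer expects. A minimal sketch (the class name CharsetDemo is mine; it needs only hadoop-common on the classpath):

import java.nio.charset.Charset;
import org.apache.hadoop.io.Text;

// Hypothetical demo: Text.getBytes() is always UTF-8, while re-encoding
// via toString() honors the requested charset.
public class CharsetDemo {
  public static void main(String[] args) {
    Text t = new Text("中文");
    // The backing array holds UTF-8; only the first getLength() bytes are valid.
    System.out.println("UTF-8 length: " + t.getLength()); // 6
    byte[] gbk = t.toString().getBytes(Charset.forName("GBK"));
    System.out.println("GBK length:   " + gbk.length);    // 4
  }
}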
In production, though, the input and output encodings are not always UTF-8. Processed text often has to be written out as GBK, Big5, or similar so it can feed the next program; in banking in particular, logs are generally GBK. Replacing UTF-8 in the source above with GBK would do the job, but there are a great many character encodings, and a big-data platform product faces customers all over the world: hardcoding one output charset is anything but automated and serves a very narrow audience. If we could instead specify the charset in the MR job's settings, that would fit the messy requirements of production perfectly. Hence the modified source:
package com.huateng.hadoop.mapred.transcoding.format;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
public class EncodingOutputFormat<K, V> extends FileOutputFormat<K, V> {
  public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
  protected static class LineRecordWriter<K, V>
      extends RecordWriter<K, V> {
    private String charset; // the manually specified parameter: any output charset can be chosen here
    private byte[] newline;
    protected DataOutputStream out;
    private final byte[] keyValueSeparator;
    // The charset is used below when writing each record. How does it get in here?
    // It has to take effect while the MR job runs, so we accept it in the constructor.
    public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) {
      this.out = out;
      charset = dsc_charset;
      try {
        newline = "\n".getBytes(charset);
        this.keyValueSeparator = keyValueSeparator.getBytes(charset);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + charset + " encoding");
      }
    }
    /**
     * Write the object to the byte stream. Unlike the stock TextOutputFormat,
     * Text gets no special treatment here: its raw bytes are UTF-8, so every
     * value must be re-encoded into the target charset via toString().
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      out.write(o.toString().getBytes(charset));
    }
    public synchronized void write(K key, V value)
        throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline); // the pre-encoded terminator, not "\n".getBytes() in the platform charset
    }
    public synchronized
    void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }
  public RecordWriter<K, V>
      getRecordWriter(TaskAttemptContext job
                      ) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    // This is where the parameter comes in: read the target charset from the MR job's configuration.
    String dst_charset = conf.get("ark.dsccodec");
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    // setOutputName(job, "transform");
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, dst_charset);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                                        keyValueSeparator, dst_charset);
    }
  }
}
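Before wiring this into a job, the writer can be exercised locally against an in-memory stream. A minimal sketch, assuming a scratch class dropped into the same package so the protected nested LineRecordWriter is visible (EncodingWriterCheck is mine, not part of the patch):

package com.huateng.hadoop.mapred.transcoding.format;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.util.Arrays;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

// Hypothetical sanity check: drives the modified LineRecordWriter against an
// in-memory buffer, no cluster needed, and verifies the bytes really are GBK.
public class EncodingWriterCheck {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    EncodingOutputFormat.LineRecordWriter<Text, NullWritable> writer =
        new EncodingOutputFormat.LineRecordWriter<Text, NullWritable>(
            new DataOutputStream(buf), "\t", "GBK");
    // NullWritable value: only the key and the newline are written.
    writer.write(new Text("交易成功"), NullWritable.get());
    byte[] expected = "交易成功\n".getBytes("GBK");
    System.out.println(Arrays.equals(buf.toByteArray(), expected)); // true
  }
}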
To recap the key changes: the nested writer gains a charset field, populated through the constructor, since the parameter has to take effect while the MR job runs:

private String charset; // the manually specified parameter: any output charset can be chosen here
public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) {}

protected static class LineRecordWriter<K, V>
    extends RecordWriter<K, V>

The charset itself is obtained in getRecordWriter(), the factory method EncodingOutputFormat overrides from its FileOutputFormat parent, and handed to the writer:

public RecordWriter<K, V>
    getRecordWriter(TaskAttemptContext job
                    ) throws IOException, InterruptedException {
  Configuration conf = job.getConfiguration();
  String dst_charset = conf.get("ark.dsccodec"); // the parameter comes in from the MR job's configuration
}
The "ark.dsccodec" entry read by job.getConfiguration().get("ark.dsccodec") is set on the Configuration when the Job instance is created, and with that the change is complete. I have tested this in my production environment (a Huawei FI cluster): every charset the JDK supports converts correctly. The main method sets the parameters as follows:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.google.common.base.Preconditions;
import com.huateng.hadoop.mapred.MapRedAdapter;
import com.huateng.hadoop.mapred.transcoding.format.EncodingOutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GB2312OutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GBKOutputFormat;
import com.huateng.hdfs.common.HDFSClient;
import com.huateng.util.common.StringUtils;
/*
* @author canMao
*/
public class TranscodingJob
{
  String other_code = null;
  private Job internalJob;
  public TranscodingJob(String in_path, String src_charset,
                        String out_path, String dst_charset) throws Exception {
    Preconditions.checkArgument(!StringUtils.hasNullOrEmpty(new String[]{src_charset, dst_charset}),
        "at least one of source_encoding and destination_encoding is null or empty");
    Job job = MapRedAdapter.createJob();
    job.getConfiguration().set("ark.codec", src_charset);
    job.getConfiguration().set("ark.dsccodec", dst_charset);
    job.setJarByClass(TranscodingJob.class);
    job.setMapperClass(TranscodingMapper.class);
    job.setNumReduceTasks(0); // map-only job: records go straight from the mapper to the output format
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    if (dst_charset.equals("UTF-8")) {
      job.setOutputFormatClass(TextOutputFormat.class); // the stock UTF-8 writer is enough
    } else {
      job.setOutputFormatClass(EncodingOutputFormat.class);
    }
    FileInputFormat.setInputPaths(job, new Path(in_path));
    if (HDFSClient.getFileSystem().exists(new Path(out_path))) {
      HDFSClient.getFileSystem().delete(new Path(out_path), true);
    }
    FileOutputFormat.setOutputPath(job, new Path(out_path));
    internalJob = job;
  }
  public boolean submit() throws ClassNotFoundException, IOException, InterruptedException
  {
    // float progress = 0.0f;
    // internalJob.submit();
    // while (true) {
    //   internalJob.mapProgress();
    // }
    return internalJob.waitForCompletion(true);
    // internalJob.submit();
  }
}
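Submitting a transcoding run is then a couple of lines. A hypothetical invocation (the HDFS paths are made up; either charset can be anything the JDK's Charset.forName accepts):

// Hypothetical paths: convert GBK bank logs to UTF-8 for downstream jobs.
TranscodingJob toUtf8 = new TranscodingJob(
    "/data/in/gbk_logs", "GBK",      // input directory and its current encoding
    "/data/out/utf8_logs", "UTF-8"); // output directory and target encoding
boolean ok = toUtf8.submit();        // blocks until the MR job completes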