前言
写在前面,如果输入的文件是UTF-8,不需要进行任何转换处理。
如果是其他类型的字符,则需要进行处理,否则Hadoop显示和处理会出错,而且写入输出文件是乱码。
思路
读取文件时,如果文件的编码是GBK,那么用Hadoop默认的方式读取会出现乱码,同时也会影响后续操作。
因此在读取的时候需要按照GBK的格式进行读取。
当处理完成后,需要将输出文件也设置成GBK的格式进行输出。
在map函数中执行读取的操作进行编码转换。
// Mapper that re-decodes each input line from GBK before any processing.
// NOTE: the "..." lines below are elisions from the original article, not code.
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable>
{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
// Text exposes the raw file bytes; reinterpret them as GBK here instead of
// letting the default Text/String path assume UTF-8 (which yields mojibake).
String line = new String(value.getBytes(),0,value.getLength(),"GBK");
......................
}
...............
}
输出的时候需要将输出文件的内容设置成GBK,这个时候需要重写输出处理类。
以前用的是这个:
job.setOutputFormatClass(TextOutputFormat.class);
代码实现
现在实现一个新的输出类,基本上就是复制粘贴了TextOutputFormat这个类,只做了很少的改动。
TextOutputFormat源码
// Verbatim copy of Hadoop's stock TextOutputFormat, quoted for comparison with
// the GBK variant below. Note the hard-coded "UTF-8" charset: every non-Text
// value is encoded as UTF-8, which is why GBK output needs a custom class.
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
// Configuration key for the key/value separator (defaults to "\t" below).
public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
// Writes one "key<separator>value\n" line per record to the output stream.
protected static class LineRecordWriter<K, V>
extends RecordWriter<K, V> {
private static final String utf8 = "UTF-8";
// Line terminator, encoded once at class-load time.
private static final byte[] newline;
static {
try {
newline = "\n".getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
// Unreachable in practice: UTF-8 is a charset every JVM must support.
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
protected DataOutputStream out;
private final byte[] keyValueSeparator;
public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
this.out = out;
try {
this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
public LineRecordWriter(DataOutputStream out) {
this(out, "\t");
}
/**
 * Write the object to the byte stream, handling Text as a special
 * case.
 * @param o the object to print
 * @throws IOException if the write throws, we pass it on
 */
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
// Fast path: Text already stores UTF-8 bytes, so write them raw.
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
// Everything else goes through toString() and is encoded as UTF-8.
out.write(o.toString().getBytes(utf8));
}
}
// Emits "key<sep>value\n"; a null/NullWritable key or value is skipped,
// and a record with both null produces no output at all.
public synchronized void write(K key, V value)
throws IOException {
boolean nullKey = key == null || key instanceof NullWritable;
boolean nullValue = value == null || value instanceof NullWritable;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
public synchronized
void close(TaskAttemptContext context) throws IOException {
out.close();
}
}
// Builds the per-task writer: resolves the separator from the job config and
// wraps the output file in a compression codec when compression is enabled.
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
boolean isCompressed = getCompressOutput(job);
String keyValueSeparator= conf.get(SEPERATOR, "\t");
CompressionCodec codec = null;
String extension = "";
if (isCompressed) {
Class<? extends CompressionCodec> codecClass =
getOutputCompressorClass(job, GzipCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
extension = codec.getDefaultExtension();
}
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
if (!isCompressed) {
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
} else {
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<K, V>(new DataOutputStream
(codec.createOutputStream(fileOut)),
keyValueSeparator);
}
}
}
GBKTextOutputFormat的源码
/**
 * Drop-in replacement for the stock TextOutputFormat that encodes every output
 * line as GBK instead of the hard-coded UTF-8. Install it with
 * {@code job.setOutputFormatClass(GBKTextOutputFormat.class)}.
 */
public class GBKTextOutputFormat<K, V> extends FileOutputFormat<K, V>
{
    // Same configuration key as TextOutputFormat so existing job configs keep working.
    public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

    /** Writes one "key<separator>value\n" line per record, GBK-encoded. */
    protected static class LineRecordWriter<K, V> extends RecordWriter<K, V>
    {
        // Charset.forName throws UnsupportedCharsetException (a subclass of
        // IllegalArgumentException) at class-load time if GBK is unavailable,
        // matching the failure mode of the former getBytes(String) version
        // while eliminating the checked UnsupportedEncodingException plumbing.
        private static final java.nio.charset.Charset GBK = java.nio.charset.Charset.forName("GBK");

        // Line terminator, encoded once at class-load time.
        private static final byte[] newline = "\n".getBytes(GBK);

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator)
        {
            this.out = out;
            // String.getBytes(Charset) never throws a checked exception, so no
            // try/catch boilerplate is needed here.
            this.keyValueSeparator = keyValueSeparator.getBytes(GBK);
        }

        public LineRecordWriter(DataOutputStream out)
        {
            this(out, "\t");
        }

        /**
         * Write the object to the byte stream, always re-encoding through
         * toString() as GBK. Unlike the stock TextOutputFormat there is
         * deliberately no raw-bytes fast path for Text: Text stores UTF-8
         * internally, so it must be transcoded here as well.
         *
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException
        {
            out.write(o.toString().getBytes(GBK));
        }

        /**
         * Emits "key<separator>value\n". A null/NullWritable key or value is
         * skipped; if both are null, no output is produced for the record.
         */
        public synchronized void write(K key, V value) throws IOException
        {
            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (nullKey && nullValue)
            {
                return;
            }
            if (!nullKey)
            {
                writeObject(key);
            }
            if (!(nullKey || nullValue))
            {
                out.write(keyValueSeparator);
            }
            if (!nullValue)
            {
                writeObject(value);
            }
            out.write(newline);
        }

        public synchronized void close(TaskAttemptContext context) throws IOException
        {
            out.close();
        }
    }

    /**
     * Builds the per-task writer: resolves the separator from the job config
     * and wraps the output file in a compression codec when enabled.
     */
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException
    {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = conf.get(SEPERATOR, "\t");
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed)
        {
            Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
            codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
            extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        if (!isCompressed)
        {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
        }
        else
        {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                    keyValueSeparator);
        }
    }
}
使用的时候
job.setOutputFormatClass(GBKTextOutputFormat.class);
看输出结果,正确。
原因是Hadoop为了通用性,内部所有处理都采用UTF-8编码;这里只是针对GBK的特殊处理,建议输入文件尽量采用UTF-8格式。