hadoop解决中文输出乱码

最新推荐文章于 2023-06-06 10:07:45 发布

码上富贵

最新推荐文章于 2023-06-06 10:07:45 发布

阅读量9.5k

点赞数

分类专栏： Hadoop 文章标签：编码乱码 java hadoop mapreduce

Hadoop 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

hadoop中涉及文本输出的默认编码统一为不带BOM的UTF-8，而Windows系统对中文默认使用GBK编码。某些格式的文件（例如CSV）如果以不带BOM的UTF-8编码输出，用Excel打开时会显示为乱码，只能用UE（UltraEdit）或者记事本打开才能正常显示。因此，将hadoop的默认输出编码更改为GBK是一个非常常见的需求。

方法一：按指定编码（例如GBK）对Text中的原始字节进行解码，适用于输入数据本身为GBK编码的情况。

// Decode the bytes held by `value` as GBK into a Java String (see transformText below).
String line = transformText(value, "gbk");

/**
 * Decodes the content of a {@link Text} using the named charset.
 *
 * <p>Only the first {@code text.getLength()} bytes of {@code text.getBytes()} are decoded
 * (per Hadoop's Text documentation, only data up to getLength() is valid).
 *
 * @param text the Text whose raw bytes are to be decoded
 * @param encoding name of the charset the bytes are encoded in, e.g. {@code "gbk"}
 * @return the decoded string
 * @throws IllegalArgumentException if {@code encoding} is not a supported charset name
 */
public static String transformText(Text text, String encoding) {
    try {
        return new String(text.getBytes(), 0, text.getLength(), encoding);
    } catch (UnsupportedEncodingException e) {
        // Fail fast instead of printing the stack trace and returning null, which would only
        // surface later as an unrelated NullPointerException in the caller.
        throw new IllegalArgumentException("can't find " + encoding + " encoding", e);
    }
}

方法二：

默认情况下，MR主程序中设定输出格式（它同时决定了输出编码）的语句为：

    Java代码   
    
 // Default setup: register TextOutputFormat as the job's output format class.
 job.setOutputFormatClass(TextOutputFormat.class);

    Java代码   
    
 TextOutputFormat.class

的代码如下：

    Java代码   
    
  
 /** 
  * Licensed to the Apache Software Foundation (ASF) under one 
  * or more contributor license agreements.  See the NOTICE file 
  * distributed with this work for additional information 
  * regarding copyright ownership.  The ASF licenses this file 
  * to you under the Apache License, Version 2.0 (the 
  * "License"); you may not use this file except in compliance 
  * with the License.  You may obtain a copy of the License at 
  * 
  *     http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software 
  * distributed under the License is distributed on an "AS IS" BASIS, 
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */  
   
 package org.apache.hadoop.mapreduce.lib.output;  
   
 import java.io.DataOutputStream;  
 import java.io.IOException;  
 import java.io.UnsupportedEncodingException;  
   
 import org.apache.hadoop.classification.InterfaceAudience;  
 import org.apache.hadoop.classification.InterfaceStability;  
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.fs.FileSystem;  
 import org.apache.hadoop.fs.Path;  
 import org.apache.hadoop.fs.FSDataOutputStream;  
   
 import org.apache.hadoop.io.NullWritable;  
 import org.apache.hadoop.io.Text;  
 import org.apache.hadoop.io.compress.CompressionCodec;  
 import org.apache.hadoop.io.compress.GzipCodec;  
 import org.apache.hadoop.mapreduce.OutputFormat;  
 import org.apache.hadoop.mapreduce.RecordWriter;  
 import org.apache.hadoop.mapreduce.TaskAttemptContext;  
 import org.apache.hadoop.util.*;  
   
 /** An {@link OutputFormat} that writes plain text files. */
 @InterfaceAudience.Public
 @InterfaceStability.Stable
 public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
   public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";  // config key read in getRecordWriter for the key/value separator
   protected static class LineRecordWriter<K, V>
     extends RecordWriter<K, V> {
     private static final String utf8 = "UTF-8";  // article note: change UTF-8 to GBK here
     // Line terminator, encoded once with the charset named by utf8.
     private static final byte[] newline;
     static {
       try {
         newline = "\n".getBytes(utf8);
       } catch (UnsupportedEncodingException uee) {
         throw new IllegalArgumentException("can't find " + utf8 + " encoding");
       }
     }

     // Stream that receives every record; closed by close().
     protected DataOutputStream out;
     // Bytes written between key and value, encoded once in the constructor.
     private final byte[] keyValueSeparator;

     /**
      * Creates a writer that emits records to the given stream.
      * @param out the stream records are written to
      * @param keyValueSeparator text placed between key and value; encoded with
      *        the charset named by utf8
      * @throws IllegalArgumentException if that charset is not available
      */
     public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
       this.out = out;
       try {
         this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
       } catch (UnsupportedEncodingException uee) {
         throw new IllegalArgumentException("can't find " + utf8 + " encoding");
       }
     }

     /**
      * Creates a writer that uses a tab character as the key/value separator.
      * @param out the stream records are written to
      */
     public LineRecordWriter(DataOutputStream out) {
       this(out, "\t");
     }

     /**
      * Write the object to the byte stream, handling Text as a special
      * case.
      * @param o the object to print
      * @throws IOException if the write throws, we pass it on
      */
     // NOTE(review): the article marks the next three lines as "comment out". Commenting out
     // only those three leaves the `if (o instanceof Text)` guard active with no else branch,
     // so non-Text objects would no longer be written at all; the guard itself and its
     // closing brace would have to be removed as well.
     private void writeObject(Object o) throws IOException {
       if (o instanceof Text) {
         Text to = (Text) o;   // article note: comment out this line
         out.write(to.getBytes(), 0, to.getLength());  // article note: comment out this line
       } else { // article note: comment out this line
         out.write(o.toString().getBytes(utf8));
       }
     }

     /**
      * Writes one record as a line: key, separator, value, newline.
      * Nothing is written when both key and value are null or NullWritable;
      * the separator is written only when both key and value are present.
      * @param key the record key, may be null or NullWritable
      * @param value the record value, may be null or NullWritable
      * @throws IOException if writing to the stream fails
      */
     public synchronized void write(K key, V value)
       throws IOException {

       boolean nullKey = key == null || key instanceof NullWritable;
       boolean nullValue = value == null || value instanceof NullWritable;
       if (nullKey && nullValue) {
         return;
       }
       if (!nullKey) {
         writeObject(key);
       }
       if (!(nullKey || nullValue)) {
         out.write(keyValueSeparator);
       }
       if (!nullValue) {
         writeObject(value);
       }
       out.write(newline);
     }

     /**
      * Closes the underlying output stream.
      * @param context the task attempt context (not used by this method)
      * @throws IOException if closing the stream fails
      */
     public synchronized
     void close(TaskAttemptContext context) throws IOException {
       out.close();
     }
   }

   /**
    * Creates the LineRecordWriter for a task attempt.
    * The separator is read from the SEPERATOR configuration key (default "\t").
    * When getCompressOutput(job) is true, the output stream is wrapped by the codec
    * obtained from getOutputCompressorClass (GzipCodec is passed as the default) and
    * the codec's default extension is passed to getDefaultWorkFile.
    * @param job the task attempt context supplying the configuration
    * @return a writer targeting the file returned by getDefaultWorkFile
    * @throws IOException if the output file cannot be created
    * @throws InterruptedException declared by the signature; not thrown directly here
    */
   public RecordWriter<K, V>
          getRecordWriter(TaskAttemptContext job
                          ) throws IOException, InterruptedException {
     Configuration conf = job.getConfiguration();
     boolean isCompressed = getCompressOutput(job);
     String keyValueSeparator= conf.get(SEPERATOR, "\t");
     CompressionCodec codec = null;
     String extension = "";
     if (isCompressed) {
       Class<? extends CompressionCodec> codecClass =
         getOutputCompressorClass(job, GzipCodec.class);
       codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
       extension = codec.getDefaultExtension();
     }
     Path file = getDefaultWorkFile(job, extension);
     FileSystem fs = file.getFileSystem(conf);
     // Both branches create the file with `false` as the second argument
     // (presumably the overwrite flag - confirm against FileSystem.create).
     if (!isCompressed) {
       FSDataOutputStream fileOut = fs.create(file, false);
       return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
     } else {
       FSDataOutputStream fileOut = fs.create(file, false);
       return new LineRecordWriter<K, V>(new DataOutputStream
                                         (codec.createOutputStream(fileOut)),
                                         keyValueSeparator);
     }
   }
 }

从上述代码的第48行（即 private static final String utf8 = "UTF-8"; 这一行）可以看出，hadoop已经将此输出格式的编码限定为UTF-8。因此，要改变hadoop输出文本的编码，只需定义一个与TextOutputFormat基本相同的类GbkOutputFormat，同样继承FileOutputFormat（注意是org.apache.hadoop.mapreduce.lib.output.FileOutputFormat），并将其中的编码改为GBK即可，代码如下：

    Java代码   
    
  
 import java.io.DataOutputStream;  
 import java.io.IOException;  
 import java.io.UnsupportedEncodingException;  
   
 import org.apache.hadoop.classification.InterfaceAudience;  
 import org.apache.hadoop.classification.InterfaceStability;  
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.fs.FileSystem;  
 import org.apache.hadoop.fs.Path;  
 import org.apache.hadoop.fs.FSDataOutputStream;  
   
 import org.apache.hadoop.io.NullWritable;  
 import org.apache.hadoop.io.Text;  
 import org.apache.hadoop.io.compress.CompressionCodec;  
 import org.apache.hadoop.io.compress.GzipCodec;  
 import org.apache.hadoop.mapreduce.OutputFormat;  
 import org.apache.hadoop.mapreduce.RecordWriter;  
 import org.apache.hadoop.mapreduce.TaskAttemptContext;  
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
 import org.apache.hadoop.util.*;  
   
 @InterfaceAudience.Public  
 @InterfaceStability.Stable  
 public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {  
   public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";  
   protected static class LineRecordWriter<K, V>  
     extends RecordWriter<K, V> {  
     private static final String utf8 = "GBK";  
     private static final byte[] newline;  
     static {  
       try {  
         newline = "\n".getBytes(utf8);  
       } catch (UnsupportedEncodingException uee) {  
         throw new IllegalArgumentException("can't find " + utf8 + " encoding");  
       }  
     }  
   
     protected DataOutputStream out;  
     private final byte[] keyValueSeparator;  
   
     public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {  
       this.out = out;  
       try {  
         this.keyValueSeparator = keyValueSeparator.getBytes(utf8);  
       } catch (UnsupportedEncodingException uee) {  
         throw new IllegalArgumentException("can't find " + utf8 + " encoding");  
       }  
     }  
   
     public LineRecordWriter(DataOutputStream out) {  
       this(out, "\t");  
     }  
   
     /** 
      * Write the object to the byte stream, handling Text as a special 
      * case. 
      * @param o the object to print 
      * @throws IOException if the write throws, we pass it on 
      */  
     private void writeObject(Object o) throws IOException {  
       if (o instanceof Text) {  
 //        Text to = (Text) o;  
 //        out.write(to.getBytes(), 0, to.getLength());  
 //      } else {  
         out.write(o.toString().getBytes(utf8));  
       }  
     }  
   
     public synchronized void write(K key, V value)  
       throws IOException {  
   
       boolean nullKey = key == null || key instanceof NullWritable;  
       boolean nullValue = value == null || value instanceof NullWritable;  
       if (nullKey && nullValue) {  
         return;  
       }  
       if (!nullKey) {  
         writeObject(key);  
       }  
       if (!(nullKey || nullValue)) {  
         out.write(keyValueSeparator);  
       }  
       if (!nullValue) {  
         writeObject(value);  
       }  
       out.write(newline);  
     }  
   
     public synchronized   
     void close(TaskAttemptContext context) throws IOException {  
       out.close();  
     }  
   }  
   
   public RecordWriter<K, V>   
          getRecordWriter(TaskAttemptContext job  
                          ) throws IOException, InterruptedException {  
     Configuration conf = job.getConfiguration();  
     boolean isCompressed = getCompressOutput(job);  
     String keyValueSeparator= conf.get(SEPERATOR, "\t");  
     CompressionCodec codec = null;  
     String extension = "";  
     if (isCompressed) {  
       Class<? extends CompressionCodec> codecClass =   
         getOutputCompressorClass(job, GzipCodec.class);  
       codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);  
       extension = codec.getDefaultExtension();  
     }  
     Path file = getDefaultWorkFile(job, extension);  
     FileSystem fs = file.getFileSystem(conf);  
     if (!isCompressed) {  
       FSDataOutputStream fileOut = fs.create(file, false);  
       return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);  
     } else {  
       FSDataOutputStream fileOut = fs.create(file, false);  
       return new LineRecordWriter<K, V>(new DataOutputStream  
                                         (codec.createOutputStream(fileOut)),  
                                         keyValueSeparator);  
     }  
   }  
 }  

最后将作业的输出格式类设置为GbkOutputFormat.class，如：

    Java代码   
    
 // Register the GbkOutputFormat class defined above as the job's output format.
 job.setOutputFormatClass(GbkOutputFormat.class);

码上富贵

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录