Java正确判别出文件的字符集-带BOM和不带BOM的UTF-8字符

最新推荐文章于 2021-09-01 22:19:41 发布

独正己身

最新推荐文章于 2021-09-01 22:19:41 发布

阅读量1.7k

点赞数

分类专栏： java 文章标签： java

原文链接：http://blog.csdn.net/tibib/article/details/7988735

版权

java 专栏收录该内容

29 篇文章 0 订阅

订阅专栏

Java正确判别出文件的字符集（尤其是带BOM和不带BOM的UTF-8字符）

前几天在项目中需要读取用户上传过来的txt文件，但不确定txt文件的字符集

带BOM的UTF-16（小端以 FF FE 开头，大端以 FE FF 开头，后者也常被称为Unicode）和带BOM的UTF-8（以 EF BB BF 开头）可以根据文件的前三个字节区别：

public String getTxtEncode(FileInputStream in) throws IOException{  
        byte[] head = new byte[3];    
        in.read(head);      
        String code = "GBK";    
        if (head[0] == -1 && head[1] == -2 )    
            code = "UTF-16";    
        if (head[0] == -2 && head[1] == -1 )    
            code = "Unicode";  
        //带BOM  
        if(head[0]==-17 && head[1]==-69 && head[2] ==-65)    
            code = "UTF-8";    
        if("Unicode".equals(code)){  
         code = "UTF-16";  
        }  
        return code;  
 }

但不带BOM的UTF-8和GBK前三个字节不确定，用以上方法无法区别

通过在google上搜索发现，Java的UTF-8解码器不会识别并跳过文件开头的BOM，这是Java遗留的一个bug（JDK-4508058），呵呵，终于找到根源了。针对此bug有一个常用的解决方案，即下面的UnicodeInputStream：

package com.justsy.sts.utf8;  
  
import java.io.*;    
  
/**
 * An input stream that recognizes a Unicode byte order mark (BOM) at the start
 * of the wrapped stream and skips it, provided {@link #getEncoding()} is
 * called before any of the {@code read(...)} methods.
 *
 * <p>Only a BOM is detected. A stream without a BOM is reported with the
 * default encoding passed to the constructor; its content is not inspected.
 *
 * <p>Usage pattern:
 * <pre>
 * String enc = "ISO-8859-1"; // or null to use the system default
 * FileInputStream fis = new FileInputStream(file);
 * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 * enc = uin.getEncoding(); // check and skip possible BOM bytes
 * InputStreamReader in;
 * if (enc == null)
 *     in = new InputStreamReader(uin);
 * else
 *     in = new InputStreamReader(uin, enc);
 * </pre>
 */
public class UnicodeInputStream extends InputStream {
    /** Wrapped stream; pushback lets non-BOM bytes be returned after the probe. */
    PushbackInputStream internalIn;
    /** True once the BOM probe has run, or has been bypassed by a read/close. */
    boolean isInited = false;
    /** Encoding reported when no BOM is present; may be null. */
    String defaultEnc;
    /** Encoding determined by {@link #init()}; null until then. */
    String encoding;

    /** Longest BOM that is recognized (UTF-32), in bytes. */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps {@code in} for BOM detection.
     *
     * @param in         stream positioned at the first byte of the data
     * @param defaultEnc encoding to report when no BOM is found; may be null
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the encoding supplied to the constructor
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding indicated by the BOM, probing the stream on the
     * first call. If a read method or {@link #close()} was called first, no
     * probe is performed and the value is whatever was determined before
     * (null if nothing was).
     *
     * @return the detected encoding, or the default encoding if no BOM exists
     * @throws IllegalStateException if probing the stream fails; the
     *         underlying {@link IOException} is available as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real I/O failure; an exception must not be set as
                // its own cause (Throwable.initCause rejects that).
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks for BOM marks. Extra bytes are
     * unread back to the stream; only the BOM bytes are skipped.
     *
     * @throws IOException if reading from or unreading to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may deliver fewer bytes than requested, so fill the
        // probe buffer until it is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        // Each test also checks how many bytes were really read: the buffer is
        // zero-filled, so a two-byte stream FF FE would otherwise look like the
        // UTF-32LE mark FF FE 00 00.
        int bomLength;
        if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Push everything after the BOM back so the caller sees it again.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /**
     * Closes the wrapped stream. No BOM probe is performed.
     *
     * @throws IOException if closing the wrapped stream fails
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. If {@link #getEncoding()} has not been called yet, the
     * BOM probe is bypassed and any BOM bytes are returned as data.
     *
     * @return the next byte, or -1 at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read that delegates to the wrapped stream instead of falling back
     * to one {@link #read()} call per byte. Like {@link #read()}, it bypasses
     * the BOM probe if {@link #getEncoding()} has not been called yet.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} at which to start storing bytes
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}

通过使用上述InputStream类的实现，可以正确地读取带BOM和不带BOM的文件（注意：必须在读取数据之前先调用getEncoding()，否则BOM不会被跳过）：

package com.justsy.sts.utf8;  
  
import java.io.BufferedReader;    
import java.io.File;    
import java.io.FileInputStream;    
import java.io.IOException;    
import java.io.InputStreamReader;  
import java.nio.charset.Charset;  
    
public class UTF8Test {    
    public static void main(String[] args) throws IOException {    
        File f  = new File("D:"+File.separator+"Order.txt");    
        FileInputStream in = new FileInputStream(f);    
        String dc  = Charset.defaultCharset().name();  
        UnicodeInputStream uin = new UnicodeInputStream(in,dc);  
        BufferedReader br = new BufferedReader(new InputStreamReader(uin));    
        String line = br.readLine();    
        while(line != null)    
        {    
            System.out.println(line);    
            line = br.readLine();    
        }    
    }    
}

结合上述方案，我们就可以比较完整地判别出各种字符集了：

public String getTxtEncode(FileInputStream in) throws IOException{  
   
 String dc  = Charset.defaultCharset().name();  
       UnicodeInputStream uin = new UnicodeInputStream(in,dc);  
         
       if("UTF-8".equals(uin.getEncoding())){  
        uin.close();  
        return "UTF-8";  
       }  
       uin.close();  
         
       byte[] head = new byte[3];    
       in.read(head);      
       String code = "GBK";    
       if (head[0] == -1 && head[1] == -2 )    
           code = "UTF-16";    
       if (head[0] == -2 && head[1] == -1 )    
           code = "Unicode";  
       //带BOM  
       if(head[0]==-17 && head[1]==-69 && head[2] ==-65)    
           code = "UTF-8";    
       if("Unicode".equals(code)){  
        code = "UTF-16";  
       }  
       return code;  
}

本文的转载地址为：http://blog.csdn.net/tibib/article/details/7988735