Java根据文件的BOM判断文件的编码类型

[size=medium] 随着多字节文本数据的处理,尤其是非ASCII文字(典型的如中文)的出现,文件的编码判断就提上了日程。很多字节流和字符流默认处理的编码格式与程序文件的编码一致,例如:程序文件编码是UTF-8,默认处理的文本也是UTF-8。处理其他格式的文本时,如果不提供具体的编码,就非常容易把文本解码成乱码。

当前处理的方式一般通过相关reader或writer的装饰类:[/size]
InputStreamReader(InputStream in, String charsetName) 
OutputStreamWriter(OutputStream out, String charsetName) 

[size=medium] 实现显式地将字符编码传进去,但是无法自动发现文件的字符编码,也就是说,此种模式仅仅支持由用户指定文件的编码。

对中文字符而言,编码主要有GBK(GB2312,GB18030)系列和UTF系列之分。UTF系列编码的文件通常在头部的若干个字节中已经告诉用户此文件的字符编码格式,即文件包含BOM(Byte Order Mark),此标志标识了文件的编码方式,常见的有:
[/size]
[size=large]BOMs:[/size]
[size=medium][list]
[*] 00 00 FE FF = UTF-32, big-endian
[*] FF FE 00 00 = UTF-32, little-endian
[*] EF BB BF = UTF-8,
[*] FE FF = UTF-16, big-endian
[*] FF FE = UTF-16, little-endian
[/list][/size]

[size=medium]在此处提供两个输入流方法,一种是基于字符的reader:[/size]
/**
 * Character reader that chooses its decoder from the Unicode byte order mark
 * (BOM) found at the start of the wrapped byte stream.
 *
 * <p>Recognized BOMs (http://www.unicode.org/unicode/faq/utf_bom.html):
 * <pre>
 * 00 00 FE FF = UTF-32, big-endian
 * FF FE 00 00 = UTF-32, little-endian
 * EF BB BF    = UTF-8
 * FE FF       = UTF-16, big-endian
 * FF FE       = UTF-16, little-endian
 * </pre>
 *
 * <p>Win2k Notepad: "Unicode" format = UTF-16LE.
 *
 * <p>The BOM bytes themselves are consumed and never handed to the decoder.
 * When no BOM is present, the default encoding given at construction is used.
 * Detection is lazy: nothing is read from the stream until the first
 * {@code read} call (or an explicit {@link #init()}).
 *
 * @author Semantic Wang
 */
public class UnicodeReader extends Reader {

    /** Wrapped stream; the push-back buffer holds the non-BOM bytes read during detection. */
    PushbackInputStream pbin;

    /** Decoder created by {@link #init()}; {@code null} until detection has run. */
    InputStreamReader reader = null;

    /** Encoding used when the stream carries no BOM; {@code null} means platform default. */
    String defaultEnc;

    /** Length of the longest BOM (UTF-32), i.e. how many bytes are read ahead. */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a reader that falls back to GBK when the stream has no BOM.
     *
     * @param in
     *            input stream to be read
     */
    public UnicodeReader(InputStream in) {
        this(in, "GBK");
    }

    /**
     * Creates a reader with an explicit fallback encoding.
     *
     * @param in
     *            input stream to be read
     * @param defaultEnc
     *            default encoding if the stream does not have a BOM marker.
     *            Give {@code null} to use the system-level default.
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        pbin = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the fallback encoding passed to the constructor (may be {@code null})
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding reported by the underlying {@link InputStreamReader},
     * or {@code null} if the stream is uninitialized. Call {@link #init()} or a
     * {@code read} method to initialize it.
     *
     * @return the decoder's encoding name, or {@code null} before initialization
     */
    public String getEncoding() {
        if (reader == null) {
            return null;
        }
        return reader.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of the BOM are pushed back onto the stream; only the BOM bytes
     * are skipped. Does nothing if detection has already run.
     *
     * <p>Note: {@code FF FE 00 00} is inherently ambiguous (UTF-32LE BOM, or a
     * UTF-16LE BOM followed by U+0000); it is treated as UTF-32LE.
     *
     * @throws IOException
     *             if reading the stream fails or the selected encoding is not
     *             supported
     */
    protected void init() throws IOException {
        if (reader != null) {
            return;
        }

        // A single read() may legally return fewer bytes than requested even
        // when more are coming, so keep reading until the look-ahead buffer is
        // full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < BOM_SIZE) {
            int got = pbin.read(bom, n, BOM_SIZE - n);
            if (got <= 0) {
                break;
            }
            n += got;
        }

        // Longest signatures first: the UTF-32LE BOM starts with the UTF-16LE one.
        String encoding;
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            pbin.unread(bom, bomLength, unread);
        }

        // A null encoding selects the platform default charset.
        if (encoding == null) {
            reader = new InputStreamReader(pbin);
        } else {
            reader = new InputStreamReader(pbin, encoding);
        }
    }

    /**
     * Tells whether the first {@code filled} bytes of {@code buf} begin with
     * the given signature. Bytes beyond {@code filled} were never read and are
     * not considered.
     */
    private static boolean startsWith(byte[] buf, int filled, int... signature) {
        if (filled < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buf[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the reader and the wrapped stream. Closing a reader that was
     * never read from does not trigger BOM detection; the wrapped stream is
     * simply closed.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        } else {
            pbin.close();
        }
    }

    /**
     * Reads decoded characters, running BOM detection first if necessary.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return reader.read(cbuf, off, len);
    }

}

[size=medium]另一种是基于字节的输入流InputStream:[/size]

/**
 * Byte stream that detects a Unicode byte order mark (BOM) at the start of the
 * wrapped stream, reports the matching encoding name, and skips the BOM bytes
 * so that callers only see the payload.
 *
 * <p>Recognized BOMs:
 * <pre>
 * 00 00 FE FF = UTF-32, big-endian
 * FF FE 00 00 = UTF-32, little-endian
 * EF BB BF    = UTF-8
 * FE FF       = UTF-16, big-endian
 * FF FE       = UTF-16, little-endian
 * </pre>
 *
 * <p>When no BOM is present, no bytes are skipped and {@link #getEncoding()}
 * returns the default encoding given at construction.
 *
 * @author Semantic Wang
 */
public class UnicodeInputStream extends InputStream {

    /** Wrapped stream; the push-back buffer holds the non-BOM bytes read during detection. */
    PushbackInputStream pbin;

    /** Set once BOM detection has run (or the stream has been closed). */
    boolean isInited = false;

    /** Encoding reported when the stream carries no BOM. */
    String defaultEnc;

    /** Encoding chosen by {@link #init()}; {@code null} until detection has run. */
    String encoding;

    /** Length of the longest BOM (UTF-32), i.e. how many bytes are read ahead. */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a stream that reports GBK when the input has no BOM.
     *
     * @param in
     *            input stream to be read
     */
    public UnicodeInputStream(InputStream in) {
        this(in, "GBK");
    }

    /**
     * Creates a stream with an explicit fallback encoding.
     *
     * @param in
     *            input stream to be read
     * @param defaultEnc
     *            encoding name reported if the stream does not have a BOM
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        pbin = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the fallback encoding passed to the constructor
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the detected encoding, running BOM detection first if it has not
     * happened yet.
     *
     * @return the encoding name selected by the BOM, or the default encoding
     *         when no BOM is present; {@code null} if the stream was closed
     *         before detection ran
     * @throws IllegalStateException
     *             if reading the stream during detection fails; the
     *             {@link IOException} is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real I/O failure; an exception cannot be its own cause.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of the BOM are pushed back onto the stream; only the BOM bytes
     * are skipped. Does nothing if detection has already run.
     *
     * <p>Note: {@code FF FE 00 00} is inherently ambiguous (UTF-32LE BOM, or a
     * UTF-16LE BOM followed by U+0000); it is treated as UTF-32LE.
     *
     * @throws IOException
     *             if reading the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        // A single read() may legally return fewer bytes than requested even
        // when more are coming, so keep reading until the look-ahead buffer is
        // full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < BOM_SIZE) {
            int got = pbin.read(bom, n, BOM_SIZE - n);
            if (got <= 0) {
                break;
            }
            n += got;
        }

        // Longest signatures first: the UTF-32LE BOM starts with the UTF-16LE one.
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            pbin.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /**
     * Tells whether the first {@code filled} bytes of {@code buf} begin with
     * the given signature. Bytes beyond {@code filled} were never read and are
     * not considered.
     */
    private static boolean startsWith(byte[] buf, int filled, int... signature) {
        if (filled < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buf[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the wrapped stream without running BOM detection. The stream is
     * marked as initialized so that a later {@link #getEncoding()} does not
     * attempt to read from the closed stream.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        pbin.close();
    }

    /**
     * Reads the next payload byte, running BOM detection first so that BOM
     * bytes are never returned as data.
     *
     * @return the next byte (0-255), or -1 at end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        init();
        return pbin.read();
    }

    /**
     * Bulk variant of {@link #read()}; delegates to the wrapped stream after
     * BOM detection instead of falling back to byte-at-a-time reads.
     *
     * @param b
     *            destination buffer
     * @param off
     *            offset at which to start storing bytes
     * @param len
     *            maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        init();
        return pbin.read(b, off, len);
    }

}


[size=medium]最后的调用方式为:[/size]

// Open the raw bytes of the file.
InputStream in = new FileInputStream(fileName);
// UnicodeReader selects the charset from the BOM (GBK when the file has none).
Reader decoded = new UnicodeReader(in);
// Buffer the decoded characters for line-oriented reading.
BufferedReader reader = new BufferedReader(decoded);
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值