跳过UTF-8的BOM

/**
version: 1.1 / 2007-01-25
- changed BOM recognition ordering (longer boms first)

Original pseudocode : Thomas Weidenfeller
Implementation tweaked: Aki Nieminen

http://www.unicode.org/unicode/faq/utf_bom.html
BOMs in byte length ordering:
00 00 FE FF = UTF-32, big-endian
FF FE 00 00 = UTF-32, little-endian
EF BB BF = UTF-8
FE FF = UTF-16, big-endian
FF FE = UTF-16, little-endian

Win2k Notepad:
Unicode format = UTF-16LE
***/

/**
 * An {@link InputStream} wrapper that recognizes a leading Unicode byte order
 * mark (BOM), removes it from the data handed to the caller, and reports the
 * encoding that the BOM identifies.
 *
 * <p>Recognized BOMs are UTF-32BE, UTF-32LE, UTF-8, UTF-16BE and UTF-16LE.
 * When no BOM is found, nothing is skipped and the default encoding given to
 * the constructor is reported instead.
 *
 * <p>Detection happens lazily, on the first call to {@link #getEncoding()} or
 * to one of the {@code read} methods.
 */
public class UnicodeInputStream extends InputStream
{
    /**
     * Demo: prints the detected encoding of {@code data.txt} followed by its
     * decoded content in chunks of up to 10 characters.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        UnicodeInputStream ui = new UnicodeInputStream(new FileInputStream("data.txt"), "UTF-8");
        try
        {
            String encoding = ui.getEncoding();
            System.out.println(encoding);
            InputStreamReader reader = new InputStreamReader(ui, encoding);
            CharBuffer cb = CharBuffer.allocate(10);
            while (reader.read(cb) != -1)
            {
                cb.flip();
                System.out.print("===" + cb + "===");
                cb.clear();
            }
        }
        finally
        {
            // Closing the wrapper closes the underlying file even when decoding fails.
            ui.close();
        }
    }

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private boolean isInited = false;
    private String encoding;

    /**
     * Wraps {@code in}; no bytes are read until the encoding is requested or
     * data is read.
     *
     * @param in              the stream that may start with a BOM
     * @param defaultEncoding the encoding to report when no BOM is present
     */
    public UnicodeInputStream(InputStream in, String defaultEncoding)
    {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        defaultEnc = defaultEncoding;
    }

    /**
     * Reads up to {@code BOM_SIZE} bytes, determines the encoding from them
     * and pushes every byte that is not part of a BOM back onto the stream.
     * Calls after the first successful one do nothing.
     *
     * @throws IOException if the underlying stream cannot be read
     */
    void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may legally return fewer bytes than requested, so
        // keep reading until the buffer is full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length)
        {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0)
            {
                break;
            }
            n += count;
        }

        // Every comparison is guarded by the number of bytes actually read;
        // otherwise the zero padding of a short stream (for example a lone
        // FF FE) would be mistaken for part of a longer BOM.
        int skip;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF)
        {
            skip = 4;
            encoding = "UTF-32BE";
        }
        else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00)
        {
            skip = 4;
            encoding = "UTF-32LE";
        }
        else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF)
        {
            skip = 3;
            encoding = "UTF-8";
        }
        else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF)
        {
            skip = 2;
            encoding = "UTF-16BE";
        }
        else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE)
        {
            skip = 2;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        // Return the bytes that follow the BOM so the caller still sees them.
        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /**
     * Returns the encoding identified by the BOM, or the default encoding
     * when the stream does not start with a recognized BOM.
     *
     * @return the detected or default encoding name
     * @throws IllegalStateException if reading the start of the stream fails;
     *                               the {@link IOException} is the cause
     */
    public String getEncoding()
    {
        try
        {
            init();
        }
        catch (IOException e)
        {
            throw new IllegalStateException(e);
        }
        return encoding;
    }

    /**
     * @return the encoding passed to the constructor
     */
    public String getDefaultEncoding()
    {
        return defaultEnc;
    }

    /**
     * Reads the next byte after the BOM, if any.
     *
     * @return the byte as an int in the range 0-255, or -1 at end of stream
     * @throws IOException if the underlying stream cannot be read
     */
    @Override
    public int read() throws IOException
    {
        // Detect and drop the BOM even when getEncoding() was never called.
        init();
        return internalIn.read();
    }

    /**
     * Reads up to {@code len} bytes following the BOM, if any, in one call to
     * the wrapped stream instead of the inherited byte-at-a-time loop.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} of the first byte written
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the underlying stream cannot be read
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        init();
        return internalIn.read(b, off, len);
    }

    /**
     * Closes the wrapped stream.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException
    {
        internalIn.close();
    }
}



/**
 * A {@link Reader} over a byte stream that may start with a Unicode byte
 * order mark (BOM). The BOM, when present, is removed from the data and
 * selects the character encoding used for decoding; otherwise the default
 * encoding given to the constructor is used.
 *
 * <p>Recognized BOMs are UTF-32BE, UTF-32LE, UTF-8, UTF-16BE and UTF-16LE.
 * Detection runs in the constructor, so {@link #getEncoding()} is valid as
 * soon as the object exists.
 */
public class UnicodeReader extends Reader
{
    /**
     * Demo: prints {@code data.txt} line by line, decoded according to its BOM
     * (or as UTF-8 when it has none).
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        BufferedReader br = new BufferedReader(
                new UnicodeReader(new FileInputStream("data.txt"), "UTF-8"));
        try
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
            }
        }
        finally
        {
            // Release the file even when reading fails part-way through.
            br.close();
        }
    }

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private InputStreamReader reader;
    private String encoding;
    private boolean isInited = false;

    /**
     * Sniffs the BOM of {@code in} and prepares a decoder for the resulting
     * encoding. If that fails, the stream is closed before the error is thrown.
     *
     * @param in              the byte stream that may start with a BOM
     * @param defaultEncoding the encoding to use when no BOM is present
     * @throws ExceptionInInitializerError if the start of the stream cannot be
     *         read or the encoding is not supported; the {@link IOException}
     *         is available through {@code getCause()}
     */
    public UnicodeReader(InputStream in, String defaultEncoding)
    {
        defaultEnc = defaultEncoding;
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        try
        {
            // NOTE: init() is protected; an overriding subclass runs here
            // before its own constructor body has executed.
            init();
            reader = new InputStreamReader(internalIn, encoding);
        }
        catch (IOException e)
        {
            try
            {
                internalIn.close();
            }
            catch (IOException e1)
            {
                // Best effort only: the original failure is the one reported.
                e1.printStackTrace();
            }

            // Same error type as before, but carrying the root cause.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * @return the encoding identified by the BOM, or the default encoding
     *         when the stream did not start with a recognized BOM
     */
    public String getEncoding()
    {
        return encoding;
    }

    /**
     * Reads up to {@code BOM_SIZE} bytes, determines the encoding from them
     * and pushes every byte that is not part of a BOM back onto the stream.
     * Calls after the first successful one do nothing.
     *
     * @throws IOException if the underlying stream cannot be read
     */
    protected void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may legally return fewer bytes than requested, so
        // keep reading until the buffer is full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length)
        {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0)
            {
                break;
            }
            n += count;
        }

        // Every comparison is guarded by the number of bytes actually read;
        // otherwise the zero padding of a short stream (for example a lone
        // FF FE) would be mistaken for part of a longer BOM.
        int skip;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF)
        {
            skip = 4;
            encoding = "UTF-32BE";
        }
        else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00)
        {
            skip = 4;
            encoding = "UTF-32LE";
        }
        else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF)
        {
            skip = 3;
            encoding = "UTF-8";
        }
        else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF)
        {
            skip = 2;
            encoding = "UTF-16BE";
        }
        else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE)
        {
            skip = 2;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        // Return the bytes that follow the BOM so the decoder still sees them.
        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /**
     * Closes the decoder and, through it, the underlying stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException
    {
        reader.close();
    }

    /**
     * Reads decoded characters that follow the BOM, if any.
     *
     * @param cbuf destination buffer
     * @param off  offset in {@code cbuf} of the first character written
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if the underlying stream cannot be read
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        return reader.read(cbuf, off, len);
    }
}


参考
http://koti.mbnet.fi/akini/java/unicodereader/
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值