/**
 * An {@link InputStream} wrapper that recognises a leading Unicode byte order
 * mark (BOM), removes it from the data handed to the caller and reports the
 * encoding the BOM stands for. When no BOM is present the stream is passed
 * through untouched and the caller-supplied default encoding is reported.
 *
 * <p>version: 1.1 / 2007-01-25<br>
 * - changed BOM recognition ordering (longer BOMs first)
 *
 * <p>Original pseudocode: Thomas Weidenfeller<br>
 * Implementation tweaked: Aki Nieminen<br>
 * http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <p>BOMs in byte length ordering:
 * <pre>
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 * </pre>
 *
 * <p>Win2k Notepad: "Unicode" format = UTF-16LE
 *
 * <p>Because the four-byte marks are tested first, a UTF-16LE stream whose
 * first character after the BOM is U+0000 ({@code FF FE 00 00}) is reported as
 * UTF-32LE; the two cases cannot be told apart from the bytes alone.
 */
public class UnicodeInputStream extends InputStream
{
    /** Length of the longest BOM; also the capacity of the push-back buffer. */
    private static final int BOM_SIZE = 4;

    private static final byte[] UTF32_BE_BOM = {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF32_LE_BOM = {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00};
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private boolean isInited = false;
    private String encoding;

    /**
     * Demo: prints the detected encoding of {@code data.txt} followed by its
     * decoded content in chunks of at most ten characters.
     *
     * @param args ignored
     * @throws Exception if the file cannot be opened, read or decoded
     */
    public static void main(String[] args) throws Exception
    {
        UnicodeInputStream ui = new UnicodeInputStream(new FileInputStream("data.txt"), "UTF-8");
        try
        {
            String encoding = ui.getEncoding();
            System.out.println(encoding);
            InputStreamReader reader = new InputStreamReader(ui, encoding);
            CharBuffer cb = CharBuffer.allocate(10);
            while (reader.read(cb) != -1)
            {
                cb.flip();
                System.out.print("===" + cb + "===");
                cb.clear();
            }
        }
        finally
        {
            // Closing the wrapper closes the underlying file even when
            // detection or decoding failed part-way through.
            ui.close();
        }
    }

    /**
     * Wraps {@code in}. Nothing is read from the stream until the first call
     * to {@link #getEncoding()} or one of the {@code read} methods.
     *
     * @param in              the stream to inspect and read from
     * @param defaultEncoding the encoding to report when the stream has no BOM
     */
    public UnicodeInputStream(InputStream in, String defaultEncoding)
    {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        defaultEnc = defaultEncoding;
    }

    /**
     * Reads up to {@value #BOM_SIZE} bytes from the start of the stream,
     * determines the encoding and pushes every non-BOM byte back so that later
     * reads see it. Calls after the first successful one do nothing.
     *
     * @throws IOException if reading from or pushing back into the stream fails
     */
    void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may return fewer bytes than requested even though
        // more follow, so keep reading until the buffer is full or the stream
        // ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length)
        {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0)
            {
                break;
            }
            n += count;
        }

        // Only the n bytes actually read take part in the comparison; the
        // zero padding of a short stream must never be mistaken for BOM bytes.
        int skip;
        if (startsWith(bom, n, UTF32_BE_BOM))
        {
            skip = UTF32_BE_BOM.length;
            encoding = "UTF-32BE";
        }
        else if (startsWith(bom, n, UTF32_LE_BOM))
        {
            skip = UTF32_LE_BOM.length;
            encoding = "UTF-32LE";
        }
        else if (startsWith(bom, n, UTF8_BOM))
        {
            skip = UTF8_BOM.length;
            encoding = "UTF-8";
        }
        else if (startsWith(bom, n, UTF16_BE_BOM))
        {
            skip = UTF16_BE_BOM.length;
            encoding = "UTF-16BE";
        }
        else if (startsWith(bom, n, UTF16_LE_BOM))
        {
            skip = UTF16_LE_BOM.length;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /** Returns true when the first {@code length} bytes of {@code data} begin with {@code prefix}. */
    private static boolean startsWith(byte[] data, int length, byte[] prefix)
    {
        if (length < prefix.length)
        {
            return false;
        }
        for (int i = 0; i < prefix.length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the encoding indicated by the BOM, or the default encoding when
     * the stream does not start with one. Triggers detection if it has not
     * happened yet.
     *
     * @return the detected or default encoding name
     * @throws IllegalStateException if the stream cannot be read during detection
     */
    public String getEncoding()
    {
        try
        {
            init();
        }
        catch (IOException e)
        {
            throw new IllegalStateException(e);
        }
        return encoding;
    }

    /**
     * @return the encoding passed to the constructor
     */
    public String getDefaultEncoding()
    {
        return defaultEnc;
    }

    /**
     * Reads the next byte after the BOM (if any).
     *
     * @return the byte as an int in 0..255, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException
    {
        // Detection must run before the first byte is handed out, otherwise
        // the BOM itself would be returned as data.
        init();
        return internalIn.read();
    }

    /**
     * Bulk variant of {@link #read()}; delegates to the wrapped stream after
     * BOM detection so callers are not reduced to byte-at-a-time reads.
     *
     * @return the number of bytes stored in {@code b}, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        init();
        return internalIn.read(b, off, len);
    }

    /**
     * Closes the wrapped stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException
    {
        internalIn.close();
    }
}
/**
 * A {@link Reader} over a byte stream that picks its character encoding from a
 * leading Unicode byte order mark (BOM). The BOM is consumed and never
 * returned as a character; when the stream has no BOM the caller-supplied
 * default encoding is used and no bytes are dropped.
 *
 * <p>Recognised marks, tested longest first:
 * <pre>
 *   00 00 FE FF = UTF-32BE
 *   FF FE 00 00 = UTF-32LE
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16BE
 *   FF FE       = UTF-16LE
 * </pre>
 *
 * <p>Because the four-byte marks are tested first, a UTF-16LE stream whose
 * first character after the BOM is U+0000 ({@code FF FE 00 00}) is treated as
 * UTF-32LE; the two cases cannot be told apart from the bytes alone.
 */
public class UnicodeReader extends Reader
{
    /** Length of the longest BOM; also the capacity of the push-back buffer. */
    private static final int BOM_SIZE = 4;

    private static final byte[] UTF32_BE_BOM = {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF32_LE_BOM = {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00};
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private InputStreamReader reader;
    private String encoding;
    private boolean isInited = false;

    /**
     * Demo: prints {@code data.txt} line by line, decoded according to its BOM
     * (UTF-8 when it has none).
     *
     * @param args ignored
     * @throws Exception if the file cannot be opened, read or decoded
     */
    public static void main(String[] args) throws Exception
    {
        UnicodeReader reader = new UnicodeReader(new FileInputStream("data.txt"), "UTF-8");
        BufferedReader br = new BufferedReader(reader);
        try
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
            }
        }
        finally
        {
            // Closing the outer reader closes the whole chain down to the file.
            br.close();
        }
    }

    /**
     * Inspects the start of {@code in} for a BOM and sets up decoding with the
     * resulting encoding. If that fails, {@code in} is closed before the error
     * is thrown.
     *
     * @param in              the byte stream to decode
     * @param defaultEncoding the encoding to use when the stream has no BOM
     * @throws ExceptionInInitializerError if the stream cannot be read or the
     *         chosen encoding is not supported; the triggering
     *         {@link IOException} is available through {@code getCause()}
     */
    public UnicodeReader(InputStream in, String defaultEncoding)
    {
        defaultEnc = defaultEncoding;
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        try
        {
            // NOTE: init() is protected and is invoked here, so an override in
            // a subclass runs before that subclass's own fields are set.
            init();
            reader = new InputStreamReader(internalIn, encoding);
        }
        catch (IOException e)
        {
            try
            {
                internalIn.close();
            }
            catch (IOException e1)
            {
                // Best effort: the failure reported to the caller is the
                // original one; the close failure is only printed.
                e1.printStackTrace();
            }
            // Same error type as before, but now carrying the cause instead
            // of a fixed message that hid what actually went wrong.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * @return the encoding chosen at construction time: the one indicated by
     *         the BOM, or the default encoding when no BOM was found
     */
    public String getEncoding()
    {
        return encoding;
    }

    /**
     * Reads up to {@value #BOM_SIZE} bytes from the start of the stream,
     * determines the encoding and pushes every non-BOM byte back so that the
     * decoder sees it. Calls after the first successful one do nothing.
     *
     * @throws IOException if reading from or pushing back into the stream fails
     */
    protected void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may return fewer bytes than requested even though
        // more follow, so keep reading until the buffer is full or the stream
        // ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length)
        {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0)
            {
                break;
            }
            n += count;
        }

        // Only the n bytes actually read take part in the comparison; the
        // zero padding of a short stream must never be mistaken for BOM bytes.
        int skip;
        if (startsWith(bom, n, UTF32_BE_BOM))
        {
            skip = UTF32_BE_BOM.length;
            encoding = "UTF-32BE";
        }
        else if (startsWith(bom, n, UTF32_LE_BOM))
        {
            skip = UTF32_LE_BOM.length;
            encoding = "UTF-32LE";
        }
        else if (startsWith(bom, n, UTF8_BOM))
        {
            skip = UTF8_BOM.length;
            encoding = "UTF-8";
        }
        else if (startsWith(bom, n, UTF16_BE_BOM))
        {
            skip = UTF16_BE_BOM.length;
            encoding = "UTF-16BE";
        }
        else if (startsWith(bom, n, UTF16_LE_BOM))
        {
            skip = UTF16_LE_BOM.length;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /** Returns true when the first {@code length} bytes of {@code data} begin with {@code prefix}. */
    private static boolean startsWith(byte[] data, int length, byte[] prefix)
    {
        if (length < prefix.length)
        {
            return false;
        }
        for (int i = 0; i < prefix.length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the decoder and, through it, the wrapped stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException
    {
        reader.close();
    }

    /**
     * Reads decoded characters into {@code cbuf}.
     *
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        return reader.read(cbuf, off, len);
    }
}
// Reference: http://koti.mbnet.fi/akini/java/unicodereader/