package com.example.file.file;

/**
version: 1.1 / 2007-01-25
- changed BOM recognition ordering (longer boms first)
Original pseudocode : Thomas Weidenfeller
Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs:
00 00 FE FF = UTF-32, big-endian
FF FE 00 00 = UTF-32, little-endian
EF BB BF = UTF-8,
FE FF = UTF-16, big-endian
FF FE = UTF-16, little-endian
Win2k Notepad:
Unicode format = UTF-16LE
***/

import java.io.*;

/**
* Generic unicode textreader, which will use BOM mark
* to identify the encoding to be used. If BOM is not found
* then use a given default or system encoding.
//*/
/**
 * Character reader that inspects the first bytes of an input stream for a
 * Unicode byte-order mark (BOM) and decodes the remainder of the stream with
 * the charset the BOM announces. The BOM bytes themselves are consumed and
 * never handed to the caller. When no BOM is present, the caller-supplied
 * default encoding is used, or the platform default if that is {@code null}.
 *
 * <p>Detection is lazy: it happens on the first call to {@link #init()} or
 * {@link #read(char[], int, int)}.
 */
public class UnicodeReader extends Reader {

    /** Length in bytes of the longest BOM that is recognised (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * Recognised BOM byte sequences. Longer marks come first so that the
     * UTF-32LE mark (FF FE 00 00) is tried before its UTF-16LE prefix (FF FE).
     */
    private static final byte[][] BOM_BYTES = {
        {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF},
        {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00},
        {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF},
        {(byte) 0xFE, (byte) 0xFF},
        {(byte) 0xFF, (byte) 0xFE},
    };

    /** Charset names, index-aligned with {@link #BOM_BYTES}. */
    private static final String[] BOM_CHARSETS = {
        "UTF-32BE", "UTF-32LE", "UTF-8", "UTF-16BE", "UTF-16LE",
    };

    /** Raw byte source; pushback capacity lets non-BOM bytes be returned to it. */
    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    /** Encoding used when the stream carries no BOM; may be {@code null}. */
    String defaultEnc;

    /**
     * @param in inputstream to be read
     * @param defaultEnc default encoding if stream does not have
     *        BOM marker. Give NULL to use system-level default.
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the fallback encoding passed to the constructor, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized.
     * Call init() or read() method to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null} before initialization
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of a BOM are pushed back onto the stream; only BOM bytes are
     * skipped. Does nothing if the reader is already initialized.
     *
     * @throws IOException if reading from or pushing back to the stream fails,
     *         or if the selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        // A single read() may deliver fewer bytes than requested. Keep reading
        // only while the bytes seen so far could still grow into a longer BOM,
        // so streams that obviously carry no BOM are not blocked waiting for
        // four bytes.
        do {
            int got = internalIn.read(bom, n, bom.length - n);
            if (got <= 0) {
                break; // end of stream (or a source that makes no progress)
            }
            n += got;
        } while (n < bom.length && isPartialBom(bom, n));

        // Unicode BOM mark not found unless a candidate below matches.
        String encoding = defaultEnc;
        int bomLength = 0;
        for (int i = 0; i < BOM_BYTES.length; i++) {
            byte[] candidate = BOM_BYTES[i];
            // Only bytes that were really read may take part in the match; the
            // zero padding of a short read must not complete a UTF-32 mark.
            if (candidate.length <= n && matchesPrefix(bom, candidate, candidate.length)) {
                encoding = BOM_CHARSETS[i];
                bomLength = candidate.length;
                break;
            }
        }

        // Return everything after the BOM to the stream.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Closes the underlying stream. The stream is not probed for a BOM merely
     * in order to be closed.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters, initializing BOM detection on first use.
     *
     * @param cbuf destination buffer
     * @param off offset at which to start storing characters
     * @param len maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

    /** Tells whether the first {@code count} bytes of {@code data} equal those of {@code bom}. */
    private static boolean matchesPrefix(byte[] data, byte[] bom, int count) {
        for (int i = 0; i < count; i++) {
            if (data[i] != bom[i]) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the {@code n} bytes read so far are a proper prefix of some longer BOM. */
    private static boolean isPartialBom(byte[] data, int n) {
        for (byte[] candidate : BOM_BYTES) {
            if (candidate.length > n && matchesPrefix(data, candidate, n)) {
                return true;
            }
        }
        return false;
    }
}