/**
 * Guesses the text encoding of {@code file}.
 *
 * <p>The first three bytes are checked for a UTF-16LE, UTF-16BE or UTF-8 byte order
 * mark. When none is present, up to the first 4096 bytes are scanned for the byte
 * sequences that encode the character U+7684 in GBK, UTF-8, UTF-16LE, UTF-16BE and
 * BIG5, and the encoding whose sequence occurs strictly more often than every other
 * one is chosen.
 *
 * <p>The file pointer is moved by this method and is not restored.
 *
 * @param file the file to inspect
 * @return the detected encoding; {@link Encoding#GBK} when the file is shorter than
 *         three bytes, when no candidate strictly dominates, or when reading fails
 */
public static Encoding determineEncoding(RandomAccessFile file) {
    Encoding enc = Encoding.GBK;
    try {
        file.seek(0);
        if (file.length() < 3) {
            return enc;
        }
        byte[] bom = new byte[3]; // byte order mark
        // readFully: a plain read() is allowed to return fewer bytes than requested.
        file.readFully(bom);
        int b0 = bom[0] & 0xFF;
        int b1 = bom[1] & 0xFF;
        int b2 = bom[2] & 0xFF;
        if (b0 == 0xFF && b1 == 0xFE) {
            enc = Encoding.UTF16LE;
        } else if (b0 == 0xFE && b1 == 0xFF) {
            enc = Encoding.UTF16BE;
        } else if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            enc = Encoding.UTF8;
        } else {
            // No BOM: fall back to a frequency heuristic over the head of the file.
            file.seek(0);
            byte[] bs = new byte[4096];
            int filled = 0;
            while (filled < bs.length) {
                int n = file.read(bs, filled, bs.length - filled);
                if (n < 0) {
                    break; // end of file
                }
                filled += n;
            }
            enc = detectByCharacterFrequency(bs, filled);
        }
    } catch (Exception ex) {
        // Best effort: any failure keeps whatever has been decided so far (GBK by default).
        // NOTE(review): the exception itself is not logged; consider passing ex to the logger.
        Log.e("File ERROR", "encoding detection failed.");
    }
    return enc;
}

/**
 * Counts, in the first {@code length} bytes of {@code sample}, the byte sequences that
 * encode U+7684 in each candidate encoding and returns the strict winner.
 *
 * @return the encoding with the strictly highest count, or {@link Encoding#GBK} on a tie
 *         (including the case where nothing matched)
 */
private static Encoding detectByCharacterFrequency(byte[] sample, int length) {
    int gbkCount = 0;
    int big5Count = 0;
    int utf16leCount = 0;
    int utf16beCount = 0;
    int utf8Count = 0;

    int i = 0;
    while (i < length) {
        int c0 = sample[i] & 0xFF;
        // -1 never equals a byte value, so patterns cannot match past the data read.
        int c1 = (i + 1 < length) ? (sample[i + 1] & 0xFF) : -1;
        int c2 = (i + 2 < length) ? (sample[i + 2] & 0xFF) : -1;

        if (c0 == 0xB5 && c1 == 0xC4) {
            ++gbkCount;
            i += 2;
        } else if (c0 == 0xE7 && c1 == 0x9A && c2 == 0x84) {
            ++utf8Count;
            i += 3;
        } else if (c0 == 0x84 && c1 == 0x76) {
            ++utf16leCount;
            i += 2;
        } else if (c0 == 0x76 && c1 == 0x84) {
            ++utf16beCount;
            i += 2;
        } else if (c0 == 0xAA && c1 == 0xBA) {
            ++big5Count;
            i += 2;
        } else {
            ++i;
        }
    }

    Encoding[] candidates = {
        Encoding.GBK, Encoding.UTF8, Encoding.UTF16LE, Encoding.UTF16BE, Encoding.BIG5
    };
    int[] counts = {gbkCount, utf8Count, utf16leCount, utf16beCount, big5Count};

    // A candidate wins only if its count is strictly greater than all the others.
    int best = 0;
    boolean tied = false;
    for (int k = 1; k < counts.length; ++k) {
        if (counts[k] > counts[best]) {
            best = k;
            tied = false;
        } else if (counts[k] == counts[best]) {
            tied = true;
        }
    }
    return tied ? Encoding.GBK : candidates[best];
}
/**
 * Text encodings known to the detector, each carrying its charset name and the number
 * of bytes that the character U+4E2D occupies when encoded with that charset.
 */
public enum Encoding {
    GBK("GBK"),
    BIG5("BIG5"),
    UTF8("UTF-8"),
    UTF16BE("UTF-16BE"),
    UTF16LE("UTF-16LE"),
    UNKNOWN("UNKNOWN");

    private final String name;
    private final int maxCharLength;

    private Encoding(String name) {
        this.name = name;
        int length = 0;
        try {
            // The probe character U+4E2D is written as a Unicode escape so the result
            // does not depend on the encoding used to compile this source file.
            length = "\u4e2d".getBytes(name).length;
        } catch (Exception ignored) {
            // Deliberately ignored: a charset name the runtime cannot resolve (such as
            // "UNKNOWN") leaves the length at 0 instead of failing enum initialization.
        }
        this.maxCharLength = length;
    }

    /**
     * @return the charset name passed to the constant's constructor
     */
    public String getName() {
        return name;
    }

    /**
     * @return the byte length of U+4E2D in this encoding, or 0 when the charset name
     *         could not be used for encoding
     */
    public int getMaxCharLength() {
        return maxCharLength;
    }
}