最标准的途径是检测文本最开头的几个字节(即 BOM,字节顺序标记)。开头字节与对应的 Charset/encoding 如下表:
EF BB BF UTF-8
FE FF UTF-16/UCS-2, big endian
FF FE UTF-16/UCS-2, little endian
FF FE 00 00 UTF-32/UCS-4, little endian.
00 00 FE FF UTF-32/UCS-4, big-endian.
// Detects a file's encoding from its byte order mark (BOM): reads up to the first
// four bytes of the file at `path` into `head`, then stores a label for the matching
// BOM in `code`. `code` stays "ANSI" when no known BOM is found.
int[] head = new int[4];
InputStream inputStream = new FileInputStream(path);
// try/finally guarantees the stream is closed even if read() throws, while keeping
// `inputStream` declared in the enclosing scope.
try {
    for (int i = 0; i < head.length; i++) {
        // read() returns -1 at end of stream; -1 never equals a BOM byte, so files
        // shorter than four bytes simply fail the longer comparisons below.
        head[i] = inputStream.read();
    }
} finally {
    inputStream.close();
}
String code = "ANSI";
if (head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf) {
    code = "UTF-8";
} else if (head[0] == 0xff && head[1] == 0xfe && head[2] == 0x0 && head[3] == 0x0) {
    // Must be tested before the UTF-16 LE BOM: FF FE is a prefix of FF FE 00 00.
    // NOTE(review): a UTF-16 LE file whose first character is U+0000 has the same
    // four leading bytes and will be reported as UTF-32 here.
    code = "UTF-32/ucs4, little endian";
} else if (head[0] == 0x0 && head[1] == 0x0 && head[2] == 0xfe && head[3] == 0xff) {
    code = "UTF-32/ucs4, big endian";
} else if (head[0] == 0xfe && head[1] == 0xff) {
    // FE FF: U+FEFF serialized most-significant byte first.
    code = "utf-16/ucs2, big endian";
} else if (head[0] == 0xff && head[1] == 0xfe) {
    // FF FE: U+FEFF serialized least-significant byte first.
    code = "utf-16/ucs2, little endian";
}