获取文件的编码是一个重要的任务,尤其是在处理不同语言环境的文本文件时。不同的编码方式(如UTF-8、GBK、ISO-8859-1等)可能导致文本内容的错误解析。因此,准确地确定文件的编码方式对于正确处理文件内容至关重要。
Java中有几种方法可以用来确定文件的编码:
1、可以使用 JDK 自带的 InputStreamReader 类来获取编码名称。需要注意的是,getEncoding() 返回的是该 Reader 解码时所使用的字符集(未显式指定字符集时即平台默认字符集),它并不会根据文件内容去检测文件的实际编码。具体代码如下:
import java.io.*;
/**
 * Prints the name of the character encoding that an {@link InputStreamReader}
 * uses when it is opened on a file without an explicit charset.
 *
 * <p>{@link InputStreamReader#getEncoding()} reports the charset the reader
 * decodes with. Because no charset is passed to the constructor here, that is
 * the platform default; the file's bytes are not inspected.
 */
public class Main {

    /**
     * Opens the hard-coded file path and prints the reader's encoding name.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String filePath = "path/to/file.txt";
        // try-with-resources closes both the reader and the underlying stream,
        // including when an exception is thrown part-way through.
        try (FileInputStream fis = new FileInputStream(filePath);
                InputStreamReader isr = new InputStreamReader(fis)) {
            String encoding = isr.getEncoding();
            System.out.println("File encoding: " + encoding);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2、可以通过读取文件的前几个字节来判断编码。很多文件会在开头写入字节顺序标记(BOM),常见的有:UTF-8 的 EF BB BF;UTF-16 的 FF FE(小端,UTF-16LE)或 FE FF(大端,UTF-16BE);UTF-32 的 00 00 FE FF(大端)或 FF FE 00 00(小端)等。对于没有 BOM 的文件,只能根据字节内容做启发式判断。注意下面的示例代码没有处理 UTF-32 的 BOM。具体代码如下:
import java.io.*;
/**
 * Guesses the character encoding of a file from its leading bytes and, when
 * those are inconclusive, from the first multi-byte sequence found in it.
 */
public class Main {

    /** Charset name returned when nothing more specific can be established. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Number of leading bytes inspected for a byte order mark. */
    private static final int PREFIX_LENGTH = 3;

    /**
     * Prints the guessed encoding of the hard-coded file path.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String filePath = "path/to/file.txt";
        String encoding = GetEncoding(new File(filePath));
        System.out.println(encoding);
    }

    /**
     * Guesses the encoding of {@code file}.
     *
     * <p>The first three bytes are compared against the UTF-16LE, UTF-16BE and
     * UTF-8 byte order marks and against three fixed ASCII prefixes. If none
     * matches, the file is scanned from the start: the first well-formed 3- or
     * 4-byte UTF-8 sequence yields {@code "UTF-8"}; the first byte that cannot
     * be part of UTF-8 ends the scan with the default. Well-formed 2-byte
     * sequences are skipped without deciding anything.
     *
     * <p>This is a heuristic. An empty file, a pure-ASCII file, and any file
     * that cannot be read all yield {@code "GBK"}.
     *
     * @param file the file to inspect
     * @return one of {@code "UTF-8"}, {@code "UTF-16LE"}, {@code "UTF-16BE"},
     *     {@code "windows-1251"} or {@code "GBK"}; never {@code null}
     */
    public static String GetEncoding(File file) {
        String charset = DEFAULT_CHARSET;
        // A single buffered stream replaces the two unbuffered ones; mark/reset
        // lets the content scan restart from the first byte of the file.
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            byte[] prefix = new byte[PREFIX_LENGTH];
            in.mark(PREFIX_LENGTH);
            if (readPrefix(in, prefix) > 0) {
                String known = charsetFromPrefix(prefix);
                if (known != null) {
                    charset = known;
                } else {
                    in.reset();
                    if (containsUtf8MultiByteSequence(in)) {
                        charset = "UTF-8";
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: any failure (missing file, null argument, I/O error)
            // is reported and the value determined so far is returned.
            e.printStackTrace();
        }
        return charset;
    }

    /** Fills {@code buffer} from {@code in} until it is full or the stream ends; returns the byte count. */
    private static int readPrefix(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = in.read(buffer, total, buffer.length - total);
            if (n == -1) {
                break;
            }
            total += n;
        }
        return total;
    }

    /** Maps the three leading bytes to a charset name, or returns {@code null} when no prefix matches. */
    private static String charsetFromPrefix(byte[] prefix) {
        int b0 = prefix[0] & 0xFF;
        int b1 = prefix[1] & 0xFF;
        int b2 = prefix[2] & 0xFF;

        if (b0 == 0xFF && b1 == 0xFE) {
            return "UTF-16LE";
        }
        if (b0 == 0xFE && b1 == 0xFF) {
            return "UTF-16BE";
        }
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // NOTE(review): the three prefixes below are plain ASCII ("\n[0",
        // "\r\n[", "[TI") and are not byte order marks. They are kept unchanged
        // for backward compatibility; presumably they were tuned to a specific
        // set of input files. Confirm before relying on them.
        if (b0 == 0x0A && b1 == 0x5B && b2 == 0x30) {
            return "UTF-8";
        }
        if (b0 == 0x0D && b1 == 0x0A && b2 == 0x5B) {
            return "GBK";
        }
        if (b0 == 0x5B && b1 == 0x54 && b2 == 0x49) {
            return "windows-1251";
        }
        return null;
    }

    /**
     * Scans {@code in} and reports whether a well-formed 3- or 4-byte UTF-8
     * sequence appears before any byte that is invalid in UTF-8.
     */
    private static boolean containsUtf8MultiByteSequence(InputStream in) throws IOException {
        int lead;
        while ((lead = in.read()) != -1) {
            if (lead < 0x80) {
                continue; // ASCII carries no evidence either way
            }
            if (lead < 0xC0) {
                return false; // continuation byte without a lead byte
            }
            if (lead < 0xE0) {
                // 2-byte sequence: must be well formed, but is not treated as
                // sufficient evidence for UTF-8, so scanning continues.
                if (!isContinuation(in.read())) {
                    return false;
                }
                continue;
            }
            if (lead < 0xF0) {
                return isContinuation(in.read()) && isContinuation(in.read());
            }
            if (lead <= 0xF4) {
                // 4-byte sequence (code points above U+FFFF, e.g. emoji).
                return isContinuation(in.read())
                        && isContinuation(in.read())
                        && isContinuation(in.read());
            }
            return false; // 0xF5..0xFF never occur in UTF-8
        }
        return false;
    }

    /** Returns whether {@code value} is a UTF-8 continuation byte (0x80..0xBF); -1 (end of stream) is not. */
    private static boolean isContinuation(int value) {
        return 0x80 <= value && value <= 0xBF;
    }
}
注意:以上代码仅针对常见的文件编码进行了判断,对于一些特殊的编码,可能需要使用更复杂的方法来判断。