java 获取文件编码

最新推荐文章于 2024-07-03 02:34:55 发布

weixin_58494422

最新推荐文章于 2024-07-03 02:34:55 发布

阅读量905

点赞数 14

分类专栏： develop 文章标签： java jvm 开发语言

本文链接：https://blog.csdn.net/weixin_58494422/article/details/139510091

版权

develop 专栏收录该内容

33 篇文章 0 订阅

订阅专栏

获取文件的编码是一个重要的任务，尤其是在处理不同语言环境的文本文件时。不同的编码方式（如UTF-8、GBK、ISO-8859-1等）可能导致文本内容的错误解析。因此，准确地确定文件的编码方式对于正确处理文件内容至关重要。

Java中有几种方法可以用来确定文件的编码：

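1、可以使用 JDK 自带的 InputStreamReader 类的 getEncoding() 方法。需要注意：在未显式指定字符集时，该方法返回的是这个 Reader 解码时所使用的字符集（即 JVM 的默认字符集），并不会根据文件内容进行检测，因此它不能用来判断文件的真实编码。具体代码如下：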

import java.io.*;

/**
 * Prints the charset name reported by an {@link InputStreamReader} opened on a file.
 *
 * <p>The reader is constructed without an explicit charset, so
 * {@code getEncoding()} reports the charset the reader itself decodes with
 * (the JVM default charset). The bytes of the file are never inspected here,
 * so the printed value is not a detection of the file's actual encoding.
 */
public class Main {

    /**
     * Opens the sample path, prints the reader's charset name, and closes the streams.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String filePath = "path/to/file.txt";

        // try-with-resources closes both streams on every path, including failures.
        try (FileInputStream fis = new FileInputStream(filePath);
             InputStreamReader isr = new InputStreamReader(fis)) {
            String encoding = isr.getEncoding();
            System.out.println("File encoding: " + encoding);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

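2、可以通过读取文件的前几个字节来判断编码。常见的编码标识符是字节顺序标记（BOM），例如 UTF-8 的字节序列（EF BB BF）、UTF-16 的字节序列（FF FE 或 FE FF）、UTF-32 的字节序列（00 00 FE FF 或 FF FE 00 00）等。注意：下面的示例只读取前 3 个字节，因此不会识别 UTF-32（以 FF FE 00 00 开头的文件会被判定为 UTF-16LE）；没有 BOM 时则通过扫描文件内容中的 UTF-8 多字节序列来猜测，结果只是启发式的估计。具体代码如下：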

import java.io.*;

/**
 * Guesses the charset of a file from its leading bytes and, failing that,
 * from the first multi-byte sequence found in its content.
 *
 * <p>The result is a heuristic guess, not a guarantee.
 */
public class Main {

    /** Charset name returned when nothing more specific is recognised. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Number of leading bytes inspected for a signature. */
    private static final int SIGNATURE_LENGTH = 3;

    /**
     * Prints the guessed charset of the sample path.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String filePath = "path/to/file.txt";
        String encoding = GetEncoding(new File(filePath));
        System.out.println(encoding);
    }

    /**
     * Guesses the charset name of {@code file}.
     *
     * <p>Order of checks:
     * <ol>
     *   <li>An empty file yields {@code "GBK"}.</li>
     *   <li>The first three bytes are matched against known signatures
     *       (UTF-16LE/BE and UTF-8 byte order marks, plus three fixed
     *       three-byte prefixes).</li>
     *   <li>Otherwise the content is scanned from the start; the first
     *       well-formed three-byte UTF-8 sequence yields {@code "UTF-8"},
     *       and any byte that breaks the scan leaves the result at
     *       {@code "GBK"}.</li>
     * </ol>
     *
     * <p>This method is best-effort: if the file cannot be read, the stack
     * trace is printed and the value determined so far (normally
     * {@code "GBK"}) is returned instead of propagating the failure.
     *
     * @param file the file to inspect
     * @return the guessed charset name, never {@code null}
     */
    public static String GetEncoding(File file) {
        String charset = DEFAULT_CHARSET;

        // One buffered stream serves both the signature check and the content
        // scan, and try-with-resources closes it on every exit path.
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            byte[] head = new byte[SIGNATURE_LENGTH];
            in.mark(SIGNATURE_LENGTH);
            int filled = readFully(in, head);

            if (filled > 0) {
                // Bytes beyond 'filled' stay zero, so short files simply fail
                // to match the longer signatures.
                String bySignature = charsetFromSignature(head);
                if (bySignature != null) {
                    charset = bySignature;
                } else {
                    // Rewind so the scan starts at the first byte of the file.
                    in.reset();
                    if (hasUtf8ThreeByteSequence(in)) {
                        charset = "UTF-8";
                    }
                }
            }
        } catch (IOException | RuntimeException e) {
            // Deliberately best-effort: report the problem and fall back to
            // whatever has been determined so far.
            e.printStackTrace();
        }

        return charset;
    }

    /** Reads until {@code buffer} is full or the stream ends; returns the number of bytes read. */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int n = in.read(buffer, filled, buffer.length - filled);
            if (n == -1) {
                break;
            }
            filled += n;
        }
        return filled;
    }

    /** Maps the first three bytes to a charset name, or returns {@code null} when no signature matches. */
    private static String charsetFromSignature(byte[] head) {
        int b0 = head[0] & 0xFF;
        int b1 = head[1] & 0xFF;
        int b2 = head[2] & 0xFF;

        if (b0 == 0xFF && b1 == 0xFE) {
            return "UTF-16LE";
        }
        if (b0 == 0xFE && b1 == 0xFF) {
            return "UTF-16BE";
        }
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }
        // The remaining prefixes are not byte order marks; they are fixed
        // byte patterns kept from the original heuristic.
        // NOTE(review): presumably tied to specific input files - confirm.
        if (b0 == 0x0A && b1 == 0x5B && b2 == 0x30) { // LF '[' '0'
            return "UTF-8";
        }
        if (b0 == 0x0D && b1 == 0x0A && b2 == 0x5B) { // CR LF '['
            return "GBK";
        }
        if (b0 == 0x5B && b1 == 0x54 && b2 == 0x49) { // '[' 'T' 'I'
            return "windows-1251";
        }
        return null;
    }

    /**
     * Scans the stream and reports whether the first decisive multi-byte
     * sequence is a well-formed three-byte UTF-8 sequence.
     *
     * <p>ASCII bytes and well-formed two-byte sequences are skipped because
     * they do not decide the question; any other byte ends the scan with
     * {@code false}.
     */
    private static boolean hasUtf8ThreeByteSequence(InputStream in) throws IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b < 0x80) {
                continue; // ASCII byte: keep looking
            }
            if (b >= 0xC0 && b <= 0xDF) {
                // Two-byte lead: a valid continuation is inconclusive, keep looking.
                if (isContinuation(in.read())) {
                    continue;
                }
                return false;
            }
            if (b >= 0xE0 && b <= 0xEF) {
                // Three-byte lead: both following bytes must be continuations.
                return isContinuation(in.read()) && isContinuation(in.read());
            }
            // Stray continuation byte (0x80-0xBF) or a byte >= 0xF0.
            return false;
        }
        return false;
    }

    /** Returns whether {@code b} is a UTF-8 continuation byte (0x80-0xBF); end-of-stream (-1) is not. */
    private static boolean isContinuation(int b) {
        return b >= 0x80 && b <= 0xBF;
    }
}