java 自动判断文件编码_(转)自动判断文本文件编码来读取文本文件内容(.net版本和java版本)...

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;

import java.io.FileInputStream;

import java.util.BitSet;

public class EncodeUtils {

private static final Logger logger = LoggerFactory.getLogger(EncodeUtils.class);

private static final int BYTE_SIZE = 8;

private static final String CODE_UTF8 = "UTF-8";

private static final String CODE_UTF16 = "UTF-16";//Unicode

private static final String CODE_UTF16LE = "UTF-16LE";//Unicode big endian

private static final String CODE_GBK = "GBK"; //ABSU

/**

* 通过文件全名称获取编码集名称

*/

public static String getEncode(String fullFileName) throws Exception {

BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName));

return getEncode(bis, CODE_GBK);

}

/**

* 通过文件全名称获取编码集名称

*/

public static String getEncode(String fullFileName, String defaultEncoding) throws Exception {

BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName));

return getEncode(bis, defaultEncoding);

}

/**

* 通过文件缓存流获取编码集名称,文件流必须为未曾

*

* @param bis 文件流

*/

public static String getEncode(BufferedInputStream bis, String defaultEncoding) throws Exception {

bis.mark(0);

String encodeType;

byte[] head = new byte[3];

bis.read(head);

if (head[0] == -1 && head[1] == -2 && head[2] == (byte) 0x41) {

encodeType = CODE_UTF16;

} else if (head[0] == -2 && head[1] == -1 && head[2] == 0) {

//encodeType = "Unicode";

encodeType = CODE_UTF16LE;

} else if (head[0] == -17 && head[1] == -69 && head[2] == -65) {

//带BOM的UTF8 (CODE_UTF8_BOM)

encodeType = CODE_UTF8;

} else {

if (isUTF8(bis)) {

encodeType = CODE_UTF8;

} else {

encodeType = defaultEncoding;

}

}

return encodeType;

}

/**

* 是否是无BOM的UTF8格式,不判断常规场景,只区分无BOM UTF8和GBK

*/

private static boolean isUTF8(BufferedInputStream bis) throws Exception {

bis.reset();

//读取第一个字节

int code = bis.read();

do {

BitSet bitSet = convert2BitSet(code);

//判断是否为单字节

if (bitSet.get(0)) {//多字节时,再读取N个字节

if (!checkMultiByte(bis, bitSet)) {//未检测通过,直接返回

return false;

}

}

code = bis.read();

} while (code != -1);

return true;

}

/**

* 检测多字节,判断是否为utf8,已经读取了一个字节

*/

private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception {

int count = getCountOfSequential(bitSet);

byte[] bytes = new byte[count - 1];//已经读取了一个字节,不能再读取

bis.read(bytes);

for (byte b : bytes) {

if (!checkUtf8Byte(b)) {

return false;

}

}

return true;

}

/**

* 检测bitSet中从开始有多少个连续的1

*/

private static int getCountOfSequential(BitSet bitSet) {

int count = 0;

for (int i = 0; i < BYTE_SIZE; i++) {

if (bitSet.get(i)) {

count++;

} else {

break;

}

}

return count;

}

/**

* 检测单字节,判断是否为utf8

*/

private static boolean checkUtf8Byte(byte b) throws Exception {

BitSet bitSet = convert2BitSet(b);

return bitSet.get(0) && !bitSet.get(1);

}

/**

* 将整形转为BitSet

*/

private static BitSet convert2BitSet(int code) {

BitSet bitSet = new BitSet(BYTE_SIZE);

for (int i = 0; i < BYTE_SIZE; i++) {

int tmp3 = code >> (BYTE_SIZE - i - 1);

int tmp2 = 0x1 & tmp3;

if (tmp2 == 1) {

bitSet.set(i);

}

}

return bitSet;

}

public static void main(String[] args) {

String filePath = "C:\\110025.txt";

try {

String encoding = getEncode(filePath);

System.out.println(encoding);

} catch (Exception ex) {

logger.warn("文件检测编码出错!", ex);

}

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值