java 判断 文件字符集_【解决方案】Java获取文件字符集格式

package com.example.test;

import lombok.Cleanup;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import java.io.*;
import java.util.BitSet;

/**
 * Charset utilities: guesses the encoding of a file or stream and converts a
 * text file from one charset to another.
 *
 * <p>Detection is heuristic. A byte-order mark is honoured when present;
 * otherwise the content is validated as UTF-8 and anything that does not
 * validate is reported as GBK. No other encodings are considered.
 *
 * <p>Created 2020/9/24.
 *
 * @author miaoying
 */
@Slf4j
public class EncodeUtil {

    /** Number of bits examined per byte. */
    private static final int BYTE_SIZE = 8;

    /** Number of leading bytes inspected for a byte-order mark. */
    private static final int BOM_PROBE_SIZE = 3;

    /** Shortest multi-byte UTF-8 sequence (lead byte {@code 110xxxxx}). */
    private static final int MIN_MULTI_BYTE_LENGTH = 2;

    /** Longest UTF-8 sequence permitted by RFC 3629 (lead byte {@code 11110xxx}). */
    private static final int MAX_MULTI_BYTE_LENGTH = 4;

    /** Result for UTF-8 content without a BOM, or with a BOM when it is ignored. */
    public static final String CODE_UTF8 = "UTF-8";

    /**
     * Result for UTF-8 content that starts with a BOM. This is a label used by
     * this class; it is not a standard JDK charset name.
     */
    public static final String CODE_UTF8_BOM = "UTF-8_BOM";

    /** Result for content that is not valid UTF-8 and carries no BOM. */
    public static final String CODE_GBK = "GBK";

    /** Result for content starting with the bytes {@code FE FF}. */
    public static final String CODE_UNICODE = "Unicode";

    /** Result for content starting with the bytes {@code FF FE}. */
    public static final String CODE_UTF16 = "UTF-16";

    /**
     * Detects the encoding of a file.
     *
     * <p>The file is opened, scanned and closed again before this method returns.
     *
     * @param file      file to inspect
     * @param ignoreBom when {@code true}, UTF-8 with a BOM is reported as
     *                  {@link #CODE_UTF8} instead of {@link #CODE_UTF8_BOM}
     * @return one of the {@code CODE_*} constants of this class
     * @throws IOException if the file cannot be opened or read
     */
    public static String getEncode(File file, boolean ignoreBom) throws IOException {
        // The stream is owned by this method, so it must be closed here.
        @Cleanup
        BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
        return getEncode(bis, ignoreBom);
    }

    /**
     * Detects the encoding of the content of a buffered stream.
     *
     * <p>The first three bytes are checked for a byte-order mark. Without a BOM
     * the stream is rewound and scanned until end of stream or the first byte
     * sequence that is not valid UTF-8, so the stream is consumed by this call.
     * The stream is not closed. An empty stream is reported as
     * {@link #CODE_UTF8}.
     *
     * @param bis       stream positioned at the start of the content
     * @param ignoreBom when {@code true}, UTF-8 with a BOM is reported as
     *                  {@link #CODE_UTF8} instead of {@link #CODE_UTF8_BOM}
     * @return one of the {@code CODE_*} constants of this class
     * @throws IOException if reading from the stream fails
     */
    public static String getEncode(@NonNull BufferedInputStream bis, boolean ignoreBom) throws IOException {
        byte[] head = new byte[BOM_PROBE_SIZE];
        // The read limit must cover every byte read before reset() is called.
        bis.mark(head.length);
        // Bytes that could not be read stay zero and therefore never match a BOM.
        readUpTo(bis, head);

        String encodeType;
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            encodeType = CODE_UTF16;
        } else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            encodeType = CODE_UNICODE;
        } else if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            // UTF-8 with BOM
            encodeType = ignoreBom ? CODE_UTF8 : CODE_UTF8_BOM;
        } else {
            // No BOM: the probed bytes are content, so validate from the start.
            bis.reset();
            encodeType = isUTF8(bis) ? CODE_UTF8 : CODE_GBK;
        }

        log.info("encodeType : {}", encodeType);
        return encodeType;
    }

    /**
     * Fills {@code buffer} from {@code in}, stopping early at end of stream.
     *
     * @param in     stream to read from
     * @param buffer destination; positions past the returned count are untouched
     * @return number of bytes actually stored in {@code buffer}
     * @throws IOException if reading fails
     */
    private static int readUpTo(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = in.read(buffer, total, buffer.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Checks whether the remaining content of the stream is structurally valid
     * UTF-8. This is only meant to tell BOM-less UTF-8 apart from GBK.
     *
     * <p>Reads from the current position until end of stream or the first
     * invalid sequence. Pure ASCII content and an empty stream are both valid.
     *
     * @param bis stream positioned at the first byte to validate
     * @return {@code true} if every byte sequence is well-formed UTF-8
     * @throws IOException if reading fails
     */
    private static boolean isUTF8(@NonNull BufferedInputStream bis) throws IOException {
        int code;
        while ((code = bis.read()) != -1) {
            BitSet bitSet = convert2BitSet(code);
            // High bit clear means a single-byte (ASCII) character: nothing to check.
            if (bitSet.get(0) && !checkMultiByte(bis, bitSet)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Validates a multi-byte UTF-8 sequence whose lead byte has already been
     * read from the stream.
     *
     * <p>The number of leading one-bits in the lead byte gives the sequence
     * length. A length of 1 means the lead byte is itself a continuation byte,
     * and lengths above 4 are not UTF-8; both are rejected.
     *
     * @param bis    stream positioned right after the lead byte
     * @param bitSet bits of the lead byte, most significant bit at index 0
     * @return {@code true} if the lead byte is a legal lead and is followed by
     *         the required number of continuation bytes
     * @throws IOException if reading fails
     */
    private static boolean checkMultiByte(@NonNull BufferedInputStream bis, @NonNull BitSet bitSet) throws IOException {
        int count = getCountOfSequential(bitSet);
        if (count < MIN_MULTI_BYTE_LENGTH || count > MAX_MULTI_BYTE_LENGTH) {
            return false;
        }
        // The lead byte is already consumed, so count - 1 bytes remain.
        for (int i = 1; i < count; i++) {
            int next = bis.read();
            // End of stream in the middle of a sequence means truncated, invalid UTF-8.
            if (next == -1 || !checkUtf8Byte((byte) next)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks whether a byte is a UTF-8 continuation byte ({@code 10xxxxxx}).
     *
     * @param b byte to test
     * @return {@code true} if the two highest bits are {@code 1} and {@code 0}
     */
    private static boolean checkUtf8Byte(byte b) {
        BitSet bitSet = convert2BitSet(b);
        return bitSet.get(0) && !bitSet.get(1);
    }

    /**
     * Counts the consecutive set bits at the start of the bit set.
     *
     * @param bitSet bits of one byte, most significant bit at index 0
     * @return number of leading set bits, between 0 and {@link #BYTE_SIZE}
     */
    private static int getCountOfSequential(@NonNull BitSet bitSet) {
        int count = 0;
        while (count < BYTE_SIZE && bitSet.get(count)) {
            count++;
        }
        return count;
    }

    /**
     * Converts the low eight bits of an integer into a bit set.
     *
     * @param code value whose low eight bits are used
     * @return bit set in which index 0 is the most significant of the eight bits
     */
    private static BitSet convert2BitSet(int code) {
        BitSet bitSet = new BitSet(BYTE_SIZE);
        for (int i = 0; i < BYTE_SIZE; i++) {
            if (((code >> (BYTE_SIZE - i - 1)) & 0x1) == 1) {
                bitSet.set(i);
            }
        }
        return bitSet;
    }

    /**
     * Converts a text file from one charset to another.
     *
     * <p>The source is read completely and encoded before the target is
     * opened, so source and target may be the same file, and an unsupported
     * charset name never truncates the target. Every line is written followed
     * by the platform line separator, including the last one. Missing parent
     * directories of the target are created.
     *
     * @param oldFullFileName path of the source file
     * @param oldCharsetName  charset name used to decode the source file
     * @param newFullFileName path of the target file; backslashes are treated
     *                        as path separators
     * @param newCharsetName  charset name used to encode the target file
     * @throws IOException if a charset name is not supported, the source cannot
     *                     be read, the target directory cannot be created, or
     *                     the target cannot be written
     */
    public static void convert(String oldFullFileName, String oldCharsetName, String newFullFileName, String newCharsetName) throws IOException {
        log.info("the old file name is : {}, The oldCharsetName is : {}", oldFullFileName, oldCharsetName);
        log.info("the new file name is : {}, The newCharsetName is : {}", newFullFileName, newCharsetName);

        String content = readContent(oldFullFileName, oldCharsetName);
        // Encoding first surfaces a bad charset name before the target is touched.
        byte[] encoded = content.getBytes(newCharsetName);

        File target = new File(newFullFileName.replace("\\", "/"));
        // getParentFile() is null for a bare file name; nothing to create then.
        File dir = target.getParentFile();
        if (dir != null && !dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Unable to create directory: " + dir);
        }

        @Cleanup
        OutputStream out = new FileOutputStream(target);
        out.write(encoded);
    }

    /**
     * Reads a whole text file, normalising every line ending to the platform
     * line separator and terminating the last line with one as well.
     *
     * @param fileName    path of the file to read
     * @param charsetName charset name used to decode the file
     * @return decoded content of the file
     * @throws IOException if the charset is not supported or reading fails
     */
    private static String readContent(String fileName, String charsetName) throws IOException {
        // Opened on its own so it is closed even if the charset name is rejected.
        @Cleanup
        InputStream in = new FileInputStream(fileName);
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName));

        StringBuilder content = new StringBuilder();
        String lineSeparator = System.lineSeparator();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append(lineSeparator);
        }
        return content.toString();
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值