java识别文件编码格式代码(无引用jar包,测试可行)

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Test {
	public static List<String> readFileToList(String path) {
		String codeString = codeString(path);
		boolean haveBom=true;
		if ("utf-8无bom".equals(codeString) || "gbk".equals(codeString)) {
			haveBom=false;
		}
		if ("utf-8无bom".equals(codeString)){
			codeString="utf-8";
		}
		List<String> list = new ArrayList<String>();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), codeString));
			String str = "";
			while ((str = reader.readLine()) != null) {
				if(haveBom){
					list.add(removeBom(str,codeString));
					haveBom=false;
				}else{
					list.add(str);
				}
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return list;
	}
	
	
	public static String getRefData(String key){
		Map<String, String> map=new HashMap<String, String>();
		map.put("UTF-8", "EFBBBF");
		map.put("Unicode", "FFFE");
		map.put("UTF-16BE", "FEFF");
		return map.get(key);
	}
	/**
	 * 在window下用记事本保存为utf8时会在文件开始处加入EFBBBF标识符,本方法可以判断并去掉这个标识符
	 * BOMs:
	 * 00 00 FE FF = UTF-32, big-endian
	 * FF FE 00 00 = UTF-32, little-endian
	 * EF BB BF = UTF-8,
	 * FE FF = UTF-16, big-endian
	 * FF FE = UTF-16, little-endian
	 * 
	 * @param line
	 * @return
	 * @throws UnsupportedEncodingException
	 */
	public static String removeBom(String line,String codeString) {
		byte[] allbytes = null;
		try {
			allbytes = line.getBytes(codeString);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		String start = "";
		for (int i = 0; i < allbytes.length; i++)
		{
			int tmp = allbytes[i];
			String hexString = Integer.toHexString(tmp);
			// 1个byte变成16进制的,只需要2位就可以表示了,取后面两位,去掉前面的符号填充
			if (hexString.length()<2) {
				continue;
			}
			hexString = hexString.substring(hexString.length() - 2);
			start += hexString.toUpperCase();
			if (start.equals(getRefData(codeString))) {
				break;
			}
		}
		if (start.equals(getRefData(codeString))) {
			try {
				return new String(Arrays.copyOfRange(allbytes, getRefData(codeString).length()/2, allbytes.length),codeString);
			} catch (UnsupportedEncodingException e) {
				e.printStackTrace();
			}
		}
		try {
			return new String(Arrays.copyOfRange(allbytes, 0, allbytes.length),codeString);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return line;
	}
	
	/**
	 * 判断文件的编码格式
	 * 目前能判断UTF-8(有或无BOM) Unicode UTF-16BE gbk五种类型
	 * @param fileName
	 * @return 文件编码格式
	 * @throws Exception
	 */
	public static String codeString(String fileName) {
		String code = null;
		try {
			BufferedInputStream bin = new BufferedInputStream(
					new FileInputStream(fileName));
			int p = (bin.read() << 8) + bin.read();
			switch (p) {
			case 0xefbb:
				code = "UTF-8";
				break;
			case 0xfffe:
				code = "Unicode";
				break;
			case 0xfeff:
				code = "UTF-16BE";
				break;
			default:
				{
					if (isUTF8(new File(fileName))) {
						code = "utf-8无bom";
					} else {
						code = "gbk";
					}
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return code;
	}
	
	/**
	 * 判断文件是无dom的utf8还是gbk
	 * @param file
	 * @return
	 */
	public static boolean isUTF8(File file){
	        byte [] buf = FileUtil.readFileToByteArray (file);
	        /*for (byte b : buf) {
				System.out.println(Integer.toHexString(b));
			}*/
	        List<String> readFileToList = FileUtil.readFileToList(file,"utf-8");
	        byte[] list=new byte[10];
	        boolean isFirst=true;
	        int count=0;
	        for (String string : readFileToList) {
	        	if (isFirst) {
	        		try {
						list=string.getBytes("utf-8");
					} catch (UnsupportedEncodingException e) {
						e.printStackTrace();
					}
	        		isFirst=false;
				}else{
					byte[] temp = null;
					try {
						temp = string.getBytes("utf-8");
					} catch (UnsupportedEncodingException e) {
						e.printStackTrace();
					}
					int index=list.length;
					list=Arrays.copyOf(list, list.length+temp.length);
					for (byte b : temp) {
						list[index++]=b;
					}
				}
	        	if (count<readFileToList.size()-1) {
	        		list=Arrays.copyOf(list, list.length+2);
		        	list[list.length-2]=0xd;
		        	list[list.length-1]=0xa;
				}
	        	count++;
			}
	        for (int i = 0; i < list.length; i++) {
	        	if (i>=buf.length-1) {
					break;
				}
	        	byte a = list[i];
				byte b = buf[i];
				if (a!=b) {
					return false;
				}
			}
		return true;
	}
	
	public static void main(String[] args) {
		//读取任意编码格式的文件
		readFileToList("文件路径");
	}
}


 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值