检查文件编码是什么格式的

最新推荐文章于 2024-07-06 14:58:04 发布

tanguang_honesty

最新推荐文章于 2024-07-06 14:58:04 发布

阅读量2.1k

点赞数

分类专栏：工作流&&IO流

本文链接：https://blog.csdn.net/tanguang_honesty/article/details/16861299

版权

工作流&&IO流专栏收录该内容

5 篇文章 0 订阅

订阅专栏

可以使用开源项目 cpdetector 来检测文件编码，项目网址是：http://cpdetector.sourceforge.net/。它的类库很小，只有 500K 左右。需要注意的是，cpdetector 基于统计学原理进行探测，不保证结果完全正确。利用该类库判定文本文件编码的代码如下：

/**
	 * Detects the character encoding of a local file with the third-party
	 * open-source cpdetector library.
	 * 
	 * @param path
	 *            file-system path of the file whose encoding is to be detected
	 * @return the name of the detected charset, or {@code null} when detection
	 *         throws or produces no charset
	 * @author tg
	 * @version 2012-7-12 14:05
	 */
	public static String getFileEncode(String path) {
		/*
		 * The proxy hands the detection work to the concrete detector
		 * instances registered through add(). According to the original
		 * author's notes, the first detector that returns a non-null result
		 * wins, the library needs antlr.jar, chardet.jar and cpdetector.jar,
		 * and detection is statistical, so the result is not guaranteed to
		 * be correct.
		 */
		CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();

		// ParsingDetector inspects HTML/XML style content; the constructor
		// flag controls verbose output of the detection process (false = quiet).
		proxy.add(new ParsingDetector(false));
		// JChardetFacade wraps Mozilla's JChardet and covers most files
		// (uses antlr.jar and chardet.jar).
		proxy.add(JChardetFacade.getInstance());
		// Extra detectors for plain ASCII and for the Unicode family.
		proxy.add(ASCIIDetector.getInstance());
		proxy.add(UnicodeDetector.getInstance());

		File target = new File(path);
		java.nio.charset.Charset detected = null;
		try {
			detected = proxy.detectCodepage(target.toURI().toURL());
		} catch (Exception ex) {
			// Best effort: report the problem and fall through to the null result.
			ex.printStackTrace();
		}
		return detected == null ? null : detected.name();
	}


	/**
	 * Detects the character encoding of the content behind a URL with the
	 * third-party open-source cpdetector library.
	 * 
	 * @param url
	 *            URL of the resource whose encoding is to be detected
	 * @return the name of the detected charset, or {@code null} when detection
	 *         throws or produces no charset
	 * @author tg
	 * @version 2012-7-12 14:05
	 */
	public static String getFileEncode(URL url) {
		/*
		 * The proxy hands the detection work to the concrete detector
		 * instances registered through add(). According to the original
		 * author's notes, the first detector that returns a non-null result
		 * wins, the library needs antlr.jar, chardet.jar and cpdetector.jar,
		 * and detection is statistical, so the result is not guaranteed to
		 * be correct.
		 */
		CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();

		// ParsingDetector inspects HTML/XML style content; the constructor
		// flag controls verbose output of the detection process (false = quiet).
		proxy.add(new ParsingDetector(false));
		// JChardetFacade wraps Mozilla's JChardet and covers most files
		// (uses antlr.jar and chardet.jar).
		proxy.add(JChardetFacade.getInstance());
		// Extra detectors for plain ASCII and for the Unicode family.
		proxy.add(ASCIIDetector.getInstance());
		proxy.add(UnicodeDetector.getInstance());

		java.nio.charset.Charset detected = null;
		try {
			// The URL is passed to the detector directly; the String overload
			// derives a URL from a file path first.
			detected = proxy.detectCodepage(url);
		} catch (Exception ex) {
			// Best effort: report the problem and fall through to the null result.
			ex.printStackTrace();
		}
		return detected == null ? null : detected.name();
	}


	/**
	 * Reads a text file line by line using the given encoding and prints each
	 * line to standard output as {@code line N: text}, where N is the 1-based
	 * line number.
	 * <p>
	 * I/O problems (missing file, unsupported encoding, read errors) are
	 * reported with a stack trace and end the method; they are not rethrown.
	 * 
	 * @param configFilePath
	 *            path of the file to read
	 * @param encoding
	 *            name of the charset used to decode the file; {@code null} or
	 *            an empty string falls back to {@code "utf-8"}
	 */
	public static void readFile(String configFilePath, String encoding) {
		// Earlier versions ignored the encoding argument and always decoded as
		// UTF-8, so keep UTF-8 as the fallback for callers passing null/empty.
		String charsetName = (encoding == null || encoding.length() == 0) ? "utf-8" : encoding;

		FileInputStream inputStream = null;
		BufferedReader reader = null;
		try {
			inputStream = new FileInputStream(configFilePath);
			reader = new BufferedReader(new InputStreamReader(inputStream, charsetName));

			String currentLine;
			int lineNumber = 1;
			// readLine() returns null once the end of the file is reached.
			while ((currentLine = reader.readLine()) != null) {
				System.out.println("line " + lineNumber + ": " + currentLine);
				lineNumber++;
			}
		} catch (IOException e) {
			// Also covers FileNotFoundException and UnsupportedEncodingException,
			// which are both IOException subclasses.
			e.printStackTrace();
		} finally {
			try {
				if (reader != null) {
					// Closing the outermost reader closes the wrapped stream too.
					reader.close();
				} else if (inputStream != null) {
					// The reader was never built (e.g. unsupported encoding).
					inputStream.close();
				}
			} catch (IOException ignored) {
				// Nothing useful can be done when closing fails after reading.
			}
		}
	}


	/**
	 * Demo driver: prints the detected encoding of a sample file on disk and
	 * then of a resource looked up relative to {@code TestEncoding}.
	 * 
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		// Other sample inputs the author tried:
		//   "E:\\ftpshare\\ANZBS20131113.txt.backup"
		//   "E:\\ftpshare\\测试编码\\ANZBS20131113.txt"
		//   "D:\\output\\ANZBS20131113.txt"
		String samplePath = "D:\\input\\ANZBS20131113.txt";
		System.out.println(getFileEncode(samplePath));

		URL resource = TestEncoding.class.getResource("../../../mule-config.xml" );
		try {
			System.out.println(getFileEncode(resource));
		} catch (Exception e) {
			// Keep the demo running to completion; just report the failure.
			e.printStackTrace();
		}
	}

另外，下面给出批量将 GBK 编码的文件转换为 UTF-8 的实现方式，具体代码如下：

/**
 * Batch converter that re-encodes GBK text files as UTF-8, either for a single
 * file or recursively for a whole directory tree.
 */
public class FileGBK2UTF8 {
	/**
	 * Command-line entry point.
	 * 
	 * @param args
	 *            optional: {@code args[0]} is the source file or directory and
	 *            {@code args[1]} the target; when omitted the original
	 *            defaults {@code D:\input} and {@code D:\output} are used
	 */
	public static void main(String[] args) {
		// Source to convert; defaults keep the behaviour of running without arguments.
		String fromPath = (args != null && args.length > 0) ? args[0] : "D:\\input";
		// Destination of the converted files.
		String toPath = (args != null && args.length > 1) ? args[1] : "D:\\output";

		info("start transform [from path]={0} [to path]={1}", fromPath, toPath);

		// Walks the source recursively and converts every file found.
		transform(fromPath, toPath);
	}

	/**
	 * Converts a file, or every file below a directory, from GBK to UTF-8 and
	 * writes the result to the corresponding location under the target path.
	 * 
	 * @param fromPath
	 *            source file or directory
	 * @param toPath
	 *            target file or directory
	 * @return {@code true} when everything was converted; {@code false} when
	 *         the source does not exist, a directory cannot be listed, or any
	 *         file could not be read or written
	 */
	public static boolean transform(String fromPath, String toPath) {
		if (fromPath == null || toPath == null) {
			info("转换文件路径错误！");
			return false;
		}
		File source = new File(fromPath);
		if (!source.exists()) {
			info("转换文件路径错误！");
			return false;
		}

		info("frompath is [{0}], topath is [{1}]", fromPath, toPath);

		// A plain file is converted directly and ends the recursion.
		if (source.isFile()) {
			byte[] raw = fileToBytes(fromPath);
			if (raw == null) {
				// Reading failed (already reported); do not replace the
				// target with an empty file.
				return false;
			}
			String content = convEncoding(raw, "gbk", "utf-8");
			return saveFileUtf8(toPath, content);
		}

		File[] children = source.listFiles();
		if (children == null) {
			// listFiles() yields null when the directory cannot be read.
			return false;
		}

		// Keep going after a failure so one bad file does not block the
		// rest, but remember it for the overall result.
		boolean allConverted = true;
		for (File child : children) {
			String childName = child.getName();
			if (!transform(fromPath + "/" + childName, toPath + "/" + childName)) {
				allConverted = false;
			}
		}
		return allConverted;
	}

	/**
	 * Writes text to a file using UTF-8 (without a BOM), replacing any existing
	 * file. Missing parent directories are created first.
	 * 
	 * @param fileName
	 *            path of the file to write; may be a bare file name
	 * @param content
	 *            text to write
	 * @return {@code true} when the file was written and closed successfully;
	 *         {@code false} for a null/empty file name, null content, or any
	 *         I/O failure
	 */
	public static boolean saveFileUtf8(String fileName, String content) {
		if (fileName == null || fileName.length() == 0 || content == null) {
			return false;
		}

		// Normalise '\' to '/' so the parent directory can be located.
		String normalized = fileName.replace('\\', '/');
		int lastSlash = normalized.lastIndexOf('/');
		if (lastSlash > 0) {
			// A bare file name (no '/') has no parent directory to create.
			createPath(normalized.substring(0, lastSlash));
		}

		FileOutputStream out = null;
		boolean written = false;
		try {
			// Opening the stream creates the file or truncates an existing one.
			out = new FileOutputStream(new File(normalized));
			// To emit a UTF-8 BOM, write the bytes EF BB BF before the content.
			out.write(content.getBytes("UTF-8"));
			out.flush();
			written = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
					written = false;
				}
			}
		}
		return written;
	}

	/**
	 * Reads the complete content of a file into a byte array.
	 * 
	 * @param fileName
	 *            path of the file to read
	 * @return the file content, or {@code null} when the name is null or the
	 *         file cannot be read
	 */
	public static byte[] fileToBytes(String fileName) {
		if (fileName == null) {
			return null;
		}
		FileInputStream in = null;
		try {
			in = new FileInputStream(new File(fileName));
			ByteArrayOutputStream buffer = new ByteArrayOutputStream();

			byte[] chunk = new byte[8192];
			int read;
			while ((read = in.read(chunk)) != -1) {
				buffer.write(chunk, 0, read);
			}
			return buffer.toByteArray();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Ensures that a directory exists, creating it and any missing parents.
	 * 
	 * @param filePath
	 *            directory path; '\' and '/' are both accepted as separators
	 * @return {@code true} when the directory exists afterwards;
	 *         {@code false} for a null/empty path or when it could not be
	 *         created
	 */
	public static boolean createPath(String filePath) {
		if (filePath == null || filePath.length() == 0) {
			return false;
		}
		File directory = new File(filePath.replace('\\', '/'));
		// mkdirs() reports false when the directory is already there, so
		// fall back to an existence check.
		return directory.mkdirs() || directory.isDirectory();
	}

	/**
	 * Extracts the file name (the part after the last separator) from a path.
	 * 
	 * @param path
	 *            file path including the file name
	 * @return the file name, the whole input when it contains no separator, or
	 *         an empty string for null/empty input
	 */
	public static String getFileName(String path) {
		if (path == null || path.length() == 0) {
			return "";
		}
		String normalized = path.replace('\\', '/');
		// lastIndexOf yields -1 without a separator, so +1 starts at index 0.
		return normalized.substring(normalized.lastIndexOf('/') + 1);
	}

	/**
	 * Decodes bytes with one charset and returns the text after a round trip
	 * through another charset.
	 * 
	 * @param value
	 *            bytes to decode
	 * @param oldCharset
	 *            name of the charset the bytes are encoded in
	 * @param newCharset
	 *            name of the charset the text is re-encoded with
	 * @return the converted text, or {@code null} when an argument is null or
	 *         a charset name is not supported
	 */
	public static String convEncoding(byte[] value, String oldCharset,
			String newCharset) {
		if (value == null || oldCharset == null || newCharset == null) {
			return null;
		}
		try {
			String decoded = new String(value, oldCharset);
			// The encode/decode round trip through newCharset is kept from
			// the original implementation, so characters the target charset
			// cannot represent are handled by its encoder rather than
			// passed through untouched.
			return new String(decoded.getBytes(newCharset), newCharset);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Prints a message to standard output after formatting it with
	 * {@link MessageFormat}.
	 * 
	 * @param message
	 *            message pattern, e.g. with {@code {0}} placeholders
	 * @param params
	 *            values for the placeholders
	 */
	private static void info(String message, Object... params) {
		System.out.println(MessageFormat.format(message, params));
	}
}