读取文件，并根据文件开头的 BOM（字节顺序标记）判断文件编码格式。
FileInputStream 本身不支持 mark、reset；需要用 BufferedInputStream 包装之后才支持 mark、reset 功能。
package sample.test.name;
import df.util.Util;
import df.util.type.StringUtil;
import df.util.type.SysLog;
import java.io.*;
import java.util.Arrays;
/**
 * Created by andrew on 2015/7/12.
 * <p>
 * Command-line demo that lists the {@code .txt} files of a directory, guesses
 * each file's character encoding from its byte-order mark (BOM) and logs the
 * decoded content.
 * <p>
 * Recognized BOMs: {@code FF FE} (UTF-16LE), {@code FE FF} (UTF-16BE) and
 * {@code EF BB BF} (UTF-8). Files without one of these BOMs are decoded with
 * {@link #DEFAULT_CHARSET}.
 */
public class NameApp {
    private static final String TAG = Util.toTAG(NameApp.class);

    /** Directory scanned when no command-line argument is given. */
    private static final String DEFAULT_DIR = "C:\\Users\\andrew\\name";

    /** Charset assumed for files that carry no recognized BOM. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Length in bytes of the longest BOM that is checked (UTF-8). */
    private static final int MAX_BOM_LENGTH = 3;

    /** Number of bytes requested per read; each chunk is logged as hex. */
    private static final int CHUNK_SIZE = 10;

    /**
     * Scans a directory for {@code .txt} files and logs each file's detected
     * charset and decoded content.
     *
     * @param args optional; {@code args[0]} is the directory to scan. When
     *             absent, {@link #DEFAULT_DIR} is used.
     */
    public static void main(String[] args) {
        String path = (null != args && args.length > 0) ? args[0] : DEFAULT_DIR;
        File dir = new File(path);
        if (!dir.isDirectory()) {
            SysLog.v(TAG, " NOT A DIRECTORY = ", path);
            return;
        }

        File[] files = dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File pathname) {
                // Only regular files: a directory named "*.txt" cannot be opened as a stream.
                return pathname.isFile()
                        && pathname.getName().toLowerCase().endsWith(".txt");
            }
        });
        // listFiles returns null (not an empty array) when an I/O error occurs.
        if (null == files) {
            SysLog.v(TAG, " CANNOT LIST = ", path);
            return;
        }

        for (File f : files) {
            dumpFile(f);
        }
    }

    /**
     * Detects the charset of one file from its BOM, then logs the raw bytes
     * (chunk by chunk, as hex) and the decoded text. Any exception is logged
     * and swallowed so that the remaining files are still processed.
     *
     * @param f the file to read
     */
    private static void dumpFile(File f) {
        InputStream reader = null;
        try {
            // A bare FileInputStream reports markSupported() == false; the buffered wrapper adds mark/reset.
            reader = new BufferedInputStream(new FileInputStream(f));
            boolean isSupport = reader.markSupported();
            SysLog.v(TAG, " IS SUPPORT = ", isSupport);

            // The read limit must cover every byte read before reset().
            reader.mark(MAX_BOM_LENGTH);
            byte[] head = new byte[MAX_BOM_LENGTH];
            int read = readFully(reader, head);

            // The leading bytes of the file, compared against the known BOM signatures.
            String charset = DEFAULT_CHARSET;
            int bomLength = 0;
            if (read >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                charset = "UTF-16LE";
                bomLength = 2;
            } else if (read >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                charset = "UTF-16BE";
                bomLength = 2;
            } else if (read >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
                charset = "UTF-8";
                bomLength = 3;
            }

            // Rewind, then consume exactly the BOM. A UTF-16 BOM is only two
            // bytes, so the third probed byte is content and must not be lost.
            reader.reset();
            for (int i = 0; i < bomLength; i++) {
                reader.read();
            }

            // Collect the raw bytes first and decode once at the end: decoding
            // chunk by chunk would split multi-byte characters at chunk borders.
            ByteArrayOutputStream content = new ByteArrayOutputStream();
            byte[] chunk = new byte[CHUNK_SIZE];
            int len;
            while ((len = reader.read(chunk, 0, chunk.length)) != -1) {
                // Log only the bytes actually read, not the unused tail of the buffer.
                SysLog.v(TAG, " LINE=", StringUtil.toHexString(Arrays.copyOf(chunk, len)));
                content.write(chunk, 0, len);
            }

            String text = content.toString(charset);
            SysLog.v(TAG,"filename=",f.getName(), charset,"end=", text.length(),text);
        } catch (Exception e) {
            SysLog.v(TAG, ", e=", e.getMessage());
            e.printStackTrace();
        } finally {
            if (null != reader) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads from {@code in} until {@code target} is full or the stream ends.
     *
     * @param in     the stream to read from
     * @param target the buffer to fill from index 0
     * @return the number of bytes stored in {@code target}; less than
     *         {@code target.length} only if the end of the stream was reached
     * @throws IOException if the underlying read fails
     */
    private static int readFully(InputStream in, byte[] target) throws IOException {
        int total = 0;
        while (total < target.length) {
            int n = in.read(target, total, target.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }
}