原理:UTF-8 的多字节字符有固定的位模式(首字节以 110、1110 或 11110 开头,后续字节均以 10 开头),
检查文件开头的若干字节是否符合该模式,即可判断该文本是否是 UTF-8 编码的。
参考:http://www.iteye.com/topic/398782
http://www.iteye.com/topic/191552
http://bbs.csdn.net/topics/240042395
代码:
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/**
 * Reads a text file whose encoding is not known in advance.
 *
 * <p>The first bytes of the file are sampled and checked against the UTF-8
 * byte patterns; if they match, the file is read as UTF-8, otherwise it is
 * read as GBK (see {@link #main(String[])}).
 */
public class ReadUnknowCodeTxt {

    /** Number of leading bytes sampled from the file for encoding detection. */
    private static final int SAMPLE_SIZE = 104;

    /**
     * Reads up to the first {@value #SAMPLE_SIZE} bytes of a file.
     *
     * @param filename path of the file to sample
     * @return an array of exactly {@value #SAMPLE_SIZE} bytes; positions past
     *         the end of the file (or all positions, if the file could not be
     *         read) are left as zero
     */
    public byte[] readTxtFile(String filename) {
        byte[] b = new byte[SAMPLE_SIZE];
        // try-with-resources closes the stream on every path.
        try (FileInputStream fis = new FileInputStream(filename)) {
            int filled = 0;
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the file ends.
            while (filled < b.length) {
                int n = fis.read(b, filled, b.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } catch (IOException e) {
            // Best effort: report the problem and return whatever was read.
            e.printStackTrace();
        }
        return b;
    }

    /**
     * Reads a whole text file using the given charset.
     *
     * @param filename path of the file to read
     * @param code     charset name understood by {@link InputStreamReader},
     *                 e.g. {@code "utf8"} or {@code "gbk"}
     * @return the file content with every line terminated by {@code "\r\n"};
     *         {@code null} if the file could not be opened or the charset is
     *         not supported; the content read so far if an I/O error occurs
     *         while reading
     */
    public String _readTxtFile(String filename, String code) {
        StringBuilder sb = null;
        // The stream is declared as its own resource so it is closed even when
        // the InputStreamReader constructor rejects the charset name.
        try (FileInputStream in = new FileInputStream(filename);
             BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in, code))) {
            sb = new StringBuilder();
            String tmp;
            while ((tmp = bufferedReader.readLine()) != null) {
                sb.append(tmp).append("\r\n");
            }
        } catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException too.
            e.printStackTrace();
        }
        return sb != null ? sb.toString() : null;
    }

    /**
     * Checks whether a byte sample is structurally valid UTF-8.
     *
     * <p>Each lead byte must announce a 1- to 4-byte sequence (lead bytes
     * {@code 0xC0}, {@code 0xC1} and {@code 0xF5}-{@code 0xFF} are rejected)
     * and must be followed by the announced number of continuation bytes of
     * the form {@code 10xxxxxx}. A sequence cut off by the end of the array is
     * accepted, because the sample may end in the middle of a character.
     * Second-byte range restrictions (overlong 3/4-byte forms, surrogates) are
     * not checked.
     *
     * @param c the bytes to examine; must not be {@code null}
     * @return {@code true} if every sequence in {@code c} matches the UTF-8
     *         pattern (an empty or pure-ASCII array yields {@code true})
     * @throws NullPointerException if {@code c} is {@code null}
     */
    public boolean isUTF8(byte[] c) {
        int i = 0;
        while (i < c.length) {
            int lead = c[i] & 0xFF; // mask to avoid sign extension of the byte
            int trailing;
            if (lead < 0x80) {
                trailing = 0; // 0xxxxxxx: single-byte (ASCII)
            } else if (lead >= 0xC2 && lead <= 0xDF) {
                trailing = 1; // 110xxxxx, excluding overlong C0/C1
            } else if ((lead & 0xF0) == 0xE0) {
                trailing = 2; // 1110xxxx
            } else if (lead >= 0xF0 && lead <= 0xF4) {
                trailing = 3; // 11110xxx, limited to code points <= U+10FFFF
            } else {
                return false; // stray continuation byte or illegal lead byte
            }
            for (int k = 1; k <= trailing; k++) {
                if (i + k >= c.length) {
                    // Sequence truncated by the end of the sample; everything
                    // seen so far was well-formed.
                    return true;
                }
                if ((c[i + k] & 0xC0) != 0x80) {
                    return false; // expected a 10xxxxxx continuation byte
                }
            }
            // Advance only after the whole sequence was validated, so a failed
            // match never shifts the position used by a later check.
            i += trailing + 1;
        }
        return true;
    }

    /**
     * Detects the encoding of a text file (UTF-8 or GBK) and prints its content.
     *
     * @param args optional; {@code args[0]} is the file path, defaulting to
     *             {@code D:\gbk.txt} when absent
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : "D:\\gbk.txt";
        String code = "utf8";
        ReadUnknowCodeTxt ruct = new ReadUnknowCodeTxt();
        if (ruct.isUTF8(ruct.readTxtFile(path))) {
            System.out.println("utf8");
        } else {
            System.out.println("gbk");
            code = "gbk";
        }
        String str = ruct._readTxtFile(path, code);
        System.out.println(str);
    }
}
测试了一些文本,暂时没发现问题