一、相关知识说明:
1. txt默认的选项是ANSI,即GBK编码
2. txt文本文档有四种编码选项:ANSI、Unicode、Unicode big endian、UTF-8
3. 我们在读取txt文件时可能不知道其编码格式,所以需要动态判断获取txt文件编码进而避免读取乱码问题
二、编码格式说明:
ANSI: 无格式定义
Unicode: 前两个字节为FFFE Unicode文档以0xFFFE开头
Unicode big endian: 前两字节为FEFF
UTF-8: 前三个字节为EFBBBF,即带BOM的UTF-8文档以0xEFBBBF开头
三、解决方案:
用程序取出前几个字节并进行判断,然后根据相应的编码读取文档。
四、java编码与txt编码对应
java | txt |
Unicode | Unicode big endian |
utf-8 | utf-8 |
utf-16 | Unicode |
gb2312 | ANSI |
五、代码示例:
import java.io.*;
/**
* Created by glin on 2018/7/11 0011.
*/
public class standard {

    /** The byte-order mark as it appears in decoded text (U+FEFF). */
    private static final char BOM_CHAR = '\uFEFF';

    /**
     * Runs the conversion on a fixed pair of input/output paths and prints the
     * status string returned by {@link #readTxt(String, String)}.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; {@code readTxt} itself reports
     *                   failures through its return value
     */
    public static void main(String[] args) throws Exception {
        String inputPath = "F:\\备用\\省市区划\\街道.txt";
        String outputPath = "F:\\备用\\省市区划\\街道_ns.txt";
        String content = readTxt(inputPath, outputPath);
        System.out.println(content);
    }

    /**
     * Reads a plain text file line by line using the encoding detected by
     * {@link #getCode(String)}, trims each line, appends {@code " ns"} to it and
     * appends the result (CRLF-terminated, UTF-8 encoded) to the output file.
     *
     * <p>The output file is opened in append mode, so repeated calls accumulate.
     * A leading byte-order mark on the first line is dropped so that it does not
     * end up in the output or prevent trimming of the first line.
     *
     * @param path       path of the text file to read
     * @param outputPath path of the file the transformed lines are appended to
     * @return {@code "ok"} on success, otherwise a message naming the file that
     *         could not be processed
     */
    public static String readTxt(String path, String outputPath) {
        try {
            String code = getCode(path);
            // Each stream is declared as its own resource so that it is closed even
            // when a later constructor (e.g. an unsupported charset) throws.
            try (InputStream is = new FileInputStream(path);
                 InputStreamReader isr = new InputStreamReader(is, code);
                 BufferedReader br = new BufferedReader(isr);
                 FileOutputStream writerStream = new FileOutputStream(outputPath, true);
                 BufferedWriter writer =
                         new BufferedWriter(new OutputStreamWriter(writerStream, "UTF-8"))) {
                boolean firstLine = true;
                String str;
                while ((str = br.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        // The UTF-8 decoder keeps the BOM as U+FEFF, which trim()
                        // does not treat as whitespace; remove it explicitly.
                        if (!str.isEmpty() && str.charAt(0) == BOM_CHAR) {
                            str = str.substring(1);
                        }
                    }
                    writer.write(str.trim() + " " + "ns");
                    writer.write("\r\n");
                }
            }
            return "ok";
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("读取文件:" + path + "失败!");
            return "读取文件:" + path + "失败!";
        }
    }

    /**
     * Guesses the charset of a text file from its leading byte-order mark and
     * prints the detected name to standard output.
     *
     * <ul>
     *   <li>{@code FF FE} gives {@code "UTF-16"}</li>
     *   <li>{@code FE FF} gives {@code "Unicode"}</li>
     *   <li>{@code EF BB BF} gives {@code "UTF-8"}</li>
     *   <li>anything else (including files shorter than the BOM) gives {@code "gb2312"}</li>
     * </ul>
     *
     * @param path path of the file to inspect
     * @return the charset name to pass to {@link InputStreamReader}
     * @throws Exception if the file cannot be opened or read
     */
    public static String getCode(String path) throws Exception {
        byte[] head = new byte[3];
        try (InputStream inputStream = new FileInputStream(path)) {
            // A single read() may legally return fewer bytes than requested, so keep
            // reading until the header is full or the file ends.
            int filled = 0;
            int n;
            while (filled < head.length
                    && (n = inputStream.read(head, filled, head.length - filled)) > 0) {
                filled += n;
            }
        }
        // Bytes that were never read stay 0 and therefore match none of the marks.
        String code = "gb2312"; // or GBK
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            code = "UTF-16";
        } else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            code = "Unicode";
        } else if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            code = "UTF-8";
        }
        System.out.println(code);
        return code;
    }
}