知识点:
1. txt默认的选项是ANSI,即GBK编码
2. txt文本文档有四种编码选项:ANSI、Unicode、Unicode big endian、UTF-8
3. 因此我们读取txt文件可能有时候并不知道其编码格式,所以需要用程序动态判断获取txt文件编码
ANSI: 无格式定义
Unicode: 前两个字节为FFFE Unicode文档以0xFFFE开头
Unicode big endian: 前两字节为FEFF
UTF-8: 前三个字节为EF BB BF,即带BOM的UTF-8文档以0xEFBBBF开头
1. txt默认的选项是ANSI,即GBK编码
2. txt文本文档有四种编码选项:ANSI、Unicode、Unicode big endian、UTF-8
3. 因此我们读取txt文件可能有时候并不知道其编码格式,所以需要用程序动态判断获取txt文件编码
ANSI: 无格式定义
Unicode: 前两个字节为FFFE Unicode文档以0xFFFE开头
Unicode big endian: 前两字节为FEFF
UTF-8: 前三个字节为EF BB BF,即带BOM的UTF-8文档以0xEFBBBF开头
用程序取出前几个字节并进行判断即可。
4.java编码与txt编码对应
java | txt |
Unicode | Unicode big endian |
utf-8 | utf-8 |
utf-16 | Unicode |
gb2312 | ANSI |
java读取txt文件时,如果编码格式不匹配,就会出现乱码现象,所以读取txt文件的时候需要设置读取编码。除ANSI外,txt文档的编码格式都写在文件头(BOM)中,程序需要先解析文件头得到编码格式,再按此格式读取文件,就不会产生乱码了。
package com.sun;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
public class read_file {

    /**
     * The byte-order mark as it appears after decoding. Java's UTF-8 decoder
     * passes it through as an ordinary character, so it has to be removed by hand.
     */
    private static final char BOM = '\uFEFF';

    /**
     * Demo entry point: reads a hard-coded file and prints its content.
     *
     * @param args unused
     * @throws Exception declared for convenience; {@link #readTxt(String)} itself reports
     *                   failures on stderr instead of throwing
     */
    public static void main(String[] args) throws Exception {
        String filePath = "g:\\aaa\\789.txt";
        String content = readTxt(filePath);
        System.out.println(content);
    }

    /**
     * Reads a plain text file, decoding it with the charset chosen by
     * {@link #resolveCode(String)}.
     *
     * <p>Lines are appended one after another without their line terminators, so the
     * result contains no line breaks. A leading UTF-8 byte-order mark is not part of
     * the returned text.
     *
     * <p>This method is best-effort: on any failure the stack trace and an error message
     * are written to stderr and whatever was read so far (possibly the empty string) is
     * returned.
     *
     * @param path path of the file to read
     * @return the concatenated lines of the file
     */
    public static String readTxt(String path) {
        StringBuilder content = new StringBuilder();
        try {
            String code = resolveCode(path);
            // try-with-resources closes the stream even when the charset is
            // unsupported or readLine() fails part-way through the file.
            try (InputStream is = new FileInputStream(new File(path));
                 BufferedReader br = new BufferedReader(new InputStreamReader(is, code))) {
                // resolveCode only answers "UTF-8" when the file starts with EF BB BF,
                // and the UTF-8 decoder hands that BOM back as U+FEFF.
                boolean stripBom = "UTF-8".equals(code);
                String line;
                while ((line = br.readLine()) != null) {
                    if (stripBom) {
                        stripBom = false;
                        if (!line.isEmpty() && line.charAt(0) == BOM) {
                            line = line.substring(1);
                        }
                    }
                    content.append(line);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("读取文件:" + path + "失败!");
        }
        return content.toString();
    }

    /**
     * Guesses the charset of a text file from its first bytes (byte-order mark).
     *
     * <ul>
     *   <li>{@code FF FE} gives {@code "UTF-16"}</li>
     *   <li>{@code FE FF} gives {@code "Unicode"}</li>
     *   <li>{@code EF BB BF} gives {@code "UTF-8"}</li>
     *   <li>anything else, including files shorter than the mark, gives {@code "gb2312"}</li>
     * </ul>
     *
     * <p>The chosen charset name is also printed to stdout.
     *
     * @param path path of the file to inspect
     * @return the charset name to pass to {@link InputStreamReader}
     * @throws Exception if the file cannot be opened or read
     */
    public static String resolveCode(String path) throws Exception {
        byte[] head = new byte[3];
        try (InputStream inputStream = new FileInputStream(path)) {
            // A single read() may legally return fewer bytes than requested, so keep
            // reading until the header is full or the file ends. Bytes that are never
            // filled stay 0 and therefore match no byte-order mark.
            int filled = 0;
            while (filled < head.length) {
                int n = inputStream.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        }

        String code = "gb2312"; // or GBK
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            code = "UTF-16";
        } else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            code = "Unicode";
        } else if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            code = "UTF-8";
        }
        System.out.println(code);
        return code;
    }
}