前段时间工作中需要批量抽取大量 txt 文件的内容，但这些文件的编码格式并不统一，内容中还夹杂着全角字符。查阅资料学习之后，我编写了下面这个工具类，提供两项功能：一是检测 txt 文件的编码格式（默认按简体中文系统处理，其他系统语言可自行修改默认编码），二是将全角字符转换为半角字符。废话少说，看代码：
package demo.ok;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Reads text files whose encoding is not known in advance and normalises
 * full-width characters to their half-width (ASCII) equivalents.
 *
 * <p>Encoding detection looks for a byte-order mark first and then falls back
 * to a byte-pattern heuristic that distinguishes BOM-less UTF-8 from the
 * legacy "ANSI" code page. The ANSI code page is assumed to be GBK
 * (Simplified Chinese systems); change {@code DEFAULT_CHARSET} for other
 * locales, e.g. BIG5 for Traditional Chinese.
 */
public class TextReaderEncode {

    /** Charset assumed when neither a BOM nor a UTF-8 byte pattern is found. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Byte-order mark (U+FEFF) as it appears after decoding. */
    private static final char BOM = (char) 0xFEFF;

    /** First code point of the full-width ASCII variants (U+FF01). */
    private static final int FULL_WIDTH_FIRST = 0xFF01;

    /** Last code point of the full-width ASCII variants (U+FF5E). */
    private static final int FULL_WIDTH_LAST = 0xFF5E;

    /** Distance between a full-width variant and its ASCII counterpart. */
    private static final int FULL_WIDTH_OFFSET = 0xFEE0;

    /** Ideographic (full-width) space, U+3000. */
    private static final int IDEOGRAPHIC_SPACE = 0x3000;

    /**
     * Prints the normalised content of a text file.
     *
     * @param args optional; {@code args[0]} is the file to read. When absent the
     *             original sample path is used.
     * @throws Exception declared for backward compatibility
     */
    public static void main(String[] args) throws Exception {
        String path = args.length > 0
                ? args[0]
                : "C:\\Users\\Administrator\\Desktop\\1\\UTF-8.txt";
        // Detect the encoding, decode the file, convert full-width to half-width.
        String results = getTextFromText(path);
        System.out.println(results);
    }

    /**
     * Reads a whole text file, auto-detecting its encoding, and converts
     * full-width characters to half-width.
     *
     * <p>Every line of the file is terminated with a single {@code '\n'} in the
     * result regardless of the line separator used in the file. A leading
     * byte-order mark is removed.
     *
     * @param filePath path of the file to read
     * @return the normalised file content, or {@code null} if the file cannot be
     *         opened or read (the stack trace is printed to standard error)
     */
    public static String getTextFromText(String filePath) {
        // The file is opened before the charset is probed so that a missing file
        // is reported exactly once; try-with-resources closes both on any exit.
        try (FileInputStream in = new FileInputStream(filePath);
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(in, getFilecharset(filePath)))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            // The UTF-8 and UTF-16BE decoders keep the BOM as a regular
            // character; drop it so it does not leak into the content.
            if (sb.length() > 0 && sb.charAt(0) == BOM) {
                sb.deleteCharAt(0);
            }
            // qToB leaves text without full-width characters untouched, so it is
            // applied unconditionally instead of guessing from the byte length
            // in the platform default charset.
            return qToB(sb.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Converts full-width characters to their half-width equivalents.
     *
     * <p>Characters in U+FF01..U+FF5E are shifted down to U+0021..U+007E and the
     * ideographic space U+3000 becomes an ASCII space. All other characters are
     * left unchanged.
     *
     * @param fullWidthStr text to convert; may be {@code null}
     * @return the converted text, or an empty string when the input is
     *         {@code null} or empty
     */
    public static String qToB(String fullWidthStr) {
        if (fullWidthStr == null || fullWidthStr.isEmpty()) {
            return "";
        }
        char[] chars = fullWidthStr.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            int code = chars[i];
            if (code >= FULL_WIDTH_FIRST && code <= FULL_WIDTH_LAST) {
                chars[i] = (char) (code - FULL_WIDTH_OFFSET);
            } else if (code == IDEOGRAPHIC_SPACE) {
                chars[i] = ' ';
            }
        }
        return new String(chars);
    }

    /**
     * Guesses the charset of a text file.
     *
     * <p>A byte-order mark decides the charset directly: FF FE gives "Unicode"
     * (UTF-16, little endian), FE FF gives "UTF-16BE", EF BB BF gives "UTF-8".
     * Without a BOM the bytes are scanned with {@link #looksLikeUtf8}; if that
     * does not recognise UTF-8 the file is treated as ANSI, i.e.
     * {@code DEFAULT_CHARSET}.
     *
     * @param sourceFile path of the file to inspect
     * @return a charset name accepted by {@link InputStreamReader};
     *         {@code DEFAULT_CHARSET} for empty, unrecognised or unreadable files
     */
    private static String getFilecharset(String sourceFile) {
        String charset = DEFAULT_CHARSET;
        byte[] head = new byte[3];
        try (BufferedInputStream bis =
                     new BufferedInputStream(new FileInputStream(sourceFile))) {
            // The read limit must cover the bytes consumed before reset().
            bis.mark(head.length);
            int read = bis.read(head, 0, head.length);
            if (read == -1) {
                return charset; // empty file
            }
            if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                return "Unicode";
            }
            if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
                return "UTF-8";
            }
            // No BOM: rewind and inspect the byte patterns from the start.
            bis.reset();
            if (looksLikeUtf8(bis)) {
                charset = "UTF-8";
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return charset;
    }

    /**
     * Heuristically decides whether a BOM-less byte stream is UTF-8.
     *
     * <p>The stream is scanned until the first byte sequence that settles the
     * question. A well-formed three-byte sequence (E0..EF followed by two
     * continuation bytes) is taken as UTF-8. A stray continuation byte, a lead
     * byte of F0 or above, or a malformed sequence is taken as not UTF-8.
     * Well-formed two-byte sequences are ambiguous with GBK and are skipped.
     *
     * @param in stream positioned at the first byte of the file
     * @return {@code true} if a three-byte UTF-8 sequence was found before any
     *         contradicting byte; {@code false} otherwise, including for pure
     *         ASCII content
     * @throws IOException if reading fails
     */
    private static boolean looksLikeUtf8(BufferedInputStream in) throws IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b < 0x80) {
                continue; // ASCII, carries no information
            }
            if (b <= 0xBF || b >= 0xF0) {
                return false; // stray continuation byte or four-byte lead
            }
            if (b <= 0xDF) {
                // Two-byte sequence: valid in both UTF-8 and GBK, keep looking.
                if (isContinuation(in.read())) {
                    continue;
                }
                return false;
            }
            // Lead byte E0..EF: expect exactly two continuation bytes.
            return isContinuation(in.read()) && isContinuation(in.read());
        }
        return false;
    }

    /** Returns whether {@code b} is a UTF-8 continuation byte (80..BF). */
    private static boolean isContinuation(int b) {
        return b >= 0x80 && b <= 0xBF;
    }
}