在Java Server后台可能会遇到读取文本文件时的乱码问题,个人认为解决问题的最有效方法就是使用文件内容的字符编码来读取该文件。
问题又来了:文件是用户上传的,我们小程序员没办法限制用户只能上传UTF-8文件或者只能上传GBK文件,所以只能在代码中对文件进行检测,看用户上传的究竟是哪种编码格式。
而jchardet是Firefox使用的字节流编码检测算法的Java开源实现,协议为MPL(Mozilla Public License),对商业友好,可以考虑用它来检测文件的编码。相关资料链接: http://www.mozilla.org/projects/intl/chardet.html 和 http://jchardet.sourceforge.net/
用jchardet的API依葫芦画瓢写上一个小Demo:
import java.io.BufferedInputStream;
import java.io.IOException;
import org.apache.commons.fileupload.FileItem;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
/**
 * Guesses the character encoding of an uploaded file by feeding its bytes to a
 * jchardet {@code nsDetector}.
 *
 * <p>The detection result is held in instance fields that the detector callback
 * writes to, so one instance must not be used by several threads at the same time.
 * The fields are reset at the start of every detection run, so an instance can be
 * reused sequentially for several files.
 */
public class FileCharsetDetector
{
    /** True once a charset has been determined for the current detection run. */
    private boolean found = false;

    /** Charset name determined for the current detection run, or null if none yet. */
    private String encoding = null;

    /**
     * Guesses the encoding of the given uploaded file using a fresh {@code nsDetector}.
     *
     * @param fileItem the uploaded file whose content is inspected
     * @return the detected charset name, {@code "ASCII"} when every byte read was
     *         reported as ASCII, or {@code null} when the detector offers no candidate
     * @throws IOException if reading the file content fails
     */
    public String guestFileEncoding(FileItem fileItem) throws IOException
    {
        return guestFileEncoding(fileItem, new nsDetector());
    }

    /**
     * Runs the supplied detector over the content of the file.
     *
     * @param fileItem the uploaded file whose content is inspected
     * @param det      the detector to initialise and feed
     * @return the detected charset name, {@code "ASCII"}, or {@code null}
     * @throws IOException if reading or closing the file stream fails
     */
    private String guestFileEncoding(FileItem fileItem, nsDetector det) throws IOException
    {
        // Clear the outcome of any earlier run; otherwise a stale "found" flag would
        // make this call return the previous file's encoding.
        found = false;
        encoding = null;

        // Set an observer...
        // The Notify() will be called when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver()
        {
            public void Notify(String charset)
            {
                found = true;
                encoding = charset;
            }
        });

        boolean done = false;
        boolean isAscii = true;
        BufferedInputStream in = new BufferedInputStream(fileItem.getInputStream());
        try
        {
            byte[] buf = new byte[1024];
            int len;
            // Stop as soon as the detector reports it is done: "done" can only become
            // true after isAscii has turned false, so nothing further would be examined.
            while (!done && (len = in.read(buf, 0, buf.length)) != -1)
            {
                // Check if the stream is only ascii.
                if (isAscii)
                {
                    isAscii = det.isAscii(buf, len);
                }
                // DoIt if non-ascii and not done yet.
                if (!isAscii)
                {
                    done = det.DoIt(buf, len, false);
                }
            }
        }
        finally
        {
            // Always release the upload stream, even when reading fails.
            in.close();
        }
        det.DataEnd();

        if (isAscii)
        {
            encoding = "ASCII";
            found = true;
        }
        if (!found)
        {
            // No definite answer: fall back to the detector's first probable charset.
            String[] prob = det.getProbableCharsets();
            if (prob.length > 0)
            {
                encoding = prob[0];
            }
            else
            {
                return null;
            }
        }
        return encoding;
    }
}