需求:
需要检测某个文件或某个字节流的编码格式。
实现:
基于jchardet
<dependency> <groupId>net.sourceforge.jchardet</groupId> <artifactId>jchardet</artifactId> <version>1.0</version> </dependency>
代码如下:
/**
 * Static helpers that guess the character encoding of a byte stream with jchardet,
 * configured with the detector's {@code CHINESE} language setting.
 */
public class DetectorUtils {

    /** Number of bytes read from the stream and handed to the detector per iteration. */
    private static final int CHUNK_SIZE = 1024;

    private DetectorUtils() {
        // Static utility holder; never instantiated.
    }

    /**
     * Observer that remembers the charset name passed to {@link #Notify(String)}.
     * An instance is registered with the detector via {@code Init}; presumably the
     * detector calls {@code Notify} once it settles on a charset (confirm against the
     * jchardet documentation).
     */
    static class ChineseCharsetDetectionObserver implements nsICharsetDetectionObserver {

        private boolean found;
        private String result;

        /**
         * @param found  initial value of the "charset reported" flag
         * @param result initial charset name, returned by {@link #getResult()} until
         *               {@link #Notify(String)} replaces it
         */
        public ChineseCharsetDetectionObserver(boolean found, String result) {
            this.found = found;
            this.result = result;
        }

        /** Stores the reported charset and marks this observer as having a result. */
        public void Notify(String charset) {
            this.result = charset;
            this.found = true;
        }

        /** @return {@code true} once a charset has been reported (or if constructed as found) */
        public boolean isFound() {
            return this.found;
        }

        /** @return the most recently stored charset name */
        public String getResult() {
            return this.result;
        }
    }

    /**
     * Reads {@code in} in fixed-size chunks and reports the charset(s) the detector
     * considers plausible.
     *
     * <p>The outcome is one of:
     * <ul>
     *   <li>{@code {"ASCII"}} when every chunk read passed the detector's ASCII check
     *       (this includes a stream that yields no bytes at all);</li>
     *   <li>a single-element array holding the charset delivered to the observer;</li>
     *   <li>otherwise whatever {@code getProbableCharsets()} returns.</li>
     * </ul>
     *
     * <p>The supplied stream is always closed before this method returns, whether it
     * completes normally or throws.
     *
     * @param in the bytes to inspect; closed by this method
     * @return the candidate charset names as described above
     * @throws Exception if reading the stream or running the detector fails
     */
    public static String[] detectChineseCharset(InputStream in)
            throws Exception {
        BufferedInputStream buffered = null;
        try {
            // Seed value for the observer; it is only read back if Notify never replaces it
            // and the observer was constructed as "found", which is not the case here.
            String seedCharset = Charsets.UTF_8.toString();
            nsDetector detector = new nsDetector(nsPSMDetector.CHINESE);
            ChineseCharsetDetectionObserver observer =
                    new ChineseCharsetDetectionObserver(false, seedCharset);
            detector.Init(observer);

            buffered = new BufferedInputStream(in);
            byte[] chunk = new byte[CHUNK_SIZE];
            boolean asciiOnly = true;
            int count;
            while ((count = buffered.read(chunk, 0, chunk.length)) != -1) {
                // Once a non-ASCII chunk has been seen, the ASCII check is skipped for good.
                asciiOnly = asciiOnly && detector.isAscii(chunk, count);
                // Only non-ASCII data is fed to the detector; a true return ends the scan early.
                if (!asciiOnly && detector.DoIt(chunk, count, false)) {
                    break;
                }
            }
            detector.DataEnd();

            if (asciiOnly) {
                return new String[] { "ASCII" };
            }
            if (observer.isFound()) {
                return new String[] { observer.getResult() };
            }
            return detector.getProbableCharsets();
        } finally {
            IOUtils.closeQuietly(buffered);
            IOUtils.closeQuietly(in);
        }
    }
}
测试:
// Demo: print every charset candidate reported for a sample file.
String samplePath = "C:/3737001.xml";
// detectChineseCharset closes the stream it is given, so no explicit close is needed here.
FileInputStream sampleStream = new FileInputStream(samplePath);
String[] candidates = DetectorUtils.detectChineseCharset(sampleStream);
for (int i = 0; i < candidates.length; i++) {
    System.out.println(candidates[i]);
}
依赖的jar参见附件