jchardet 是 Mozilla 自动字符集探测算法代码的 Java 移植。这个算法的最初作者是 Frank Tang,C++ 源代码在 http://lxr.mozilla.org/mozilla/source/intl/chardet/ ,可以从 http://www.mozilla.org/projects/intl/chardet.html 得到更多关于这个算法的信息。
示例代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
// Initialize the nsDetector.
// argv[1], when present, is parsed as the integer handed to the nsDetector
// constructor; otherwise nsPSMDetector.ALL is used.
int lang = (argv.length == 2) ? Integer.parseInt(argv[1]) : nsPSMDetector.ALL;
nsDetector det = new nsDetector(lang);

// Set an observer...
// Notify() will be called when a matching charset is found.
det.Init(new nsICharsetDetectionObserver() {
    public void Notify(String charset) {
        HtmlCharsetDetector.found = true;
        System.out.println("CHARSET = " + charset);
    }
});

// argv[0] is the URL whose content is fed to the detector.
URL url = new URL(argv[0]);
BufferedInputStream imp = new BufferedInputStream(url.openStream());

byte[] buf = new byte[1024];
int len;
boolean done = false;
boolean isAscii = true;

try {
    while ((len = imp.read(buf, 0, buf.length)) != -1) {
        // Check if the stream is only ascii.
        if (isAscii) {
            isAscii = det.isAscii(buf, len);
        }

        // DoIt if non-ascii and not done yet.
        if (!isAscii && !done) {
            done = det.DoIt(buf, len, false);
        }

        // Once done is set, isAscii is already false, so neither branch
        // above can run again; stop reading instead of draining the stream.
        if (done) {
            break;
        }
    }
} finally {
    // Always release the underlying connection, even if read() throws.
    imp.close();
}

det.DataEnd();

// No non-ascii byte was ever seen, so report plain ASCII.
if (isAscii) {
    System.out.println("CHARSET = ASCII");
    found = true;
}
|