网上流行的文本编码自动检测代码,大多是互相拷贝的,其源头是一位名为 Erik Peterson 的人所写的实现。
看了一下其代码,似乎有个地方写错了。
请看下面代码中添加的中文注释:
sjis_probability(unsigned char* rawtext) {
unsigned int i, rawtextlen = 0;
int dbchars = 1, jpchars = 1;
long jpfreq = 0, totalfreq = 1;
float rangeval = 0, freqval = 0;
int row, column, adjust;
// Stage 1: Check to see if characters fit into acceptable ranges
rawtextlen = strlen((char*)rawtext);
for (i = 0; i < rawtextlen-1; i++) {
//System.err.println(rawtext[i]);
if (rawtext[i] <= 0x7E) {
//asciichars++;
} else {
dbchars++;
if (i+1 < rawtextlen &&
((0x81 <= rawtext[i] && rawtext[i] <= 0x9F) ||
(0xE0 <= rawtext[i] && rawtext[i] <= 0xEF)) &&
((0x40 <= rawtext[i+1] && rawtext[i+1] <= 0x7E) ||
(0x80 <= rawtext[i+1] && rawtext[i+1] <= 0xFC)))
{
jpchars++;
totalfreq += maxfreq;
row = rawtext[i];
column = rawtext[i+1];
if (column < 0x9f) {
adjust = 1;
if (column > 0x7f) {
column -= 0x20;
} else {
column -= 0x19;
}
} else {
adjust = 0;
column -= 0x7e;
}
if (row < 0xa0) {
row = ((row - 0x70) << 1) - adjust;
} else {
row = ((row - 0xb0) << 1) - adjust;
}
row -= 0x20;
column = 0x20; // 这里应该改为:column -= 0x20