package com.thinkgem.jeesite.modules.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;
public class EncodingDetect {
// Encoding identifiers. Each value is an index into the score array built by
// detectEncoding(byte[]) and into the codings / nicename tables below.
private static final int GB2312 = 0;
private static final int GBK = 1;
private static final int HZ = 2;
private static final int BIG5 = 3;
private static final int EUC_TW = 4;
private static final int ISO_2022_CN = 5;
private static final int UTF8 = 6;
private static final int UNICODE = 7;
private static final int ASCII = 8;
private static final int OTHER = 9;
private static final int TOTAL_ENCODINGS = 10;

// Frequency tables to hold the GB, GBK, Big5, and EUC-TW character
// frequencies. They are allocated in the constructor.
private int GBFreq[][];
private int GBKFreq[][];
private int Big5Freq[][];
private int EUC_TWFreq[][];

/** Display name of each encoding, indexed by the encoding constants above. */
public static String[] nicename;
/** Identifier string of each encoding, indexed by the encoding constants above. */
public static String[] codings;

// The name tables are static, so they are filled once when the class is
// loaded. Previously they were (re)built inside the instance constructor,
// which left them null until the first EncodingDetect was created.
static {
    codings = new String[TOTAL_ENCODINGS];
    codings[GB2312] = "GB2312";
    codings[GBK] = "GBK";
    codings[HZ] = "HZ";
    codings[BIG5] = "BIG5";
    codings[EUC_TW] = "CNS11643";
    codings[ISO_2022_CN] = "ISO2022CN";
    codings[UTF8] = "UTF8";
    codings[UNICODE] = "Unicode";
    codings[ASCII] = "ASCII";
    codings[OTHER] = "OTHER";

    nicename = new String[TOTAL_ENCODINGS];
    nicename[GB2312] = "GB2312";
    nicename[GBK] = "GBK";
    nicename[HZ] = "HZ";
    nicename[BIG5] = "Big5";
    nicename[EUC_TW] = "CNS 11643";
    nicename[ISO_2022_CN] = "ISO 2022-CN";
    nicename[UTF8] = "UTF-8";
    nicename[UNICODE] = "Unicode";
    nicename[ASCII] = "ASCII";
    nicename[OTHER] = "OTHER";
}

/**
 * Creates a detector: allocates the GB, GBK, Big5 and EUC-TW frequency
 * tables and then calls initialize_frequencies(), which presumably fills
 * them with the per-character weights (defined elsewhere in this class).
 */
public EncodingDetect() {
    // Table dimensions are (lead-byte rows) x (trail-byte columns).
    GBFreq = new int[94][94];
    GBKFreq = new int[126][191];
    Big5Freq = new int[94][158];
    EUC_TWFreq = new int[94][94];
    initialize_frequencies();
}
/**
 * Guesses the encoding of the content served at a URL.
 *
 * At most the first 10000 bytes are read into a fixed-size buffer (any
 * unread tail of the buffer stays zero) and the buffer is scored by
 * {@link #detectEncoding(byte[])}.
 *
 * @param testurl URL whose content is examined
 * @return one of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 *         ISO_2022_CN, UTF8, UNICODE, ASCII or OTHER); OTHER is also
 *         returned when the URL cannot be opened or read, in which case the
 *         error is reported on System.err
 */
public int detectEncoding(URL testurl) {
    byte[] rawtext = new byte[10000];
    int bytesread = 0, byteoffset = 0;
    int guess = OTHER;
    InputStream chinesestream = null;
    try {
        chinesestream = testurl.openStream();
        // read() returns 0 once the buffer is full (requested length 0) and
        // -1 at end of stream; either one ends the loop.
        while ((bytesread = chinesestream.read(rawtext, byteoffset,
                rawtext.length - byteoffset)) > 0) {
            byteoffset += bytesread;
        }
        guess = detectEncoding(rawtext);
    } catch (Exception e) {
        System.err.println("Error loading or using URL " + e.toString());
        guess = OTHER;
    } finally {
        // Close on every path so a failed read does not leak the connection.
        if (chinesestream != null) {
            try {
                chinesestream.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if close fails; the guess
                // computed from the bytes already read is still valid.
            }
        }
    }
    return guess;
}
/**
 * Guesses the encoding of a file.
 *
 * The whole file is read into memory and scored by
 * {@link #detectEncoding(byte[])}. If the file cannot be opened or read,
 * the error is reported on System.err and detection still runs on the
 * buffer as far as it was filled (the rest stays zero).
 *
 * @param testfile file to examine; its length is narrowed to an int, so
 *                 files larger than Integer.MAX_VALUE bytes are not supported
 * @return one of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 *         ISO_2022_CN, UTF8, UNICODE, ASCII or OTHER)
 */
public int detectEncoding(File testfile) {
    byte[] rawtext = new byte[(int) testfile.length()];
    FileInputStream chinesefile = null;
    try {
        chinesefile = new FileInputStream(testfile);
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        int offset = 0;
        int count;
        while (offset < rawtext.length
                && (count = chinesefile.read(rawtext, offset, rawtext.length - offset)) > 0) {
            offset += count;
        }
    } catch (Exception e) {
        System.err.println("Error: " + e);
    } finally {
        // The stream was never closed before; release the file handle here.
        if (chinesefile != null) {
            try {
                chinesefile.close();
            } catch (Exception ignored) {
                // Failure to close does not affect the bytes already read.
            }
        }
    }
    return detectEncoding(rawtext);
}
/**
 * Guesses the encoding of a byte array.
 *
 * Every supported encoding is given a score by its *_probability method;
 * the encoding with the strictly highest score wins (the lowest index wins
 * ties). If the best score is not above 50, OTHER is returned.
 *
 * @param rawtext bytes to examine
 * @return one of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 *         ISO_2022_CN, UTF8, UNICODE, ASCII or OTHER)
 */
public int detectEncoding(byte[] rawtext) {
    final int[] scores = new int[TOTAL_ENCODINGS];

    // Score each candidate; the OTHER slot keeps its default of 0.
    scores[GB2312] = gb2312_probability(rawtext);
    scores[GBK] = gbk_probability(rawtext);
    scores[HZ] = hz_probability(rawtext);
    scores[BIG5] = big5_probability(rawtext);
    scores[EUC_TW] = euc_tw_probability(rawtext);
    scores[ISO_2022_CN] = iso_2022_cn_probability(rawtext);
    scores[UTF8] = utf8_probability(rawtext);
    scores[UNICODE] = utf16_probability(rawtext);
    scores[ASCII] = ascii_probability(rawtext);

    // Pick the first candidate holding the highest positive score.
    int best = OTHER;
    int bestScore = 0;
    for (int candidate = 0; candidate < TOTAL_ENCODINGS; candidate++) {
        final int score = scores[candidate];
        if (score > bestScore) {
            bestScore = score;
            best = candidate;
        }
    }

    // A winner must score above 50 to be trusted.
    return bestScore > 50 ? best : OTHER;
}
/*
 * Function: gb2312_probability
 * Argument: byte array to examine
 * Returns : score (nominally 0 to 100) for the likelihood that the array is
 * GB-2312 text. Half of the score is the share of high-bit byte pairs that
 * fall inside the GB-2312 lead/trail ranges, the other half comes from the
 * GBFreq weights of those pairs.
 */
int gb2312_probability(byte[] rawtext) {
    // Both counters start at 1 so the ratios below never divide by zero.
    int highBitPairs = 1;
    int validPairs = 1;
    long freqSum = 0;
    long freqCeiling = 1;

    final int lastIndex = rawtext.length - 1;
    int pos = 0;
    while (pos < lastIndex) {
        if (rawtext[pos] >= 0) {
            // Plain 7-bit byte: step over it.
            pos++;
            continue;
        }
        highBitPairs++;
        final int lead = rawtext[pos] & 0xFF;
        final int trail = rawtext[pos + 1] & 0xFF;
        final boolean leadInRange = 0xA1 <= lead && lead <= 0xF7;
        final boolean trailInRange = 0xA1 <= trail && trail <= 0xFE;
        if (leadInRange && trailInRange) {
            validPairs++;
            freqCeiling += 500;
            final int row = lead - 0xA1;
            final int column = trail - 0xA1;
            final int weight = GBFreq[row][column];
            if (weight != 0) {
                freqSum += weight;
            } else if (15 <= row && row < 55) {
                // No table entry: rows 15..54 still earn a flat weight.
                freqSum += 200;
            }
        }
        // A high-bit byte always consumes the byte after it as well.
        pos += 2;
    }

    final float rangeval = 50 * ((float) validPairs / (float) highBitPairs);
    final float freqval = 50 * ((float) freqSum / (float) freqCeiling);
    return (int) (rangeval + freqval);
}
/*
 * Function: gbk_probability
 * Argument: byte array to examine
 * Returns : score for the likelihood that the array is GBK text. It is
 * computed like the GB-2312 score (share of high-bit pairs in range plus a
 * frequency-table share) but also accepts the extended GBK lead/trail
 * ranges, and is then lowered by one point.
 *
 * NOTE(review): in the extended range, trail bytes 0x40..0x7E map to
 * columns 0..62 and trail bytes 0x80..0xFE map to columns 0..126, so the
 * two ranges share column indexes in GBKFreq - confirm this matches how the
 * table is populated.
 */
int gbk_probability(byte[] rawtext) {
    // Both counters start at 1 so the ratios below never divide by zero.
    int highBitPairs = 1;
    int validPairs = 1;
    long freqSum = 0;
    long freqCeiling = 1;

    final int lastIndex = rawtext.length - 1;
    int pos = 0;
    while (pos < lastIndex) {
        if (rawtext[pos] >= 0) {
            // Plain 7-bit byte: step over it.
            pos++;
            continue;
        }
        highBitPairs++;
        final int lead = rawtext[pos] & 0xFF;
        final int trail = rawtext[pos + 1] & 0xFF;
        final boolean trailLow = 0x40 <= trail && trail <= 0x7E;
        final boolean trailHigh = 0x80 <= trail && trail <= 0xFE;

        if (0xA1 <= lead && lead <= 0xF7 && 0xA1 <= trail && trail <= 0xFE) {
            // Original GB range: scored against the GB-2312 table.
            validPairs++;
            freqCeiling += 500;
            final int row = lead - 0xA1;
            final int column = trail - 0xA1;
            final int weight = GBFreq[row][column];
            if (weight != 0) {
                freqSum += weight;
            } else if (15 <= row && row < 55) {
                freqSum += 200;
            }
        } else if (0x81 <= lead && lead <= 0xFE && (trailHigh || trailLow)) {
            // Extended GB range: scored against the GBK table only.
            validPairs++;
            freqCeiling += 500;
            final int row = lead - 0x81;
            final int column = trailLow ? trail - 0x40 : trail - 0x80;
            // A zero entry contributes nothing, so it can be added directly.
            freqSum += GBKFreq[row][column];
        }
        // A high-bit byte always consumes the byte after it as well.
        pos += 2;
    }

    final float rangeval = 50 * ((float) validPairs / (float) highBitPairs);
    final float freqval = 50 * ((float) freqSum / (float) freqCeiling);
    // For regular GB files this would give the same score as
    // gb2312_probability, so GBK is handicapped slightly.
    return (int) (rangeval + freqval) - 1;
}
/*
 * Function: hz_probability
 * Argument: byte array to examine
 * Returns : score (nominally 0 to 100) for the likelihood that the array is
 * HZ text. Up to 50 points come from how many "~{" shift-in sequences were
 * seen, the rest from the GBFreq weights of the 7-bit byte pairs that
 * follow each "~{" up to the matching "~}" or the end of the line.
 *
 * A trailing '~' at the very end of the input is ignored; it used to cause
 * an ArrayIndexOutOfBoundsException.
 *
 * NOTE(review): bytes with the high bit set inside a "~{" run are skipped
 * without being scored. The original code had a branch for them, but it
 * compared a signed byte against 0xA1..0xF7 and so could never match; it
 * was removed without changing behaviour. Both bytes of a pair are also
 * limited to 0x21..0x77 - confirm whether the second byte should be
 * allowed up to 0x7E.
 */
int hz_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    long hzfreq = 0, totalfreq = 1; // totalfreq starts at 1 to avoid dividing by zero
    int hzstart = 0;

    for (int i = 0; i < rawtextlen; i++) {
        // An escape needs a following byte; a lone '~' at the end is not one.
        if (rawtext[i] != '~' || i + 1 >= rawtextlen) {
            continue;
        }
        final byte next = rawtext[i + 1];
        if (next == '{') {
            // "~{" switches to two-byte mode: score pairs until "~}" or a
            // line break.
            hzstart++;
            i += 2;
            while (i < rawtextlen - 1) {
                if (rawtext[i] == 0x0A || rawtext[i] == 0x0D) {
                    break;
                } else if (rawtext[i] == '~' && rawtext[i + 1] == '}') {
                    i++;
                    break;
                } else if ((0x21 <= rawtext[i] && rawtext[i] <= 0x77)
                        && (0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77)) {
                    final int row = rawtext[i] - 0x21;
                    final int column = rawtext[i + 1] - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        hzfreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        // No table entry: rows 15..54 still earn a flat weight.
                        hzfreq += 200;
                    }
                }
                i += 2;
            }
        } else if (next == '}' || next == '~') {
            // Stray "~}" or the escaped tilde "~~": skip the second byte.
            i++;
        }
    }

    final float rangeval;
    if (hzstart > 4) {
        rangeval = 50;
    } else if (hzstart > 1) {
        rangeval = 41;
    } else if (hzstart > 0) {
        // Only 39 in case the sequence happened to occur in otherwise
        // non-HZ text.
        rangeval = 39;
    } else {
        rangeval = 0;
    }
    final float freqval = 50 * ((float) hzfreq / (float) totalfreq);
    return (int) (rangeval + freqval);
}
/**
* Function: big5_probability Argument: byte array Returns : number from 0
* to 100 representing probability text in array uses Big5 encoding
*/
int big5_probability(byte[] rawtext) {
int score = 0;
int i, rawtextlen = 0;
int dbchars = 1, bfchars = 1;
float rangeval = 0, freqval = 0;
long bffreq = 0, totalfreq = 1;
int row, column;
// Check to see if characters fit into acceptable ranges
rawtextlen = rawtext.length;
for (i = 0; i < rawtextlen - 1; i++) {
if (rawtext[i] >= 0) {
// asciichars++;
} else {
dbchars++;
if ((byte) 0xA1 <= rawtext[i]
&& rawtext[i] <= (byte) 0xF9
&& (((byte) 0x40 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0x7E) || ((byte) 0xA1 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE))) {
bfchars++;
totalfreq += 500;
row = rawtext[i] + 256 - 0xA1;
if (0x40 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {
column = rawtext[i + 1] - 0x40;
} else {
column = rawtext[i + 1] + 256 - 0x61;
}
if (Big5Freq[row][column] != 0) {
bffreq += Big5Freq[row][column];
} else if (3 <= row && ro