import java.lang.*;
import java.util.*;
import java.io.*;
import java.net.*;
public class SinoDetect {
// Encoding identifiers. Each value is also used as an index into the
// codings and nicename arrays, and into the scores array built by
// detectEncoding(byte[]).
static final int GB2312 = 0;
static final int GBK = 1;
static final int HZ = 2;
static final int BIG5 = 3;
static final int EUC_TW = 4;
static final int ISO_2022_CN = 5;
static final int UTF8 = 6;
static final int UNICODE = 7;
static final int ASCII = 8;
// Result reported when no encoding scores high enough or loading fails.
static final int OTHER = 9;
// Number of identifiers above; used as the length of every per-encoding array.
static final int TOTAL_ENCODINGS = 10;
// Frequency tables to hold the GB, GBK, Big5, and EUC-TW character
// frequencies. They are allocated in the constructor.
int GBFreq[][];
int GBKFreq[][];
int Big5Freq[][];
int EUC_TWFreq[][];
//int UnicodeFreq[94][128];
// Per-encoding name tables, indexed by the encoding identifiers above.
// NOTE: both are static but, in the code visible here, are only assigned
// inside the SinoDetect constructor, so they are null until an instance
// has been created.
// nicename holds the human-readable names (main prints nicename[result]).
public static String[] nicename;
// codings holds compact names; presumably charset-style identifiers for
// callers outside this view -- TODO confirm.
public static String[] codings;
public SinoDetect() {
// Initialize the Frequency Table for GB, Big5, EUC-TW
GBFreq = new int[94][94];
GBKFreq = new int[126][191];
Big5Freq = new int[94][158];
EUC_TWFreq = new int[94][94];
codings = new String[TOTAL_ENCODINGS];
codings[GB2312] = "GB2312";
codings[GBK] = "GBK";
codings[HZ] = "HZ";
codings[BIG5] = "BIG5";
codings[EUC_TW] = "CNS11643";
codings[ISO_2022_CN] = "ISO2022CN";
codings[UTF8] = "UTF8";
codings[UNICODE] = "Unicode";
codings[ASCII] = "ASCII";
codings[OTHER] = "OTHER";
nicename = new String[TOTAL_ENCODINGS];
nicename[GB2312] = "GB2312";
nicename[GBK] = "GBK";
nicename[HZ] = "HZ";
nicename[BIG5] = "Big5";
nicename[EUC_TW] = "CNS 11643";
nicename[ISO_2022_CN] = "ISO 2022-CN";
nicename[UTF8] = "UTF-8";
nicename[UNICODE] = "Unicode";
nicename[ASCII] = "ASCII";
nicename[OTHER] = "OTHER";
initialize_frequencies();
}
/**
 * Command-line entry point: detects the encoding of one input and prints
 * its human-readable name to standard output.
 *
 * If the first argument starts with "http://" or "https://" it is fetched
 * as a URL; otherwise the argument's own bytes (platform default charset)
 * are analysed. When no argument is supplied, the built-in sample text is
 * used, which matches what this method previously did for every call.
 *
 * @param argc command-line arguments; only the first one is used
 */
public static void main(String argc[])
{
    // Previously the argument array was always overwritten with the sample,
    // so command-line input was silently ignored. Only fall back to the
    // sample when nothing was passed.
    if (argc == null || argc.length == 0) {
        argc = new String[] { "中文" };
    }
    String input = argc[0];
    SinoDetect sinodetector = new SinoDetect();
    int result = OTHER;
    if (input.startsWith("http://") || input.startsWith("https://")) {
        try {
            result = sinodetector.detectEncoding(new URL(input));
        }
        catch (Exception e) {
            // result stays OTHER; the name for OTHER is printed below.
            System.err.println("Bad URL " + e.toString());
        }
    } else {
        // NOTE(review): a non-URL argument is analysed as literal text, not
        // opened as a file path. detectEncoding(File) exists if file input
        // is the intended CLI contract -- confirm.
        result = sinodetector.detectEncoding(input.getBytes());
    }
    System.out.println(nicename[result]);
}
/** Function : detectEncoding
Arguments : URL
Returns : One of the encoding constants
(GB2312, GBK, HZ, BIG5, EUC_TW, ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
Description: This function reads up to the first 10000 bytes of the URL
contents and passes exactly the bytes that were read to
detectEncoding(byte[]), which scores them against each encoding type and
returns the best-scoring one. OTHER is returned if the URL cannot be
opened or read, or if scoring throws.
*/
public int detectEncoding(URL testurl) {
    byte[] rawtext = new byte[10000];
    int bytesread = 0, byteoffset = 0;
    int guess = OTHER;
    InputStream chinesestream = null;
    try {
        chinesestream = testurl.openStream();
        // read() may deliver fewer bytes than requested, so keep reading
        // until the sample buffer is full or the stream ends.
        while (byteoffset < rawtext.length
                && (bytesread = chinesestream.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
            byteoffset += bytesread;
        }
        // Score only the bytes actually received. Passing the whole buffer
        // would make the unused, zero-filled tail count as document content.
        guess = detectEncoding(Arrays.copyOf(rawtext, byteoffset));
    }
    catch (Exception e) {
        System.err.println("Error loading or using URL " + e.toString());
        guess = OTHER;
    }
    finally {
        // Close on every path, including when read() throws.
        if (chinesestream != null) {
            try {
                chinesestream.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done if close fails; the guess stands.
            }
        }
    }
    return guess;
}
/** Function : detectEncoding
Arguments : File
Returns : One of the encoding constants
(GB2312, GBK, HZ, BIG5, EUC_TW, ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
Description: This function reads the whole file into memory and passes
its bytes to detectEncoding(byte[]), which scores them against each
encoding type and returns the best-scoring one. OTHER is returned if the
file cannot be opened or read, matching the URL overload.
*/
public int detectEncoding(File testfile) {
    // NOTE(review): the cast truncates lengths above Integer.MAX_VALUE;
    // files that large are not handled here.
    byte[] rawtext = new byte[(int)testfile.length()];
    int byteoffset = 0;
    FileInputStream chinesefile = null;
    try {
        chinesefile = new FileInputStream(testfile);
        int bytesread;
        // A single read() is not guaranteed to fill the buffer, so loop
        // until it is full or the file ends.
        while (byteoffset < rawtext.length
                && (bytesread = chinesefile.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
            byteoffset += bytesread;
        }
    }
    catch (Exception e) {
        System.err.println("Error: " + e);
        // Do not score a zero-filled buffer after a failed read.
        return OTHER;
    }
    finally {
        // Release the file handle on every path.
        if (chinesefile != null) {
            try {
                chinesefile.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done if close fails.
            }
        }
    }
    if (byteoffset < rawtext.length) {
        // The file turned out shorter than reported; drop the unread tail.
        rawtext = Arrays.copyOf(rawtext, byteoffset);
    }
    return detectEncoding(rawtext);
}
/** Function : detectEncoding
Arguments : byte array
Returns : One of the encoding constants
(GB2312, GBK, HZ, BIG5, EUC_TW, ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
Description: Asks each per-encoding probability function for a score on
the byte array and returns the encoding with the highest score. On a tie
the encoding with the lower constant value wins. If the best score is 50
or less, OTHER is returned.
*/
public int detectEncoding(byte[] rawtext) {
    final int[] scoreByEncoding = new int[TOTAL_ENCODINGS];

    // One score per candidate encoding; OTHER never competes.
    scoreByEncoding[GB2312] = gb2312_probability(rawtext);
    scoreByEncoding[GBK] = gbk_probability(rawtext);
    scoreByEncoding[HZ] = hz_probability(rawtext);
    scoreByEncoding[BIG5] = big5_probability(rawtext);
    scoreByEncoding[EUC_TW] = euc_tw_probability(rawtext);
    scoreByEncoding[ISO_2022_CN] = iso_2022_cn_probability(rawtext);
    scoreByEncoding[UTF8] = utf8_probability(rawtext);
    scoreByEncoding[UNICODE] = utf16_probability(rawtext);
    scoreByEncoding[ASCII] = ascii_probability(rawtext);
    scoreByEncoding[OTHER] = 0;

    // Pick the first encoding holding the strictly highest score.
    int winner = OTHER;
    int topScore = 0;
    for (int candidate = 0; candidate < scoreByEncoding.length; candidate++) {
        final int candidateScore = scoreByEncoding[candidate];
        if (candidateScore > topScore) {
            topScore = candidateScore;
            winner = candidate;
        }
    }

    // A best score of 50 or below is not convincing enough to report.
    return (topScore > 50) ? winner : OTHER;
}
/* Function: gb2312_probability
Argument: pointer to byte array
Returns : number from 0 to 100 representing probability
text in array uses GB-2312 encoding
*/
int gb2312_probability(byte[] rawtext) {
int i, rawtextlen = 0;
int dbchars = 1, gbchars = 1;
long gbfreq = 0, totalfreq = 1;
float rangeval = 0, freqval = 0;
int row, column;
// Stage 1: Check to see if characters fit into acceptable ranges
rawtextlen = rawtext.length;
for (i = 0; i < rawtextlen-1; i++) {
//System.err.println(rawtext[i]);
if (rawtext[i] >= 0) {
//asciichars++