java获取文件编码
工具类一:
package www.autocheck.email;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;
public class ExchangeUtil extends Encoding {
// Character-frequency tables consulted by the *_probability scorers.
// Every table is allocated in the constructor and addressed as [row][column],
// where row and column are derived from the lead and trail byte of a
// double-byte character.
// GB 2312 table, allocated as 94 x 94.
int GBFreq[][];
// GBK extension table, allocated as 126 x 191.
int GBKFreq[][];
// Big5 table, allocated as 94 x 158.
int Big5Freq[][];
// Big5+ extension table, allocated as 126 x 191.
int Big5PFreq[][];
// EUC-TW (CNS 11643) table, allocated as 94 x 94.
int EUC_TWFreq[][];
// Korean table shared by the EUC-KR and CP949 scorers, allocated as 94 x 94.
int KRFreq[][];
// Japanese table, allocated as 94 x 94.
// NOTE(review): presumably read by the Japanese scorers, which are not visible here - confirm.
int JPFreq[][];
// int UnicodeFreq[94][128];
// public static String[] nicename;
// public static String[] codings;
// When true, detectEncoding(byte[]) prints every candidate's score to System.err.
public boolean debug;
/**
 * Builds a detector with debug output switched off, allocates every frequency table and finally populates the
 * tables through initialize_frequencies().
 */
public ExchangeUtil() {
    super();
    // Tables that use the plain 94 x 94 row/column layout.
    this.GBFreq = new int[94][94];
    this.EUC_TWFreq = new int[94][94];
    this.KRFreq = new int[94][94];
    this.JPFreq = new int[94][94];
    // Big5 has a wider trail-byte range.
    this.Big5Freq = new int[94][158];
    // The extended sets (GBK, Big5+) share the largest layout.
    this.GBKFreq = new int[126][191];
    this.Big5PFreq = new int[126][191];
    this.debug = false;
    // Must run last: it fills the tables allocated above.
    initialize_frequencies();
}
// public static void main(String argc[]) {
// ExchangeUtil sinodetector;
// int result = OTHER;
// int i;
// sinodetector = new ExchangeUtil();
// for (i = 0; i < argc.length; i++) {
// if (argc[i].startsWith("http://") == true) {
// try {
// result = sinodetector.detectEncoding(new URL(argc[i]));
// } catch (Exception e) {
// System.err.println("Bad URL " + e.toString());
// }
// } else if (argc[i].equals("-d")) {
// sinodetector.debug = true;
// continue;
// } else {
// result = sinodetector.detectEncoding(new File(argc[i]));
// }
// System.out.println(nicename[result]);
// }
// }
/**
 * Detects the encoding of the content served at a URL.
 *
 * At most the first 10000 bytes of the stream are sampled. The sample buffer is handed to
 * {@link #detectEncoding(byte[])} at its full size, so unread positions stay zero-filled.
 *
 * @param testurl URL whose content is sampled
 * @return the encoding index chosen by {@link #detectEncoding(byte[])}, or -1 if the URL could not be opened or read
 *         or the scoring itself failed
 */
public int detectEncoding(URL testurl) {
    byte[] rawtext = new byte[10000];
    int byteoffset = 0;
    InputStream chinesestream = null;
    try {
        chinesestream = testurl.openStream();
        int bytesread;
        // read() returns 0 once the buffer is full and -1 at end of stream; both end the loop.
        while ((bytesread = chinesestream.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
            byteoffset += bytesread;
        }
        return detectEncoding(rawtext);
    } catch (Exception e) {
        System.err.println("Error loading or using URL " + e.toString());
        return -1;
    } finally {
        // Always release the connection, including when read() failed part-way through.
        if (chinesestream != null) {
            try {
                chinesestream.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails; the result is already decided.
            }
        }
    }
}
/**
 * Detects the encoding of a file by loading it completely into memory and scoring the bytes.
 *
 * If the file cannot be opened or read, the error is printed to System.err and whatever part of the buffer was
 * filled (the rest stays zero) is still scored. The file length is narrowed to int, so files larger than
 * Integer.MAX_VALUE bytes are not supported.
 *
 * @param testfile file to inspect
 * @return the encoding index chosen by {@link #detectEncoding(byte[])}
 */
public int detectEncoding(File testfile) {
    byte[] rawtext = new byte[(int) testfile.length()];
    FileInputStream chinesefile = null;
    try {
        chinesefile = new FileInputStream(testfile);
        int filled = 0;
        int count;
        // A single read() may deliver fewer bytes than requested, so keep reading until the buffer is full
        // or the stream ends.
        while (filled < rawtext.length
                && (count = chinesefile.read(rawtext, filled, rawtext.length - filled)) > 0) {
            filled += count;
        }
    } catch (Exception e) {
        System.err.println("Error: " + e);
    } finally {
        // Close the handle even when opening succeeded but reading failed.
        if (chinesefile != null) {
            try {
                chinesefile.close();
            } catch (Exception ignored) {
                // Best effort: a failed close does not change the bytes already read.
            }
        }
    }
    return detectEncoding(rawtext);
}
/**
 * Scores a byte array against every supported encoding and reports the best match.
 *
 * Each scorer contributes a value to a table indexed by the encoding constants; encodings without a scorer are
 * fixed at zero. The first index with the strictly highest score wins. When debug is enabled every score is
 * printed to System.err.
 *
 * @param rawtext bytes to classify
 * @return index of the winning encoding, or OTHER when no score is above 50
 */
public int detectEncoding(byte[] rawtext) {
    final int[] tally = new int[TOTALTYPES];
    // Fill the score table; assignment order is kept exactly as before.
    tally[GB2312] = gb2312_probability(rawtext);
    tally[GBK] = gbk_probability(rawtext);
    tally[GB18030] = gb18030_probability(rawtext);
    tally[HZ] = hz_probability(rawtext);
    tally[BIG5] = big5_probability(rawtext);
    tally[CNS11643] = euc_tw_probability(rawtext);
    tally[ISO2022CN] = iso_2022_cn_probability(rawtext);
    tally[UTF8] = utf8_probability(rawtext);
    tally[UNICODE] = utf16_probability(rawtext);
    tally[EUC_KR] = euc_kr_probability(rawtext);
    tally[CP949] = cp949_probability(rawtext);
    tally[JOHAB] = 0;
    tally[ISO2022KR] = iso_2022_kr_probability(rawtext);
    tally[ASCII] = ascii_probability(rawtext);
    tally[SJIS] = sjis_probability(rawtext);
    tally[EUC_JP] = euc_jp_probability(rawtext);
    tally[ISO2022JP] = iso_2022_jp_probability(rawtext);
    tally[UNICODET] = 0;
    tally[UNICODES] = 0;
    tally[ISO2022CN_GB] = 0;
    tally[ISO2022CN_CNS] = 0;
    tally[OTHER] = 0;

    // Pick the first candidate holding the strictly highest score.
    int winner = OTHER;
    int winningScore = 0;
    int candidate = 0;
    while (candidate < TOTALTYPES) {
        final int current = tally[candidate];
        if (debug) {
            System.err.println("Encoding " + nicename[candidate] + " score " + current);
        }
        if (current > winningScore) {
            winningScore = current;
            winner = candidate;
        }
        candidate++;
    }
    // Anything that did not score above 50 is reported as OTHER.
    return winningScore > 50 ? winner : OTHER;
}
/**
 * Scores how likely the bytes are GB 2312 text.
 *
 * Half of the score is the share of high-bit byte pairs that fall inside the GB 2312 lead/trail ranges
 * (0xA1-0xF7 / 0xA1-0xFE); the other half is the frequency weight collected from GBFreq relative to the maximum
 * of 500 per matched pair.
 *
 * @param rawtext bytes to score
 * @return sum of the range part and the frequency part, truncated to int
 */
int gb2312_probability(byte[] rawtext) {
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            // ASCII byte: not part of a double-byte unit.
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        highUnits++;
        if (lead >= 0xA1 && lead <= 0xF7 && trail >= 0xA1 && trail <= 0xFE) {
            final int row = lead - 0xA1;
            final int column = trail - 0xA1;
            matchedUnits++;
            attainable += 500;
            final int known = GBFreq[row][column];
            if (known != 0) {
                collected += known;
            } else if (row >= 15 && row < 55) {
                // No table entry, but the row lies in the GB high-frequency range.
                collected += 200;
            }
        }
        // A high-bit byte always consumes its follower, matched or not.
        pos += 2;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are GBK text.
 *
 * Pairs inside the original GB range are weighted with GBFreq (plus a flat 200 for rows 15-54 without a table
 * entry); pairs that only fit the extended GBK range (lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE) are weighted
 * with GBKFreq. The result is lowered by one so that plain GB text does not tie with the GB 2312 scorer.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int, minus 1
 */
int gbk_probability(byte[] rawtext) {
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        highUnits++;
        if (lead >= 0xA1 && lead <= 0xF7 && trail >= 0xA1 && trail <= 0xFE) {
            // Original GB range.
            final int row = lead - 0xA1;
            final int column = trail - 0xA1;
            matchedUnits++;
            attainable += 500;
            final int known = GBFreq[row][column];
            if (known != 0) {
                collected += known;
            } else if (row >= 15 && row < 55) {
                collected += 200;
            }
        } else if (lead >= 0x81 && lead <= 0xFE && trail >= 0x40 && trail <= 0xFE && trail != 0x7F) {
            // Extended GBK range; the column is the trail byte's offset from 0x40 in both trail sub-ranges.
            matchedUnits++;
            attainable += 500;
            collected += GBKFreq[lead - 0x81][trail - 0x40];
        }
        pos += 2;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    // Handicap by one point relative to the plain GB scorer.
    return (int) (rangeval + freqval) - 1;
}
/**
 * Scores how likely the bytes are GB 18030 text.
 *
 * Three shapes are recognised, tried in this order: a double-byte unit in the original GB range (weighted with
 * GBFreq), a double-byte unit in the extended GBK range (weighted with GBKFreq), and a four-byte unit of the
 * form lead 0x81-0xFE, digit 0x30-0x39, lead 0x81-0xFE, digit 0x30-0x39. Four-byte units count as in-range but
 * carry no frequency weight. A recognised four-byte unit is consumed as a whole so that its second half is not
 * counted again as an unmatched double-byte unit.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int, minus 1 (handicap against the plain GB scorer)
 */
int gb18030_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    int dbchars = 1, gbchars = 1;
    long gbfreq = 0, totalfreq = 1;
    int pos = 0;
    while (pos < rawtextlen - 1) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            // ASCII byte.
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        dbchars++;
        int consumed = 2;
        if (0xA1 <= lead && lead <= 0xF7 && 0xA1 <= trail && trail <= 0xFE) {
            // Original GB range.
            final int row = lead - 0xA1;
            final int column = trail - 0xA1;
            gbchars++;
            totalfreq += 500;
            if (GBFreq[row][column] != 0) {
                gbfreq += GBFreq[row][column];
            } else if (15 <= row && row < 55) {
                gbfreq += 200;
            }
        } else if (0x81 <= lead && lead <= 0xFE
                && ((0x80 <= trail && trail <= 0xFE) || (0x40 <= trail && trail <= 0x7E))) {
            // Extended (GBK) double-byte range.
            final int row = lead - 0x81;
            final int column = trail - 0x40;
            gbchars++;
            totalfreq += 500;
            if (GBKFreq[row][column] != 0) {
                gbfreq += GBKFreq[row][column];
            }
        } else if (0x81 <= lead && lead <= 0xFE && pos + 3 < rawtextlen
                && 0x30 <= trail && trail <= 0x39
                && 0x81 <= (rawtext[pos + 2] & 0xFF) && (rawtext[pos + 2] & 0xFF) <= 0xFE
                && 0x30 <= rawtext[pos + 3] && rawtext[pos + 3] <= 0x39) {
            // Four-byte unit: in range, no frequency data available. Skip all four bytes.
            gbchars++;
            consumed = 4;
        }
        pos += consumed;
    }
    final float rangeval = 50 * ((float) gbchars / (float) dbchars);
    final float freqval = 50 * ((float) gbfreq / (float) totalfreq);
    // For regular GB files this would give the same score as the GB scorer, so handicap it slightly.
    return (int) (rangeval + freqval) - 1;
}
/**
 * Scores how likely the bytes are HZ-encoded text.
 *
 * The range part depends only on how many "~{" shift-in sequences occur (more than 4: 50, more than 1: 41,
 * exactly 1: 39, none: 0). The frequency part is the GBFreq weight of the byte pairs found after a "~{" up to the
 * closing "~}" or the end of the line, relative to a maximum of 500 per recognised pair.
 *
 * Pairs are recognised either in 7-bit form (0x21-0x77) or in 8-bit form (0xA1-0xF7). Bytes are widened to
 * unsigned values before the 8-bit comparison, because a signed byte can never be >= 0xA1.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int hz_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    long hzfreq = 0, totalfreq = 1;
    int hzstart = 0;
    float rangeval;
    int row, column;
    // Stop one byte early: every '~' is interpreted together with its follower, so a trailing '~' has
    // nothing to pair with and must not be looked at.
    for (int i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] != '~') {
            continue;
        }
        final byte follower = rawtext[i + 1];
        if (follower == '{') {
            hzstart++;
            i += 2;
            while (i < rawtextlen - 1) {
                final int first = rawtext[i] & 0xFF;
                final int second = rawtext[i + 1] & 0xFF;
                if (first == 0x0A || first == 0x0D) {
                    // HZ mode does not continue past a line end.
                    break;
                } else if (first == '~' && second == '}') {
                    i++;
                    break;
                } else if (0x21 <= first && first <= 0x77 && 0x21 <= second && second <= 0x77) {
                    row = first - 0x21;
                    column = second - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        hzfreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        hzfreq += 200;
                    }
                } else if (0xA1 <= first && first <= 0xF7 && 0xA1 <= second && second <= 0xF7) {
                    row = first - 0xA1;
                    column = second - 0xA1;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        hzfreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        hzfreq += 200;
                    }
                }
                i += 2;
            }
        } else if (follower == '}' || follower == '~') {
            // Stray shift-out or escaped tilde: skip the follower.
            i++;
        }
    }
    if (hzstart > 4) {
        rangeval = 50;
    } else if (hzstart > 1) {
        rangeval = 41;
    } else if (hzstart > 0) {
        // Only 39 in case the sequence happened to occur in otherwise non-HZ text.
        rangeval = 39;
    } else {
        rangeval = 0;
    }
    final float freqval = 50 * ((float) hzfreq / (float) totalfreq);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are Big5 text.
 *
 * A high-bit byte in 0xA1-0xF9 followed by a byte in 0x40-0x7E or 0xA1-0xFE counts as a Big5 character. Each
 * such pair adds its Big5Freq weight, or a flat 200 when the table has no entry and the row is between 3 and 37
 * inclusive.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int big5_probability(byte[] rawtext) {
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        highUnits++;
        final boolean lowTrail = trail >= 0x40 && trail <= 0x7E;
        final boolean highTrail = trail >= 0xA1 && trail <= 0xFE;
        if (lead >= 0xA1 && lead <= 0xF9 && (lowTrail || highTrail)) {
            final int row = lead - 0xA1;
            // Low trail bytes are offset from 0x40, high trail bytes from 0x61.
            final int column = lowTrail ? trail - 0x40 : trail - 0x61;
            matchedUnits++;
            attainable += 500;
            final int known = Big5Freq[row][column];
            if (known != 0) {
                collected += known;
            } else if (row >= 3 && row <= 37) {
                collected += 200;
            }
        }
        pos += 2;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are Big5+ text.
 *
 * Bytes are widened to unsigned values first: Java bytes are signed, so a direct comparison with 128, 0x81 or
 * 0xA1 can never succeed. A byte below 0x80 is ASCII and skipped. A high-bit byte always consumes its follower;
 * the pair is weighted with Big5Freq when it lies in the original Big5 range (lead 0xA1-0xF9, trail 0x40-0x7E or
 * 0xA1-0xFE), otherwise with Big5PFreq when it lies in the extended range (lead 0x81-0xFE, trail 0x40-0x7E or
 * 0x80-0xFE).
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int, minus 1 (handicap against the plain Big5 scorer)
 */
int big5plus_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    int dbchars = 1, bfchars = 1;
    long bffreq = 0, totalfreq = 1;
    int row, column;
    int i = 0;
    while (i < rawtextlen - 1) {
        final int lead = rawtext[i] & 0xFF;
        if (lead < 0x80) {
            // ASCII byte.
            i++;
            continue;
        }
        final int trail = rawtext[i + 1] & 0xFF;
        final boolean lowTrail = 0x40 <= trail && trail <= 0x7E;
        dbchars++;
        if (0xA1 <= lead && lead <= 0xF9 && (lowTrail || (0xA1 <= trail && trail <= 0xFE))) {
            // Original Big5 range.
            bfchars++;
            totalfreq += 500;
            row = lead - 0xA1;
            column = lowTrail ? trail - 0x40 : trail - 0x61;
            if (Big5Freq[row][column] != 0) {
                bffreq += Big5Freq[row][column];
            } else if (3 <= row && row < 37) {
                bffreq += 200;
            }
        } else if (0x81 <= lead && lead <= 0xFE && (lowTrail || (0x80 <= trail && trail <= 0xFE))) {
            // Extended Big5+ range; the column is the trail byte's offset from 0x40 in both sub-ranges.
            bfchars++;
            totalfreq += 500;
            row = lead - 0x81;
            column = trail - 0x40;
            if (Big5PFreq[row][column] != 0) {
                bffreq += Big5PFreq[row][column];
            }
        }
        i += 2;
    }
    final float rangeval = 50 * ((float) bfchars / (float) dbchars);
    final float freqval = 50 * ((float) bffreq / (float) totalfreq);
    // For regular Big5 files this would give the same score as the Big5 scorer, so handicap it slightly.
    return (int) (rangeval + freqval) - 1;
}
/**
 * Scores how likely the bytes are EUC-TW (CNS 11643) text.
 *
 * Two shapes are recognised: a four-byte unit starting with 0x8E (second byte 0xA1-0xB0, third and fourth byte
 * 0xA1-0xFE), which counts as in-range without any frequency weight, and a two-byte unit with both bytes in
 * 0xA1-0xFE, which is weighted with EUC_TWFreq or a flat 150 for rows 35-92 without a table entry. A high-bit
 * byte matching neither shape is counted and skipped on its own.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int euc_tw_probability(byte[] rawtext) {
    final int total = rawtext.length;
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    int pos = 0;
    while (pos < total - 1) {
        final int lead = rawtext[pos] & 0xFF;
        int step = 1;
        if (lead >= 0x80) {
            highUnits++;
            final int second = rawtext[pos + 1] & 0xFF;
            boolean fourByte = false;
            if (pos + 3 < total && lead == 0x8E && second >= 0xA1 && second <= 0xB0) {
                final int third = rawtext[pos + 2] & 0xFF;
                final int fourth = rawtext[pos + 3] & 0xFF;
                fourByte = third >= 0xA1 && third <= 0xFE && fourth >= 0xA1 && fourth <= 0xFE;
            }
            if (fourByte) {
                // Four-byte form: rarely used characters, so frequency is ignored.
                matchedUnits++;
                step = 4;
            } else if (lead >= 0xA1 && lead <= 0xFE && second >= 0xA1 && second <= 0xFE) {
                // Two-byte form.
                final int row = lead - 0xA1;
                final int column = second - 0xA1;
                matchedUnits++;
                attainable += 500;
                final int known = EUC_TWFreq[row][column];
                if (known != 0) {
                    collected += known;
                } else if (row >= 35 && row <= 92) {
                    collected += 150;
                }
                step = 2;
            }
        }
        pos += step;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are ISO-2022-CN text. Works for basic cases only.
 *
 * After an "ESC $ ) A" designator the following bytes are read as GB pairs (0x21-0x77, weighted with GBFreq);
 * after "ESC $ ) G" they are read as CNS pairs (0x21-0x7E, weighted with EUC_TWFreq). A run ends at the next ESC
 * byte or at the end of the input; a following "ESC ( B" is skipped. Every position visited inside a run counts
 * towards the total, every recognised pair towards the in-range count.
 *
 * The run loops are bounded by the array length, so input that ends inside a run (no closing ESC) is scored
 * instead of raising ArrayIndexOutOfBoundsException.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int iso_2022_cn_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    int dbchars = 1, isochars = 1;
    long isofreq = 0, totalfreq = 1;
    int row, column;
    for (int i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] != (byte) 0x1B || i + 3 >= rawtextlen) {
            // Not an escape, or too close to the end to hold a designator.
            continue;
        }
        final boolean designator = rawtext[i + 1] == (byte) 0x24 && rawtext[i + 2] == (byte) 0x29;
        if (designator && rawtext[i + 3] == (byte) 0x41) {
            // GB escape: ESC $ ) A
            i += 4;
            while (i < rawtextlen && rawtext[i] != (byte) 0x1B) {
                dbchars++;
                if (i + 1 < rawtextlen && 0x21 <= rawtext[i] && rawtext[i] <= 0x77
                        && 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77) {
                    isochars++;
                    row = rawtext[i] - 0x21;
                    column = rawtext[i + 1] - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        isofreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        isofreq += 200;
                    }
                    i++;
                }
                i++;
            }
        } else if (designator && rawtext[i + 3] == (byte) 0x47) {
            // CNS escape: ESC $ ) G
            i += 4;
            while (i < rawtextlen && rawtext[i] != (byte) 0x1B) {
                dbchars++;
                if (i + 1 < rawtextlen && 0x21 <= rawtext[i] && rawtext[i] <= 0x7E
                        && 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {
                    isochars++;
                    totalfreq += 500;
                    row = rawtext[i] - 0x21;
                    column = rawtext[i + 1] - 0x21;
                    if (EUC_TWFreq[row][column] != 0) {
                        isofreq += EUC_TWFreq[row][column];
                    } else if (35 <= row && row <= 92) {
                        isofreq += 150;
                    }
                    i++;
                }
                i++;
            }
        }
        // Return to ASCII: ESC ( B
        if (i + 2 < rawtextlen && rawtext[i] == (byte) 0x1B && rawtext[i + 1] == (byte) 0x28
                && rawtext[i + 2] == (byte) 0x42) {
            i += 2;
        }
    }
    final float rangeval = 50 * ((float) isochars / (float) dbchars);
    final float freqval = 50 * ((float) isofreq / (float) totalfreq);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are UTF-8 text.
 *
 * ASCII bytes are ignored. Among the remaining bytes, those belonging to a well-formed multi-byte sequence are
 * counted as good: a lead byte 0xC0-0xDF with one continuation byte, 0xE0-0xEF with two, or 0xF0-0xF4 with three
 * (continuation bytes are 0x80-0xBF). The score is the percentage of non-ASCII bytes that are good.
 *
 * To avoid coincidental matches the score is only reported when it is above 98, or above 95 with more than 30
 * good bytes; otherwise 0 is returned. Input without any non-ASCII byte (including empty input) scores 0.
 *
 * @param rawtext bytes to score
 * @return the percentage described above, or 0 when it is below the reporting threshold
 */
int utf8_probability(byte[] rawtext) {
    final int rawtextlen = rawtext.length;
    int goodbytes = 0, asciibytes = 0;
    for (int i = 0; i < rawtextlen; i++) {
        final int lead = rawtext[i] & 0xFF;
        if (lead < 0x80) {
            // One-byte (ASCII) character: ignored, it would throw off the count.
            asciibytes++;
            continue;
        }
        final int trailing;
        if (0xC0 <= lead && lead <= 0xDF) {
            trailing = 1;
        } else if (0xE0 <= lead && lead <= 0xEF) {
            trailing = 2;
        } else if (0xF0 <= lead && lead <= 0xF4) {
            // Four-byte sequences encode code points above U+FFFF.
            trailing = 3;
        } else {
            // Stray continuation byte or invalid lead: counts as a bad byte.
            continue;
        }
        boolean wellFormed = i + trailing < rawtextlen;
        for (int k = 1; wellFormed && k <= trailing; k++) {
            wellFormed = (rawtext[i + k] & 0xC0) == 0x80;
        }
        if (wellFormed) {
            goodbytes += trailing + 1;
            i += trailing;
        }
    }
    if (asciibytes == rawtextlen) {
        return 0;
    }
    final int score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
    // Allow a few badly formed sequences, but only when there is enough evidence.
    if (score > 98) {
        return score;
    } else if (score > 95 && goodbytes > 30) {
        return score;
    } else {
        return 0;
    }
}
/**
 * Scores how likely the bytes are UTF-16 text, judged solely by a leading byte order mark.
 *
 * Not very general: text without a byte order mark is never recognised.
 *
 * @param rawtext bytes to score
 * @return 100 if the input starts with FE FF (big-endian) or FF FE (little-endian), otherwise 0; input shorter
 *         than two bytes always scores 0
 */
int utf16_probability(byte[] rawtext) {
    // Both mark checks need two bytes; guard them together so short input cannot index out of bounds.
    if (rawtext.length < 2) {
        return 0;
    }
    final boolean bigEndian = (byte) 0xFE == rawtext[0] && (byte) 0xFF == rawtext[1];
    final boolean littleEndian = (byte) 0xFF == rawtext[0] && (byte) 0xFE == rawtext[1];
    return (bigEndian || littleEndian) ? 100 : 0;
}
/**
 * Scores how likely the bytes are plain ASCII.
 *
 * Starts at 75 and loses 5 points for every byte with the high bit set and for every ESC (0x1B) byte, which
 * ISO 2022 encodings use. Scanning stops as soon as the score is used up.
 *
 * @param rawtext bytes to score
 * @return remaining score, from 0 to 75
 */
int ascii_probability(byte[] rawtext) {
    int remaining = 75;
    for (byte current : rawtext) {
        final boolean suspicious = current < 0 || current == (byte) 0x1B;
        if (!suspicious) {
            continue;
        }
        remaining -= 5;
        if (remaining <= 0) {
            return 0;
        }
    }
    return remaining;
}
/**
 * Scores how likely the bytes are EUC-KR text.
 *
 * A high-bit byte pair with both bytes in 0xA1-0xFE counts as an EUC-KR character and contributes its KRFreq
 * weight; pairs without a table entry contribute nothing.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int euc_kr_probability(byte[] rawtext) {
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        highUnits++;
        if (lead >= 0xA1 && lead <= 0xFE && trail >= 0xA1 && trail <= 0xFE) {
            matchedUnits++;
            attainable += 500;
            // A missing table entry is zero, so adding it unconditionally changes nothing.
            collected += KRFreq[lead - 0xA1][trail - 0xA1];
        }
        pos += 2;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are CP949 text.
 *
 * A lead byte in 0x81-0xFE followed by a trail byte in 0x41-0x5A, 0x61-0x7A or 0x81-0xFE counts as a CP949
 * character. Only pairs that also lie in the 0xA1-0xFE / 0xA1-0xFE sub-range contribute a KRFreq weight; every
 * recognised pair raises the attainable maximum by 500.
 *
 * @param rawtext bytes to score
 * @return range part plus frequency part, truncated to int
 */
int cp949_probability(byte[] rawtext) {
    int highUnits = 1;
    int matchedUnits = 1;
    long collected = 0;
    long attainable = 1;
    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        final int trail = rawtext[pos + 1] & 0xFF;
        highUnits++;
        final boolean leadOk = lead >= 0x81 && lead <= 0xFE;
        final boolean trailOk = (trail >= 0x41 && trail <= 0x5A)
                || (trail >= 0x61 && trail <= 0x7A)
                || (trail >= 0x81 && trail <= 0xFE);
        if (leadOk && trailOk) {
            matchedUnits++;
            attainable += 500;
            if (lead >= 0xA1 && trail >= 0xA1 && trail <= 0xFE) {
                // Inside the sub-range covered by the Korean frequency table.
                collected += KRFreq[lead - 0xA1][trail - 0xA1];
            }
        }
        pos += 2;
    }
    final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
    final float freqval = 50 * ((float) collected / (float) attainable);
    return (int) (rangeval + freqval);
}
/**
 * Scores how likely the bytes are ISO-2022-KR text by looking for the four-byte sequence ESC $ ) C.
 *
 * @param rawtext bytes to score
 * @return 100 if the sequence occurs anywhere in the input, otherwise 0
 */
int iso_2022_kr_probability(byte[] rawtext) {
    // Last index at which a complete four-byte sequence can still start.
    final int lastStart = rawtext.length - 4;
    for (int start = 0; start <= lastStart; start++) {
        if (rawtext[start] != 0x1b) {
            continue;
        }
        if (rawtext[start + 1] == '$' && rawtext[start + 2] == ')' && rawtext[start + 3] == 'C') {
            return 100;
        }
    }
    return 0;
}
/*
* Function: euc_jp_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text
* in array uses EUC-JP encoding