java获取文件编码

这是一个Java工具类,用于检测文件的编码类型。通过分析字节流中的字符分布,该工具可以判断文件是GB2312、GBK、BIG5、EUC-TW等编码之一。工具包括对不同编码的概率计算,最终返回最可能的编码类型。
摘要由CSDN通过智能技术生成
java获取文件编码

工具类一:

package www.autocheck.email;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;

public class ExchangeUtil extends Encoding {  
    // Frequency tables to hold the GB, GBK, Big5, Big5+, EUC-TW, KR and JP character
    // frequencies. The constructor allocates each table; the *_probability methods
    // index them as [row][column], where row and column are the lead and trail byte
    // values offset from the start of the range being tested.
    // GB2312 table, allocated 94 x 94 by the constructor.
    int GBFreq[][];  

    // GBK extension table, allocated 126 x 191 by the constructor.
    int GBKFreq[][];  

    // Big5 table, allocated 94 x 158 by the constructor.
    int Big5Freq[][];  

    // Big5+ extension table, allocated 126 x 191 by the constructor.
    int Big5PFreq[][];  

    // EUC-TW (CNS 11643) table, allocated 94 x 94 by the constructor.
    int EUC_TWFreq[][];  

    // Korean table, allocated 94 x 94; read by euc_kr_probability and cp949_probability.
    int KRFreq[][];  

    // Japanese table, allocated 94 x 94; no reader is visible in this part of the file.
    int JPFreq[][];  

    // int UnicodeFreq[94][128];  
    // public static String[] nicename;  
    // public static String[] codings;  
    // When true, detectEncoding(byte[]) prints every encoding's score to System.err.
    public boolean debug;  

    /**
     * Creates a detector with debug output switched off, allocates every frequency table and then fills
     * the tables by calling {@code initialize_frequencies()}.
     */
    public ExchangeUtil() {
      super();
      // 94 x 94 tables.
      this.GBFreq = new int[94][94];
      this.EUC_TWFreq = new int[94][94];
      this.KRFreq = new int[94][94];
      this.JPFreq = new int[94][94];
      // Tables with wider row and/or column ranges.
      this.Big5Freq = new int[94][158];
      this.GBKFreq = new int[126][191];
      this.Big5PFreq = new int[126][191];
      this.debug = false;
      // Populate the tables only after all of them exist.
      initialize_frequencies();
    }

//    public static void main(String argc[]) {  
//      ExchangeUtil sinodetector;  
//      int result = OTHER;  
//      int i;  
//      sinodetector = new ExchangeUtil();  
//      for (i = 0; i < argc.length; i++) {  
//        if (argc[i].startsWith("http://") == true) {  
//          try {  
//            result = sinodetector.detectEncoding(new URL(argc[i]));  
//          } catch (Exception e) {  
//            System.err.println("Bad URL " + e.toString());  
//          }  
//        } else if (argc[i].equals("-d")) {  
//          sinodetector.debug = true;  
//          continue;  
//        } else {  
//          result = sinodetector.detectEncoding(new File(argc[i]));  
//        }  
//        System.out.println(nicename[result]);  
//      }  
//    }  

    /**
     * Detects the encoding of the content served by a URL.
     *
     * <p>At most the first 10000 bytes of the stream are read into a fixed-size buffer; the whole buffer
     * (zero-padded when the content is shorter) is then scored by {@link #detectEncoding(byte[])}. The stream
     * is always closed, including when opening succeeded but reading failed.
     *
     * @param testurl URL whose content is examined
     * @return the encoding index chosen by {@link #detectEncoding(byte[])}, or -1 when the URL could not be
     *         opened or read (the error is reported on {@code System.err})
     */
    public int detectEncoding(URL testurl) {
      byte[] rawtext = new byte[10000];
      int bytesread = 0, byteoffset = 0;
      int guess = OTHER;
      InputStream chinesestream = null;
      try {
        chinesestream = testurl.openStream();
        // Stops at end of stream, or when the buffer is full (a zero-length read returns 0).
        while ((bytesread = chinesestream.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
          byteoffset += bytesread;
        }
        guess = detectEncoding(rawtext);
      } catch (Exception e) {
        System.err.println("Error loading or using URL " + e.toString());
        guess = -1;
      } finally {
        // Release the connection on the failure path as well as on success.
        if (chinesestream != null) {
          try {
            chinesestream.close();
          } catch (Exception ignored) {
            // Nothing useful can be done if close fails; the guess is already decided.
          }
        }
      }
      return guess;
    }

    /**
     * Detects the encoding of a file.
     *
     * <p>The whole file is loaded into a buffer sized from {@code testfile.length()} and scored by
     * {@link #detectEncoding(byte[])}. Reading is best-effort: if the file cannot be opened or read, the error
     * is reported on {@code System.err} and whatever was read so far (the rest of the buffer stays zero) is
     * still scored.
     *
     * @param testfile file to examine
     * @return the encoding index chosen by {@link #detectEncoding(byte[])}
     */
    public int detectEncoding(File testfile) {
      byte[] rawtext = new byte[(int) testfile.length()];
      FileInputStream chinesefile = null;
      try {
        chinesefile = new FileInputStream(testfile);
        // A single read() may return fewer bytes than requested, so keep reading until the
        // buffer is full or the stream ends.
        int offset = 0;
        int count;
        while (offset < rawtext.length
            && (count = chinesefile.read(rawtext, offset, rawtext.length - offset)) > 0) {
          offset += count;
        }
      } catch (Exception e) {
        System.err.println("Error: " + e);
      } finally {
        // Close the file even when reading failed.
        if (chinesefile != null) {
          try {
            chinesefile.close();
          } catch (Exception ignored) {
            // Nothing useful can be done if close fails.
          }
        }
      }
      return detectEncoding(rawtext);
    }

    /**
     * Scores a byte array against every supported encoding and returns the best match.
     *
     * <p>Each scorer yields a probability-style score; the first encoding with the strictly highest score
     * wins. When the winning score is not greater than 50 the result is {@code OTHER}. With {@code debug}
     * enabled, every score is printed to {@code System.err}.
     *
     * @param rawtext bytes to examine
     * @return index of the winning encoding, or {@code OTHER}
     */
    public int detectEncoding(byte[] rawtext) {
      final int[] tally = new int[TOTALTYPES];

      // Collect one score per encoding type.
      tally[GB2312] = gb2312_probability(rawtext);
      tally[GBK] = gbk_probability(rawtext);
      tally[GB18030] = gb18030_probability(rawtext);
      tally[HZ] = hz_probability(rawtext);
      tally[BIG5] = big5_probability(rawtext);
      tally[CNS11643] = euc_tw_probability(rawtext);
      tally[ISO2022CN] = iso_2022_cn_probability(rawtext);
      tally[UTF8] = utf8_probability(rawtext);
      tally[UNICODE] = utf16_probability(rawtext);
      tally[EUC_KR] = euc_kr_probability(rawtext);
      tally[CP949] = cp949_probability(rawtext);
      tally[JOHAB] = 0;
      tally[ISO2022KR] = iso_2022_kr_probability(rawtext);
      tally[ASCII] = ascii_probability(rawtext);
      tally[SJIS] = sjis_probability(rawtext);
      tally[EUC_JP] = euc_jp_probability(rawtext);
      tally[ISO2022JP] = iso_2022_jp_probability(rawtext);
      // These types have no scorer and are pinned to zero.
      tally[UNICODET] = 0;
      tally[UNICODES] = 0;
      tally[ISO2022CN_GB] = 0;
      tally[ISO2022CN_CNS] = 0;
      tally[OTHER] = 0;

      // Pick the first type holding the strictly largest score.
      int winner = OTHER;
      int top = 0;
      for (int type = 0; type < TOTALTYPES; type++) {
        if (debug) {
          System.err.println("Encoding " + nicename[type] + " score " + tally[type]);
        }
        if (tally[type] > top) {
          top = tally[type];
          winner = type;
        }
      }

      // A best score of 50 or less is not convincing enough to name an encoding.
      return top > 50 ? winner : OTHER;
    }

    /**
     * Scores how likely the bytes are GB2312 text.
     *
     * <p>Half of the score is the share of high-bit byte pairs that fall in the GB2312 range (lead
     * 0xA1..0xF7, trail 0xA1..0xFE); the other half weighs those pairs by {@code GBFreq}, with a flat 200 for
     * unlisted pairs whose row is in 15..54.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int gb2312_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int last = rawtext.length - 1;

      int pos = 0;
      while (pos < last) {
        final byte lead = rawtext[pos];
        if (lead >= 0) {
          // 7-bit byte: not part of a double-byte character.
          pos++;
          continue;
        }
        highUnits++;
        final int hi = lead & 0xFF;
        final int lo = rawtext[pos + 1] & 0xFF;
        if (hi >= 0xA1 && hi <= 0xF7 && lo >= 0xA1 && lo <= 0xFE) {
          matchedUnits++;
          weightCeiling += 500;
          final int row = hi - 0xA1;
          final int freq = GBFreq[row][lo - 0xA1];
          if (freq != 0) {
            weight += freq;
          } else if (row >= 15 && row < 55) {
            // In GB high-freq character range
            weight += 200;
          }
        }
        // A high-bit byte always consumes its neighbour as well.
        pos += 2;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are GBK text.
     *
     * <p>Pairs in the original GB2312 range are weighed with {@code GBFreq}; pairs only valid in the GBK
     * extension (lead 0x81..0xFE, trail 0x40..0x7E or 0x80..0xFE) are weighed with {@code GBKFreq}. One point
     * is subtracted at the end so plain GB2312 text does not tie with GBK.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score, minus one
     */
    int gbk_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int last = rawtext.length - 1;

      int pos = 0;
      while (pos < last) {
        final byte lead = rawtext[pos];
        if (lead >= 0) {
          pos++;
          continue;
        }
        highUnits++;
        final int hi = lead & 0xFF;
        final int lo = rawtext[pos + 1] & 0xFF;
        final boolean gbPair = hi >= 0xA1 && hi <= 0xF7 && lo >= 0xA1 && lo <= 0xFE;
        if (gbPair) {
          // Original GB range
          matchedUnits++;
          weightCeiling += 500;
          final int row = hi - 0xA1;
          final int freq = GBFreq[row][lo - 0xA1];
          if (freq != 0) {
            weight += freq;
          } else if (row >= 15 && row < 55) {
            weight += 200;
          }
        } else if (hi >= 0x81 && hi <= 0xFE
            && ((lo >= 0x80 && lo <= 0xFE) || (lo >= 0x40 && lo <= 0x7E))) {
          // Extended GB range
          matchedUnits++;
          weightCeiling += 500;
          final int freq = GBKFreq[hi - 0x81][lo - 0x40];
          if (freq != 0) {
            weight += freq;
          }
        }
        // A high-bit byte always consumes its neighbour as well.
        pos += 2;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      // For regular GB files, this would give the same score, so it is handicapped slightly
      return (int) (rangeval + freqval) - 1;
    }

    /**
     * Scores how likely the bytes are GB18030 text.
     *
     * <p>Two-byte pairs are handled as in GBK: pairs in the original GB2312 range are weighed with
     * {@code GBFreq}, pairs only valid in the extension with {@code GBKFreq}. Four-byte sequences (lead
     * 0x81..0xFE, digit 0x30..0x39, 0x81..0xFE, digit 0x30..0x39) count as in-range characters but carry no
     * frequency weight. One point is subtracted at the end so plain GB2312 text does not tie.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score, minus one
     */
    int gb18030_probability(byte[] rawtext) {
      int i, rawtextlen = 0;
      int dbchars = 1, gbchars = 1;
      long gbfreq = 0, totalfreq = 1;
      float rangeval = 0, freqval = 0;
      int row, column;
      // Stage 1: Check to see if characters fit into acceptable ranges
      rawtextlen = rawtext.length;
      for (i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] >= 0) {
          // 7-bit byte: not the start of a multi-byte character.
          continue;
        }
        dbchars++;
        int lead = rawtext[i] & 0xFF;
        int trail = rawtext[i + 1] & 0xFF;
        if (0xA1 <= lead && lead <= 0xF7 && 0xA1 <= trail && trail <= 0xFE) {
          // Original GB range
          gbchars++;
          totalfreq += 500;
          row = lead - 0xA1;
          column = trail - 0xA1;
          if (GBFreq[row][column] != 0) {
            gbfreq += GBFreq[row][column];
          } else if (15 <= row && row < 55) {
            gbfreq += 200;
          }
        } else if (0x81 <= lead && lead <= 0xFE
            && ((0x80 <= trail && trail <= 0xFE) || (0x40 <= trail && trail <= 0x7E))) {
          // Extended GB range (two bytes)
          gbchars++;
          totalfreq += 500;
          row = lead - 0x81;
          column = trail - 0x40;
          if (GBKFreq[row][column] != 0) {
            gbfreq += GBKFreq[row][column];
          }
        } else if (0x81 <= lead && lead <= 0xFE && i + 3 < rawtextlen
            && 0x30 <= trail && trail <= 0x39
            && 0x81 <= (rawtext[i + 2] & 0xFF) && (rawtext[i + 2] & 0xFF) <= 0xFE
            && 0x30 <= rawtext[i + 3] && rawtext[i + 3] <= 0x39) {
          // Four-byte sequence: no frequency data is available for these characters.
          gbchars++;
          // Step over bytes three and four too; otherwise the second half of the sequence
          // would be re-scanned as an unmatched unit and drag the range score down.
          i += 2;
        }
        // Skip the trail byte of the unit just examined.
        i++;
      }
      rangeval = 50 * ((float) gbchars / (float) dbchars);
      freqval = 50 * ((float) gbfreq / (float) totalfreq);
      // For regular GB files, this would give the same score, so it is handicapped slightly
      return (int) (rangeval + freqval) - 1;
    }

    /**
     * Scores how likely the bytes are HZ-encoded text.
     *
     * <p>The range half of the score depends only on how many {@code ~{} shift-in sequences occur (more than
     * four: 50, more than one: 41, exactly one: 39, none: 0). The frequency half weighs the byte pairs found
     * after each {@code ~{} up to the closing {@code ~}} or end of line, using {@code GBFreq}.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int hz_probability(byte[] rawtext) {
      int i, rawtextlen;
      long hzfreq = 0, totalfreq = 1;
      float rangeval = 0, freqval = 0;
      int hzstart = 0;
      int row, column;
      rawtextlen = rawtext.length;
      for (i = 0; i < rawtextlen; i++) {
        // A '~' that is the very last byte cannot start an escape; without the bounds
        // check the look-ahead below would read past the end of the array.
        if (rawtext[i] == '~' && i + 1 < rawtextlen) {
          if (rawtext[i + 1] == '{') {
            hzstart++;
            i += 2;
            while (i < rawtextlen - 1) {
              if (rawtext[i] == 0x0A || rawtext[i] == 0x0D) {
                // A line break ends the GB section.
                break;
              } else if (rawtext[i] == '~' && rawtext[i + 1] == '}') {
                i++;
                break;
              } else if ((0x21 <= rawtext[i] && rawtext[i] <= 0x77) && (0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77)) {
                // 7-bit GB pair.
                row = rawtext[i] - 0x21;
                column = rawtext[i + 1] - 0x21;
                totalfreq += 500;
                if (GBFreq[row][column] != 0) {
                  hzfreq += GBFreq[row][column];
                } else if (15 <= row && row < 55) {
                  hzfreq += 200;
                }
              } else {
                // 8-bit GB pair. Bytes are signed in Java, so they must be widened to
                // unsigned before being compared with 0xA1..0xF7.
                int hi = rawtext[i] & 0xFF;
                int lo = rawtext[i + 1] & 0xFF;
                if (0xA1 <= hi && hi <= 0xF7 && 0xA1 <= lo && lo <= 0xF7) {
                  row = hi - 0xA1;
                  column = lo - 0xA1;
                  totalfreq += 500;
                  if (GBFreq[row][column] != 0) {
                    hzfreq += GBFreq[row][column];
                  } else if (15 <= row && row < 55) {
                    hzfreq += 200;
                  }
                }
              }
              i += 2;
            }
          } else if (rawtext[i + 1] == '}' || rawtext[i + 1] == '~') {
            // Stray shift-out or escaped tilde: skip its second byte.
            i++;
          }
        }
      }
      if (hzstart > 4) {
        rangeval = 50;
      } else if (hzstart > 1) {
        rangeval = 41;
      } else if (hzstart > 0) { // Only 39 in case the sequence happened to occur
        rangeval = 39; // in otherwise non-Hz text
      } else {
        rangeval = 0;
      }
      freqval = 50 * ((float) hzfreq / (float) totalfreq);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are Big5 text.
     *
     * <p>Half of the score is the share of high-bit byte pairs in the Big5 range (lead 0xA1..0xF9, trail
     * 0x40..0x7E or 0xA1..0xFE); the other half weighs those pairs by {@code Big5Freq}, with a flat 200 for
     * unlisted pairs whose row is in 3..37.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int big5_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int last = rawtext.length - 1;

      int pos = 0;
      while (pos < last) {
        final byte lead = rawtext[pos];
        if (lead >= 0) {
          pos++;
          continue;
        }
        highUnits++;
        final int hi = lead & 0xFF;
        final int lo = rawtext[pos + 1] & 0xFF;
        final boolean lowTrail = lo >= 0x40 && lo <= 0x7E;
        final boolean highTrail = lo >= 0xA1 && lo <= 0xFE;
        if (hi >= 0xA1 && hi <= 0xF9 && (lowTrail || highTrail)) {
          matchedUnits++;
          weightCeiling += 500;
          final int row = hi - 0xA1;
          // The two trail ranges map onto separate stretches of the column axis.
          final int column = lowTrail ? lo - 0x40 : lo - 0x61;
          final int freq = Big5Freq[row][column];
          if (freq != 0) {
            weight += freq;
          } else if (row >= 3 && row <= 37) {
            weight += 200;
          }
        }
        // A high-bit byte always consumes its neighbour as well.
        pos += 2;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are Big5+ text.
     *
     * <p>Pairs in the original Big5 range (lead 0xA1..0xF9, trail 0x40..0x7E or 0xA1..0xFE) are weighed with
     * {@code Big5Freq}; pairs only valid in the extension (lead 0x81..0xFE, trail 0x40..0x7E or 0x80..0xFE)
     * are weighed with {@code Big5PFreq}. One point is subtracted at the end so plain Big5 text does not tie.
     *
     * <p>Java bytes are signed, so every byte is widened to its unsigned value before the range tests;
     * comparing the raw byte with values of 128 and above can never succeed.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score, minus one
     */
    int big5plus_probability(byte[] rawtext) {
      int i, rawtextlen = 0;
      int dbchars = 1, bfchars = 1;
      long bffreq = 0, totalfreq = 1;
      float rangeval = 0, freqval = 0;
      int row, column;
      // Stage 1: Check to see if characters fit into acceptable ranges
      rawtextlen = rawtext.length;
      for (i = 0; i < rawtextlen - 1; i++) {
        int lead = rawtext[i] & 0xFF;
        if (lead < 0x80) {
          // 7-bit byte: not the start of a double-byte character.
          continue;
        }
        dbchars++;
        int trail = rawtext[i + 1] & 0xFF;
        boolean lowTrail = 0x40 <= trail && trail <= 0x7E;
        if (0xA1 <= lead && lead <= 0xF9 // Original Big5 range
            && (lowTrail || (0xA1 <= trail && trail <= 0xFE))) {
          bfchars++;
          totalfreq += 500;
          row = lead - 0xA1;
          column = lowTrail ? trail - 0x40 : trail - 0x61;
          if (Big5Freq[row][column] != 0) {
            bffreq += Big5Freq[row][column];
          } else if (3 <= row && row < 37) {
            bffreq += 200;
          }
        } else if (0x81 <= lead && lead <= 0xFE // Extended Big5 range
            && (lowTrail || (0x80 <= trail && trail <= 0xFE))) {
          bfchars++;
          totalfreq += 500;
          row = lead - 0x81;
          column = trail - 0x40;
          if (Big5PFreq[row][column] != 0) {
            bffreq += Big5PFreq[row][column];
          }
        }
        // Skip the trail byte of the unit just examined.
        i++;
      }
      rangeval = 50 * ((float) bfchars / (float) dbchars);
      freqval = 50 * ((float) bffreq / (float) totalfreq);
      // For regular Big5 files, this would give the same score, so it is handicapped slightly
      return (int) (rangeval + freqval) - 1;
    }

    /**
     * Scores how likely the bytes are EUC-TW (CNS 11643) text.
     *
     * <p>Four-byte sequences introduced by 0x8E count as in-range characters without frequency weight.
     * Two-byte pairs with both bytes in 0xA1..0xFE are weighed by {@code EUC_TWFreq}, with a flat 150 for
     * unlisted pairs whose row is in 35..92. A high-bit byte that matches neither form advances the scan by
     * a single byte.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int euc_tw_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int length = rawtext.length;

      int pos = 0;
      while (pos < length - 1) {
        int step = 1;
        if (rawtext[pos] < 0) { // high bit set
          highUnits++;
          final int hi = rawtext[pos] & 0xFF;
          final int next = rawtext[pos + 1] & 0xFF;
          if (pos + 3 < length && hi == 0x8E && next >= 0xA1 && next <= 0xB0
              && (rawtext[pos + 2] & 0xFF) >= 0xA1 && (rawtext[pos + 2] & 0xFF) <= 0xFE
              && (rawtext[pos + 3] & 0xFF) >= 0xA1 && (rawtext[pos + 3] & 0xFF) <= 0xFE) {
            // Planes 1 - 16: these are all less frequent chars so just ignore freq
            matchedUnits++;
            step = 4;
          } else if (hi >= 0xA1 && hi <= 0xFE && next >= 0xA1 && next <= 0xFE) {
            // Plane 1
            matchedUnits++;
            weightCeiling += 500;
            final int row = hi - 0xA1;
            final int freq = EUC_TWFreq[row][next - 0xA1];
            if (freq != 0) {
              weight += freq;
            } else if (row >= 35 && row <= 92) {
              weight += 150;
            }
            step = 2;
          }
        }
        pos += step;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are ISO 2022-CN text. Works for basic cases only.
     *
     * <p>After an {@code ESC $ ) A} (GB) or {@code ESC $ ) G} (CNS) designation, the following bytes up to
     * the next ESC or the end of the input are examined as 7-bit pairs and weighed with {@code GBFreq} or
     * {@code EUC_TWFreq} respectively. An {@code ESC ( B} found right after such a run is skipped.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int iso_2022_cn_probability(byte[] rawtext) {
      int i, rawtextlen = 0;
      int dbchars = 1, isochars = 1;
      long isofreq = 0, totalfreq = 1;
      float rangeval = 0, freqval = 0;
      int row, column;
      // Check to see if characters fit into acceptable ranges
      // and have expected frequency of use
      rawtextlen = rawtext.length;
      for (i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] == (byte) 0x1B && i + 3 < rawtextlen) { // Escape char ESC
          if (rawtext[i + 1] == (byte) 0x24 && rawtext[i + 2] == 0x29 && rawtext[i + 3] == (byte) 0x41) { // GB Escape $ ) A
            i += 4;
            // Stop at the next ESC, or one byte before the end so the pair look-ahead stays
            // inside the array when the run is not terminated by another escape.
            while (i < rawtextlen - 1 && rawtext[i] != (byte) 0x1B) {
              dbchars++;
              if ((0x21 <= rawtext[i] && rawtext[i] <= 0x77) && (0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77)) {
                isochars++;
                row = rawtext[i] - 0x21;
                column = rawtext[i + 1] - 0x21;
                totalfreq += 500;
                if (GBFreq[row][column] != 0) {
                  isofreq += GBFreq[row][column];
                } else if (15 <= row && row < 55) {
                  isofreq += 200;
                }
                i++;
              }
              i++;
            }
          } else if (rawtext[i + 1] == (byte) 0x24 && rawtext[i + 2] == (byte) 0x29
              && rawtext[i + 3] == (byte) 0x47) {
            // CNS Escape $ ) G
            i += 4;
            // Same bounded scan as for the GB run above.
            while (i < rawtextlen - 1 && rawtext[i] != (byte) 0x1B) {
              dbchars++;
              if ((byte) 0x21 <= rawtext[i] && rawtext[i] <= (byte) 0x7E && (byte) 0x21 <= rawtext[i + 1]
                  && rawtext[i + 1] <= (byte) 0x7E) {
                isochars++;
                totalfreq += 500;
                row = rawtext[i] - 0x21;
                column = rawtext[i + 1] - 0x21;
                if (EUC_TWFreq[row][column] != 0) {
                  isofreq += EUC_TWFreq[row][column];
                } else if (35 <= row && row <= 92) {
                  isofreq += 150;
                }
                i++;
              }
              i++;
            }
          }
          // ASCII: ESC ( B. The index is checked first because the scans above may leave
          // i at or past the last byte.
          if (i + 2 < rawtextlen && rawtext[i] == (byte) 0x1B && rawtext[i + 1] == (byte) 0x28
              && rawtext[i + 2] == (byte) 0x42) {
            i += 2;
          }
        }
      }
      rangeval = 50 * ((float) isochars / (float) dbchars);
      freqval = 50 * ((float) isofreq / (float) totalfreq);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are UTF-8 text.
     *
     * <p>ASCII bytes are ignored. The score is the percentage of non-ASCII bytes that belong to well-formed
     * two-, three- or four-byte sequences (a lead byte followed by the right number of 0x80..0xBF
     * continuation bytes). To avoid coincidental matches the score is only reported when it is above 98, or
     * above 95 with more than 30 good bytes; otherwise 0 is returned. Pure ASCII and empty input score 0.
     *
     * @param rawtext bytes to examine
     * @return the percentage score, or 0 when it is below the reporting thresholds
     */
    int utf8_probability(byte[] rawtext) {
      int score = 0;
      int i, rawtextlen = 0;
      int goodbytes = 0, asciibytes = 0;
      // Maybe also use UTF8 Byte Order Mark: EF BB BF
      // Check to see if characters fit into acceptable ranges.
      // Bytes are signed: -128..-65 is 0x80..0xBF, the continuation-byte range.
      rawtextlen = rawtext.length;
      for (i = 0; i < rawtextlen; i++) {
        if ((rawtext[i] & (byte) 0x7F) == rawtext[i]) { // One byte
          asciibytes++;
          // Ignore ASCII, can throw off count
        } else if (-64 <= rawtext[i] && rawtext[i] <= -33 // Two bytes (lead 0xC0..0xDF)
            && i + 1 < rawtextlen && -128 <= rawtext[i + 1] && rawtext[i + 1] <= -65) {
          goodbytes += 2;
          i++;
        } else if (-32 <= rawtext[i] && rawtext[i] <= -17 // Three bytes (lead 0xE0..0xEF)
            && i + 2 < rawtextlen && -128 <= rawtext[i + 1] && rawtext[i + 1] <= -65
            && -128 <= rawtext[i + 2] && rawtext[i + 2] <= -65) {
          goodbytes += 3;
          i += 2;
        } else if (-16 <= rawtext[i] && rawtext[i] <= -12 // Four bytes (lead 0xF0..0xF4)
            && i + 3 < rawtextlen && -128 <= rawtext[i + 1] && rawtext[i + 1] <= -65
            && -128 <= rawtext[i + 2] && rawtext[i + 2] <= -65
            && -128 <= rawtext[i + 3] && rawtext[i + 3] <= -65) {
          // Supplementary characters (e.g. emoji); without this branch valid UTF-8 that
          // contains them would be counted as malformed and score 0.
          goodbytes += 4;
          i += 3;
        }
      }
      if (asciibytes == rawtextlen) {
        return 0;
      }
      score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
      // If not above 98, reduce to zero to prevent coincidental matches
      // Allows for some (few) bad formed sequences
      if (score > 98) {
        return score;
      } else if (score > 95 && goodbytes > 30) {
        return score;
      } else {
        return 0;
      }
    }

    /**
     * Scores how likely the bytes are UTF-16 text, based solely on a leading byte order mark.
     *
     * <p>Not very general: text without a BOM always scores 0.
     *
     * @param rawtext bytes to examine
     * @return 100 when the input starts with FE FF (big-endian) or FF FE (little-endian), otherwise 0
     */
    int utf16_probability(byte[] rawtext) {
      // Both BOM tests need two bytes. The guard must cover the little-endian test as well;
      // "a && b || c" parses as "(a && b) || c", which would index short arrays unguarded.
      if (rawtext.length < 2) {
        return 0;
      }
      boolean bigEndianBom = (byte) 0xFE == rawtext[0] && (byte) 0xFF == rawtext[1];
      boolean littleEndianBom = (byte) 0xFF == rawtext[0] && (byte) 0xFE == rawtext[1];
      if (bigEndianBom || littleEndianBom) {
        return 100;
      }
      return 0;
    }

    /**
     * Scores how likely the bytes are plain ASCII.
     *
     * <p>Starts at 75 and loses 5 points for every byte with the high bit set and for every ESC (0x1B, used
     * by the ISO 2022 encodings). Scanning stops as soon as the score is exhausted.
     *
     * @param rawtext bytes to examine
     * @return the remaining score, never below 0
     */
    int ascii_probability(byte[] rawtext) {
      int remaining = 75;
      for (final byte b : rawtext) {
        final boolean suspicious = b < 0 || b == (byte) 0x1B;
        if (suspicious) {
          remaining -= 5;
        }
        if (remaining <= 0) {
          return 0;
        }
      }
      return remaining;
    }

    /**
     * Scores how likely the bytes are EUC-KR text.
     *
     * <p>Half of the score is the share of high-bit byte pairs with both bytes in 0xA1..0xFE; the other
     * half weighs those pairs by {@code KRFreq}. Pairs without a table entry add nothing.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int euc_kr_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int last = rawtext.length - 1;

      int pos = 0;
      while (pos < last) {
        final byte lead = rawtext[pos];
        if (lead >= 0) {
          pos++;
          continue;
        }
        highUnits++;
        final int hi = lead & 0xFF;
        final int lo = rawtext[pos + 1] & 0xFF;
        if (hi >= 0xA1 && hi <= 0xFE && lo >= 0xA1 && lo <= 0xFE) {
          matchedUnits++;
          weightCeiling += 500;
          final int freq = KRFreq[hi - 0xA1][lo - 0xA1];
          if (freq != 0) {
            weight += freq;
          }
        }
        // A high-bit byte always consumes its neighbour as well.
        pos += 2;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are Cp949 text.
     *
     * <p>A high-bit pair is in range when the lead is 0x81..0xFE and the trail is 0x41..0x5A, 0x61..0x7A or
     * 0x81..0xFE. Only pairs that also lie in the EUC-KR block (both bytes 0xA1..0xFE) contribute frequency
     * weight, taken from {@code KRFreq}.
     *
     * @param rawtext bytes to examine
     * @return the combined range and frequency score
     */
    int cp949_probability(byte[] rawtext) {
      int highUnits = 1;
      int matchedUnits = 1;
      long weight = 0;
      long weightCeiling = 1;
      final int last = rawtext.length - 1;

      int pos = 0;
      while (pos < last) {
        final byte lead = rawtext[pos];
        if (lead >= 0) {
          pos++;
          continue;
        }
        highUnits++;
        final int hi = lead & 0xFF;
        final int lo = rawtext[pos + 1] & 0xFF;
        final boolean trailOk = (lo >= 0x41 && lo <= 0x5A)
            || (lo >= 0x61 && lo <= 0x7A)
            || (lo >= 0x81 && lo <= 0xFE);
        if (hi >= 0x81 && hi <= 0xFE && trailOk) {
          matchedUnits++;
          weightCeiling += 500;
          // Frequency data exists only for the EUC-KR compatible block.
          if (hi >= 0xA1 && lo >= 0xA1 && lo <= 0xFE) {
            final int freq = KRFreq[hi - 0xA1][lo - 0xA1];
            if (freq != 0) {
              weight += freq;
            }
          }
        }
        // A high-bit byte always consumes its neighbour as well.
        pos += 2;
      }

      final float rangeval = 50 * ((float) matchedUnits / (float) highUnits);
      final float freqval = 50 * ((float) weight / (float) weightCeiling);
      return (int) (rangeval + freqval);
    }

    /**
     * Scores how likely the bytes are ISO 2022-KR text by looking for the designator {@code ESC $ ) C}.
     *
     * @param rawtext bytes to examine
     * @return 100 when the four-byte designator occurs anywhere in the input, otherwise 0
     */
    int iso_2022_kr_probability(byte[] rawtext) {
      // Last index at which a complete four-byte designator can still begin.
      final int lastStart = rawtext.length - 4;
      int start = 0;
      while (start <= lastStart) {
        final boolean designator = rawtext[start] == 0x1b
            && rawtext[start + 1] == '$'
            && rawtext[start + 2] == ')'
            && rawtext[start + 3] == 'C';
        if (designator) {
          return 100;
        }
        start++;
      }
      return 0;
    }

    /* 
     * Function: euc_jp_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text 
     * in array uses EUC-JP encoding 
   
  • 3
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值