字符检测程序(上) 检测GB2312、BIG5...

import java.lang.*;
import java.util.*;
import java.io.*;
import java.net.*;

public class SinoDetect {

    // Encoding identifiers. Each value is also used as an index into the
    // codings and nicename arrays and into the score array built in
    // detectEncoding(byte[]).
    static final int GB2312 = 0;
    static final int GBK = 1;
    static final int HZ = 2;
    static final int BIG5 = 3;
    static final int EUC_TW = 4;
    static final int ISO_2022_CN = 5;
    static final int UTF8 = 6;
    static final int UNICODE = 7;
    static final int ASCII = 8;
    static final int OTHER = 9;

    // Number of encoding identifiers above; used to size the per-encoding arrays.
    static final int TOTAL_ENCODINGS = 10;


    // Frequency tables to hold the GB, GBK, Big5, and EUC-TW character
    // frequencies. They are allocated in the constructor.
    int GBFreq[][];
    int GBKFreq[][];
    int Big5Freq[][];
    int EUC_TWFreq[][];
    //int UnicodeFreq[94][128];

    // Per-encoding name tables, indexed by the encoding identifiers above.
    // nicename holds the display form that main() prints; codings holds a
    // more compact identifier for the same encoding.
    // NOTE(review): both arrays are static but are only assigned inside the
    // instance constructor, so they stay null until a SinoDetect is created.
    public static String[] nicename;
    public static String[] codings;


    public SinoDetect() {
 // Initialize the Frequency Table for GB, Big5, EUC-TW
 GBFreq = new int[94][94];
 GBKFreq = new int[126][191];
 Big5Freq = new int[94][158];
 EUC_TWFreq = new int[94][94];

 codings = new String[TOTAL_ENCODINGS];
 codings[GB2312] = "GB2312";
 codings[GBK] = "GBK";
 codings[HZ] = "HZ";
 codings[BIG5] = "BIG5";
 codings[EUC_TW] = "CNS11643";
 codings[ISO_2022_CN] = "ISO2022CN";
 codings[UTF8] = "UTF8";
 codings[UNICODE] = "Unicode";
 codings[ASCII] = "ASCII";
 codings[OTHER] = "OTHER";

 nicename = new String[TOTAL_ENCODINGS];
 nicename[GB2312] = "GB2312";
 nicename[GBK] = "GBK";
 nicename[HZ] = "HZ";
 nicename[BIG5] = "Big5";
 nicename[EUC_TW] = "CNS 11643";
 nicename[ISO_2022_CN] = "ISO 2022-CN";
 nicename[UTF8] = "UTF-8";
 nicename[UNICODE] = "Unicode";
 nicename[ASCII] = "ASCII";
 nicename[OTHER] = "OTHER";

 initialize_frequencies();
    }


  public static void main(String argc[])
  {
   SinoDetect sinodetector;
   int result = OTHER;

   argc = new String[1];
   //argc[0] = "c://chinesedata//codeconvert//voaunit.txt";
    argc[0] = "中文";
   sinodetector = new SinoDetect();
   if (argc[0].startsWith("http://") == true)
   {
     try {
      result = sinodetector.detectEncoding(new URL(argc[0]));
     }
     catch (Exception e) {
      System.err.println("Bad URL " + e.toString());
     }
   } else {
     //result = sinodetector.detectEncoding(new File(argc[0]));
      result = sinodetector.detectEncoding(argc[0].getBytes());
   }
   System.out.println(nicename[result]);
  }


    /** Function  :  detectEncoding
       Aruguments:  URL
       Returns   :  One of the encodings from the Encoding enumeration
       (GB2312, HZ, BIG5, EUC_TW, ASCII, or OTHER)
       Description: This function looks at the URL contents
       and assigns it a probability score for each encoding type.
       The encoding type with the highest probability is returned.
    */

    public int detectEncoding(URL testurl) {
 byte[] rawtext = new byte[10000];
 int bytesread = 0, byteoffset = 0;
 int guess = OTHER;
 InputStream chinesestream;

 try {
     chinesestream = testurl.openStream();

     while ((bytesread = chinesestream.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
  byteoffset += bytesread;
     };
     chinesestream.close();
     guess = detectEncoding(rawtext);


 }
 catch (Exception e) {
     System.err.println("Error loading or using URL " + e.toString());
     guess = OTHER;
 }

 return guess;
    }

    /** Function  :  detectEncoding
       Aruguments:  File
       Returns   :  One of the encodings from the Encoding enumeration
       (GB2312, HZ, BIG5, EUC_TW, ASCII, or OTHER)
       Description: This function looks at the file
       and assigns it a probability score for each encoding type.
       The encoding type with the highest probability is returned.
    */

    public int detectEncoding(File testfile) {
 FileInputStream chinesefile;
 byte[] rawtext;

 rawtext = new byte[(int)testfile.length()];
 try {
     chinesefile = new FileInputStream(testfile);
     chinesefile.read(rawtext);
 }
 catch (Exception e) {
     System.err.println("Error: " + e);
 }

 return detectEncoding(rawtext);
    }

 

    /** Function  :  detectEncoding
       Aruguments:  byte array
       Returns   :  One of the encodings from the Encoding enumeration
       (GB2312, HZ, BIG5, EUC_TW, ASCII, or OTHER)
       Description: This function looks at the byte array
       and assigns it a probability score for each encoding type.
       The encoding type with the highest probability is returned.
    */

    public int detectEncoding(byte[] rawtext) {
 int[] scores;
 int index, maxscore = 0;
 int encoding_guess = OTHER;

 scores = new int[TOTAL_ENCODINGS];

 // Assign Scores
 scores[GB2312]      = gb2312_probability(rawtext);
 scores[GBK]         = gbk_probability(rawtext);
 scores[HZ]          = hz_probability(rawtext);
 scores[BIG5]        = big5_probability(rawtext);
 scores[EUC_TW]      = euc_tw_probability(rawtext);
 scores[ISO_2022_CN] = iso_2022_cn_probability(rawtext);
 scores[UTF8]        = utf8_probability(rawtext);
 scores[UNICODE]     = utf16_probability(rawtext);
 scores[ASCII]       = ascii_probability(rawtext);
 scores[OTHER]       = 0;

 // Tabulate Scores
 for (index = 0; index < TOTAL_ENCODINGS; index++) {
     if (scores[index] > maxscore) {
  encoding_guess = index;
  maxscore = scores[index];
     }
 }

 // Return OTHER if nothing scored above 50
 if (maxscore <= 50) {
     encoding_guess = OTHER;
 }

 return encoding_guess;
    }

 


    /* Function:  gb2312_probability
       Argument:  pointer to byte array
       Returns :  number from 0 to 100 representing probability
       text in array uses GB-2312 encoding
    */

    int gb2312_probability(byte[] rawtext) {
 int i, rawtextlen = 0;

 int dbchars = 1, gbchars = 1;
 long gbfreq = 0, totalfreq = 1;
 float rangeval = 0, freqval = 0;
 int row, column;

 // Stage 1:  Check to see if characters fit into acceptable ranges

 rawtextlen = rawtext.length;
 for (i = 0; i < rawtextlen-1; i++) {
     //System.err.println(rawtext[i]);
     if (rawtext[i] >= 0) {
  //asciichars++

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

skyyoung

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值