检测字节流的编码类型(GBK,UNICODE,GB18030,UTF8..)

想对文本文件做处理,但是有的文本文件没有BOM信息,于是只能对字节流进行检测。

网上找不到综合的,于是干脆自己写了。

比较麻烦的是big5和gbk(在代码里我将ASCII,GB2312,GBK统称为ANSI)的判断,因为gbk的码区包含了big5的部分。

不过测试了不少文件,几十个样本的准确率还是100%。

但估计总有漏网之鱼。

下面为实现代码,供简单参考。

 

=============头文件===================

// Encoding identifiers produced by the detection routines.
#define CODE_ANSI          1   // ASCII / GB2312 / GBK, grouped together as "ANSI"
#define CODE_UNICODE       2   // reported for an FF FE byte-order mark
#define CODE_UTF8          3
#define CODE_GB18030       4
#define CODE_UNICODEB      5   // reported for an FE FF byte-order mark
#define CODE_UNICODE4      6   // reported for an FF FE 00 00 byte-order mark
#define CODE_UNICODE4B     7   // reported for a 00 00 FE FF byte-order mark
#define CODE_BIG5          8


// Sub-classification of CODE_ANSI, returned through the ANSIDetailCode
// out-parameter. These values form their own numbering and are not
// comparable with the CODE_* identifiers above.
#define CODE_ANSIASCII     1
#define CODE_ANSIGB2312    2
#define CODE_ANSIGBK       3



// File-level aliases of the stream encoding identifiers.
#define FILE_TYPE_ANSI      CODE_ANSI
#define FILE_TYPE_UNICODE   CODE_UNICODE
#define FILE_TYPE_UTF8      CODE_UTF8
#define FILE_TYPE_GB18030   CODE_GB18030
#define FILE_TYPE_UNICODEB  CODE_UNICODEB
#define FILE_TYPE_UNICODE4  CODE_UNICODE4
#define FILE_TYPE_UNICODE4B CODE_UNICODE4B
#define FILE_TYPE_BIG5      CODE_BIG5

// True when Low_ <= CH_ <= Hi_ (both ends inclusive).
// Each argument may be evaluated more than once, so pass side-effect-free
// expressions only.
#define CHARBetween(CH_,Low_,Hi_) (((CH_)>=(Low_)) &&((CH_)<=(Hi_)))
BOOL   TL_IsATwiceBig5Code(unsigned char* pBuf); //头2字节是否属于big5码区
BOOL   TL_IsATwiceGB2312Code(unsigned char* pBuf);//头2字节是否属于GB码区
BOOL   TL_IsATwiceGBKCode(unsigned char* pBuf);//头2字节是否属于GBK码区
int    TL_IsStreamCodeLikeUTF8(unsigned char* pBuf,size_tLen);//是否utf8,返回0为否,否则返回可能性(75-100)
int    TL_GetFileCodeType(CString FileName,int* HeadLen=NULL,int*ProbalyOut=NULL,int* ANSIDetailCode=NULL);//文件的编码检测
BOOL   TL_DetechStreamCode(unsigned char* pBuf,size_t Len,int*CodeOut,int* ProbalyOut=NULL,int*ANSIDetailCode=NULL);//字节流的编码检测

 

===========代码==============

 


// Estimates whether a byte buffer looks like UTF-8 text.
//
// pBuf : bytes to inspect (a null pointer is treated as "not UTF-8").
// Len  : number of bytes available at pBuf.
//
// Returns 0 when the buffer is not considered UTF-8; otherwise the percentage
// (76-100) of the non-ASCII bytes that belong to two- or three-byte UTF-8
// sequences.
//
// Byte classes:
//   0x00       ignored, but still part of the rate denominator
//   0x01-0x7F  ASCII, excluded from the rate denominator
//   110xxxxx followed by one 10xxxxxx byte       two-byte sequence
//   1110xxxx followed by two 10xxxxxx bytes      three-byte sequence
//   anything else                                unknown
int  TL_IsStreamCodeLikeUTF8(unsigned char*pBuf,size_t Len)
{
  size_t i,AsciiNum=0,UTFNum=0,UnKnow=0;
  int    UTFRate=0,UnKnowRate=0;

  if ((pBuf==NULL) || (Len<1)) return 0;

  for (i=0;i<Len;i++)
  {
    const unsigned char Cur = pBuf[i];

    if (Cur==0) continue;

    if (Cur<=0x7F)
    {
      AsciiNum++;
      continue;
    }

    // Lead byte 0xC0-0xDF plus one continuation byte 0x80-0xBF.
    if (((Cur & 0xE0)==0xC0) && ((i+1)<Len) && ((pBuf[i+1] & 0xC0)==0x80))
    {
      UTFNum += 2;
      i += 1;
      continue;
    }

    // Lead byte 0xE0-0xEF plus two continuation bytes 0x80-0xBF.
    if (((Cur & 0xF0)==0xE0) && ((i+2)<Len) && ((pBuf[i+1] & 0xC0)==0x80) && ((pBuf[i+2] & 0xC0)==0x80))
    {
      UTFNum += 3;
      i += 2;
      continue;
    }

    UnKnow++;
  }

  // With at least one UTF-8 byte counted, Len-AsciiNum is non-zero below.
  if (UTFNum == 0) return 0;

  UTFRate    = (int)(100*((float)UTFNum/(float)(Len-AsciiNum)));
  UnKnowRate = (int)(100*((float)UnKnow/(float)(Len-AsciiNum)));

  // Almost everything that is not ASCII is well-formed UTF-8.
  if (UTFRate > 98) return UTFRate;

  // Slightly lower rate is accepted when backed by enough UTF-8 bytes.
  if ((UTFRate > 95) && (UTFNum>30) && (UnKnowRate<5)) return UTFRate;

  // A noisy buffer is accepted only with a large UTF-8 sample.
  if ((UTFRate > 75) && (UTFNum>150) && (UnKnowRate<20)) return UTFRate;

  return 0;
}

BOOL   TL_IsATwiceBig5Code(unsigned char* pBuf)
{
#define BIGZONENum  8
  const unsigned charBig5Tbl[BIGZONENum][12]={
   {0xA1,0xA2,0x40,0x7E,0xA1,0xFE,0},
   {0xA3,0xA3,0x40,0x7E,0xA1,0xBF,0xE1,0xE1,0},
   {0xA4,0xC5,0x40,0x7E,0xA1,0xFE,0},
   {0xC6,0xC6,0x40,0x7E,0xA1,0xfE,0},
   {0xC7,0xC7,0x40,0x7E,0xA1,0xFE,0},
   {0xC8,0xC8,0x40,0x7E,0xA1,0xD3,0},
   {0xC9,0xF8,0x40,0x7E,0xA1,0xFE,0},
   {0xF9,0xF9,0x40,0x7E,0xA1,0xD5,0xD6,0xDC,0xDD,0xFE,0}
  };
  int  Zone,hi,low;

  if ((pBuf[0] <Big5Tbl[0][0])|| (pBuf[0] >Big5Tbl[BIGZONENum-1][1])) return FALSE;

 
  hi = BIGZONENum-1;
  low = 0;
  while (low <= hi)
  {
    Zone =(hi+low+1)/2;
    if(CHARBetween(pBuf[0],Big5Tbl[Zone][0],Big5Tbl[Zone][1]))break;
    if (low ==hi) return FALSE;
    if (pBuf[0]< Big5Tbl[Zone][0]) hi= Zone-1;
    else low =Zone+1;
  }

  for (low=2;;low+=2)
  {
    if(Big5Tbl[Zone][low] == 0) return FALSE;
    if(CHARBetween(pBuf[1],Big5Tbl[Zone][low],Big5Tbl[Zone][low+1]))return TRUE;
  }
  return FALSE;
}


BOOL   TL_IsATwiceGB2312Code(unsigned char* pBuf)
 
#define GB2312ZONENum  9
  const unsigned charGB2312Tbl[GB2312ZONENum][10]={
   {0xA1,0xA1,0xFE,0},
   {0xA2,0xB1,0xE2,0xE5,0xEE,0xF1,0xFC,0},
   {0xA3,0xA1,0xFE,0},
   {0xA4,0xA1,0xF3,0},
   {0xA5,0xA1,0xF6,0},
   {0xA6,0xA1,0xB8,0xC1,0xD8,0},
   {0xA7,0xA1,0xC1,0xD1,0xF1,0},
   {0xA8,0xA1,0xBA,0xC5,0xE9,0},
   {0xA9,0xA4,0xEF,0}
  };
  int  Zone,hi,low;

  if ((pBuf[0] < 0xA1)||(pBuf[0] > 0xF7)) return FALSE;

 
  hi = GB2312ZONENum-1;
  low = 0;
  while (low <= hi)
  {
    Zone =(hi+low+1)/2;
    if (pBuf[0]== GB2312Tbl[Zone][0])
    {
     for (low=1;;low+=2)
     {
       if (GB2312Tbl[Zone][low] == 0) return FALSE;
       if(CHARBetween(pBuf[1],GB2312Tbl[Zone][low],GB2312Tbl[Zone][low+1]))return TRUE;
     }
    }
    if (low ==hi) break;
    if (pBuf[0]< GB2312Tbl[Zone][0]) hi= Zone-1;
    else low =Zone+1;
  }

  if ((pBuf[0] == 0xD7)&& CHARBetween(pBuf[1],0xFA,0xFE))return FALSE;
  if (CHARBetween(pBuf[0],0xB0,0xF7)&& CHARBetween(pBuf[1],0xA1,0xFE))return TRUE;

  return FALSE;
}

BOOL   TL_IsATwiceGBKCode(unsigned char* pBuf)
{
  if((pBuf[0]<0x81)||(pBuf[0]==0xFF)) returnFALSE;
  if((pBuf[1]<0x40)||(pBuf[1]==0xFF)) returnFALSE;
  if ((pBuf[1]==0x7F)) return FALSE;

  return TRUE;
}


BOOL    TL_DetechStreamCode(unsigned char* pBuf,size_t Len,int*CodeOut,int* ProbalyOut,int* ANSIDetailCode)
{{
  size_t i,AsciiNum=0,ZeroNum=0,UnKnow=0;
  size_t  CharNum[10];
 int    CharRate[10],CharRateExpend[10];
 int    TypeMatchNum=0;
  size_t  LastConsZero=0;
 int    BufCharCode=0,AnsiExType=0,BufCharProbaly=0;

  MEMZERO(CharNum);
  MEMZERO(CharRate);
  MEMZERO(CharRateExpend);
  if (CodeOut!=NULL) *CodeOut = 0;
  if (Len<1) return 0;

  for (i=0;i<Len;i++)
  {
    if(pBuf[i]==0)
    {
     ZeroNum++;
     LastConsZero++;
     if (LastConsZero > 5) break;
     continue;
    }
   LastConsZero=0;
    if(pBuf[i]<=0x7F)
    {
     if (((i+1)<Len) &&(pBuf[i+1] == 0)) // 先看是否unicode
     {
       CharNum[CODE_UNICODE] += 2;
       i++;
       continue;
     }
     AsciiNum++;
     continue;
    }

    if(CHARBetween(pBuf[i],0xC0,0xDF) &&((i+1)<Len) &&CHARBetween(pBuf[i+1],0x80,0xBF))
   {//两字节UTF,110xxx,
     CharNum[CODE_UTF8] += 2;
     i+=1;
     continue;
    }

    if(CHARBetween(pBuf[i],0xE0,0xEF) &&((i+2)<Len) &&CHARBetween(pBuf[i+1],0x80,0xBF) &&CHARBetween(pBuf[i+2],0x80,0xBF))
   {//三字节UTF
     CharNum[CODE_UTF8] += 3;
     i+=2;
     continue;
    }

    if(CHARBetween(pBuf[i],0x81,0xFE) &&((i+3)<Len) &&CHARBetween(pBuf[i+1],0x30,0x39) &&CHARBetween(pBuf[i+2],0x81,0xFE)&&CHARBetween(pBuf[i+3],0x30,0x39))
   {//4字节gb18030
     CharNum[CODE_GB18030] += 4;
     i+=3;
     continue;
    }

   TypeMatchNum =0;
    if(((i+2)<Len) &&TL_IsATwiceBig5Code(pBuf+i))
    {
     CharNum[CODE_BIG5] += 2;
     TypeMatchNum++;
     
    }

    if(CHARBetween(pBuf[i],0x81,0xFE) &&((i+1)<Len) &&CHARBetween(pBuf[i+1],0x40,0xFE) &&(pBuf[i+1] !=0x7F))
    {//字节GBK  字符范围 首字节0x81 - 0xFE,低字节0x40 - 0xFE ,剔除xx7F 一条线  ,包含GB2312
     CharNum[CODE_ANSI] += 2;
     TypeMatchNum+=2;

     
     if((ANSIDetailCode!=NULL)&&TL_IsATwiceGB2312Code(pBuf+i)) CharNum[0] += 2;
     
    }

    if(TypeMatchNum>0)
    {
     i++;
     continue;
    }

   UnKnow++;
  }
 
  for (i=1;i<10;i++)
  {
    if(CharNum[i] == 0) continue;
    CharRate[i]= (int)(100*((float)CharNum[i] /(float)(Len-AsciiNum)));
    if(i!=CODE_UNICODE)  CharRateExpend[i] =(int)(100*((float)(CharNum[i]+AsciiNum) /(float)Len));
    elseCharRateExpend[i] =(int)(100*((float)(CharNum[i]+UnKnow+CharNum[CODE_ANSI])/(float)Len));
  }

  for (i=1;i<10;i++)
  {
    if(CharRate[i] < 76)  continue;
    if(BufCharProbaly>CharRate[i]) continue;
    BufCharCode= i;
   BufCharProbaly = CharRate[i];
  }
  if (BufCharCode == 0)
  {
    for(i=1;i<10;i++)
    {
     if (CharRateExpend[i] < 90) continue;
     if (BufCharProbaly>CharRateExpend[i])continue;
     BufCharCode = i;
     BufCharProbaly = CharRateExpend[i];
    }
  }
 
  if ((BufCharCode == CODE_ANSI)&& (CharNum[CODE_ANSI]<= CharNum[CODE_BIG5]+2)&&(CharNum[CODE_ANSI]>30)) BufCharCode =CODE_BIG5;

  if (AsciiNum == Len)
  {
    BufCharCode= CODE_ANSI;
    AnsiExType =CODE_ANSIASCII;
   BufCharProbaly = 100;
  }
  else if((ANSIDetailCode!=NULL)&&(BufCharCode== CODE_ANSI)&&(AnsiExType ==0))
  {
    if(CharNum[0] == CharNum[CODE_ANSI]) AnsiExType =CODE_ANSIGB2312;
    elseAnsiExType = CODE_ANSIGBK;
  }

  if (CodeOut!=NULL) *CodeOut =BufCharCode;
  if (ProbalyOut!=NULL) *ProbalyOut =BufCharProbaly;
  if((ANSIDetailCode!=NULL)&&(AnsiExType>0)) *ANSIDetailCode = AnsiExType;

  if (BufCharCode>0) returnTRUE;
  return FALSE;
}


int TL_GetFileCodeType(CString FileName,int* HeadLen,int*ProbalyOut,int* ANSIDetailCode)
{
  CFile file;
  unsigned char Head[10248];
  BOOL  bGetTrueType=TRUE;
  UINT i=0,ReadLen=0,PerRead=10240;
  int  iDetechCode=0;

  if (HeadLen!=NULL) *HeadLen = 0;
  if(!file.Open(FileName,CFile::modeReadWrite|CFile::modeNoTruncate))
  {
    returnERR_OPEN_FILE;
  }

  MEMZERO(Head);

  file.Read(Head,20);
  file.Close();

  if((Head[0]==0xEF)&&(Head[1]==0xBB)&&(Head[2]==0xBF))
  {
    if(HeadLen!=NULL) *HeadLen = 3;
    returnFILE_TYPE_UTF8;
  }

  if((Head[0]==0xFF)&&(Head[1]==0xFE))
  {
    if((Head[2]==0x00)&&(Head[3]==0x00))
    {
     if (HeadLen!=NULL) *HeadLen = 4;
     return FILE_TYPE_UNICODE4;
    }
    if(HeadLen!=NULL) *HeadLen = 2;
    returnFILE_TYPE_UNICODE;
  }

  if((Head[0]==0x84)&&(Head[1]==0x31)&&(Head[2]==0x95)&&(Head[3]==0x33))
  {
    if(HeadLen!=NULL) *HeadLen = 4;
    returnFILE_TYPE_GB18030;
  }

  if((Head[0]==0xFE)&&(Head[1]==0xFF))
  {
    if(HeadLen!=NULL) *HeadLen = 2;
    returnFILE_TYPE_UNICODEB;
  }
  if((Head[0]==0x00)&&(Head[1]==0x00)&&(Head[2]==0xFE)&&(Head[3]==0xFF))
  {
    if(HeadLen!=NULL) *HeadLen = 4;
    returnFILE_TYPE_UNICODE4B;
  }
  if (!bGetTrueType) return FILE_TYPE_ANSI;

  if(!file.Open(FileName,CFile::modeReadWrite|CFile::modeNoTruncate))
  {
    returnERR_OPEN_FILE;
  }

  iDetechCode = 0;
  ReadLen=PerRead;
  while (ReadLen==PerRead)
  {
   MEMZERO(Head);
    ReadLen =file.Read(Head,PerRead);
    if(TL_DetechStreamCode(Head,ReadLen,&iDetechCode,ProbalyOut,ANSIDetailCode))break;
  }
  file.Close();

 

if (iDetechCode >0) returniDetechCode;
  return FILE_TYPE_ANSI;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值