NO BOM UTF-8 文本的判断

这是一个用得比较多的 C++ 函数,我把它转成了 Delphi 的版本:

// IsTextUTF8
//
// Heuristically decides whether a BOM-less byte buffer is UTF-8 encoded text.
//
// Parameters:
//   lpstrInputStream - pointer to the first byte of the buffer. The buffer is
//                      always examined byte by byte, even when PChar is a
//                      wide-character pointer (Unicode Delphi).
//   iLen             - number of BYTES to examine.
//
// Returns:
//   True  - every byte sequence is well-formed UTF-8 as defined by RFC 3629
//           and at least one non-ASCII character is present.
//   False - the buffer is nil/empty, contains a malformed sequence, ends in
//           the middle of a character, or is pure 7-bit ASCII (so the caller
//           falls back to a code page, as in the original routine).
//
// Unlike a plain "count the leading 1 bits" check, this rejects overlong
// encodings (lead bytes $C0/$C1, $E0 followed by < $A0, $F0 followed by
// < $90), UTF-16 surrogates ($ED followed by > $9F), code points above
// U+10FFFF and the obsolete 5/6-octet forms. Short GBK texts such as
// C1 AA CD A8 or C0 AD are therefore no longer reported as UTF-8.
function IsTextUTF8(lpstrInputStream : PChar; iLen : Integer) : Boolean;
var
  i : Integer;
  pBytes : PAnsiChar;     // byte-wise view of the input buffer
  cOctets : Integer;      // continuation octets still expected
  b : Byte;               // octet currently being examined
  bLow, bHigh : Byte;     // inclusive range allowed for the next continuation octet
  bAllAscii : Boolean;
begin
  Result := False;
  if (lpstrInputStream = nil) or (iLen <= 0) then
    Exit;

  pBytes := PAnsiChar(lpstrInputStream);
  cOctets := 0;
  bLow := $80;
  bHigh := $BF;
  bAllAscii := True;

  for i := 0 to iLen - 1 do
  begin
    b := Ord(pBytes[i]);

    if cOctets = 0 then
    begin
      // 7 bit ascii after 7 bit ascii is just fine.
      if b < $80 then
        Continue;

      // Start of a multi-octet character.
      bAllAscii := False;
      bLow := $80;
      bHigh := $BF;

      if (b >= $C2) and (b <= $DF) then
        cOctets := 1                  // 110xxxxx; $C0/$C1 would be overlong
      else if (b >= $E0) and (b <= $EF) then
      begin
        cOctets := 2;                 // 1110xxxx
        if b = $E0 then
          bLow := $A0                 // below $A0 would be an overlong form
        else if b = $ED then
          bHigh := $9F;               // above $9F would be a UTF-16 surrogate
      end
      else if (b >= $F0) and (b <= $F4) then
      begin
        cOctets := 3;                 // 11110xxx
        if b = $F0 then
          bLow := $90                 // below $90 would be an overlong form
        else if b = $F4 then
          bHigh := $8F;               // above $8F would exceed U+10FFFF
      end
      else
        Exit;                         // stray 10xxxxxx, $C0, $C1 or $F5..$FF
    end
    else
    begin
      // Non-leading octets must be 10xxxxxx and inside the range that the
      // lead octet permits for the first continuation octet.
      if (b < bLow) or (b > bHigh) then
        Exit;
      bLow := $80;
      bHigh := $BF;
      Dec(cOctets);                   // processed another octet in encoding
    end;
  end;

  // End of text. A character cut off at the end is an error, and all-ASCII
  // input is deliberately reported as "not UTF-8".
  Result := (cOctets = 0) and (not bAllAscii);
end;

下面的是 C++ 原作:


/* IsTextUTF8
 *
 * UTF-8 is the encoding of Unicode based on Internet Society RFC2279
 * ( See http://www.cis.ohio-state.edu/htbin/rfc/rfc2279.html )
 *
 * Basicly:
 * 0000 0000-0000 007F - 0xxxxxxx  (ascii converts to 1 octet!)
 * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx    ( 2 octet format)
 * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
 * (this keeps going for 32 bit unicode)
 *
 *
 * Return value:  TRUE, if the text is in UTF-8 format.
 *                FALSE, if the text is not in UTF-8 format.
 *                We will also return FALSE is it is only 7-bit ascii, so the right code page
 *                will be used.
 *
 *                Actually for 7 bit ascii, it doesn't matter which code page we use, but
 *                notepad will remember that it is utf-8 and "save" or "save as" will store
 *                the file with a UTF-8 BOM.  Not cool.
 */

INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    DWORD cOctets;  // octets to go in this UTF-8 encoded character
    UCHAR chr;
    BOOL  bAllAscii= TRUE;

    cOctets= 0;
    for( i=0; i < iLen; i++ ) {
        chr= *(lpstrInputStream+i);

        if( (chr&0x80) != 0 ) bAllAscii= FALSE;

        if( cOctets == 0 )  {
            //
            // 7 bit ascii after 7 bit ascii is just fine.  Handle start of encoding case.
            //
            if( chr >= 0x80 ) { 
               //
               // count of the leading 1 bits is the number of characters encoded
               //
               do {
                  chr <<= 1;
                  cOctets++;
               }
               while( (chr&0x80) != 0 );

               cOctets--;                        // count includes this character
               if( cOctets == 0 ) return FALSE;  // must start with 11xxxxxx
            }
        }
        else {
            // non-leading bytes must start as 10xxxxxx
            if( (chr&0xC0) != 0x80 ) {
                return FALSE;
            }
            cOctets--;                           // processed another octet in encoding
        }
    }

    //
    // End of text.  Check for consistency.
    //

    if( cOctets > 0 ) {   // anything left over at the end is an error
        return FALSE;
    }

    if( bAllAscii ) {     // Not utf-8 if all ascii.  Forces caller to use code pages for conversion
        return FALSE;
    }

    return TRUE;
}

 
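不过微软原版的这套判断逻辑是有 bug 的,微软记事本很著名的“联通” bug 就是它引起的。原因在于它只数前导字节里连续的 1 的个数,并不检查编码是否合法:“联通”的 GBK 编码是 C1 AA CD A8,恰好符合两组 110xxxxx 10xxxxxx 的形状,于是被当成了 UTF-8,而 C0、C1 开头的其实是 UTF-8 中不允许出现的“过长编码”。另外我刚刚发现拉萨的“拉”(GBK 编码 C0 AD)也有同样的问题,不信你在桌面新建一个文本,打开后输入 “拉” 保存,重新打开就会发现什么也没有。按 RFC 3629 严格校验(拒绝 C0/C1 等非法前导字节)可以避免这两个例子的误判,但对无 BOM 的文本来说这类判断终究只是猜测,暂时还没找到更好的识别无 BOM 的 UTF-8 文本的办法。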
 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值