(C/C++)UTF8字符串中字的切分

UTF-8 采用变长字节来表示字符,理论上最多可以到 6 个字节长度。UTF-8 编码兼容了 ASCII(0-127),也就是说 UTF-8 对于 ASCII 字符的编码和 ASCII 完全一样。对于超过一个字节长度的字符,采用以下编码规范:
第一个字节从最高位开始连续 1 的个数表示这个字符编码所占的字节数,例如两字节字符的编码样式为:110xxxxx 10xxxxxx;三字节字符的编码样式为:1110xxxx 10xxxxxx 10xxxxxx;以此类推,六字节字符的编码样式为:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。xxx 的值由字符编码的二进制位依次填入。
1字节:0xxxxxxx 
2字节:110xxxxx 10xxxxxx 
3字节:1110xxxx 10xxxxxx 10xxxxxx 
4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
5字节:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
///
# include <string.h>
# include <string>
# include <vector>

///
using namespace std;

///
// Splits a NUL-terminated UTF-8 string into tokens.
//
// Tokenization rules:
//   - A maximal run of consecutive ASCII bytes (0x01-0x7F) becomes ONE token,
//     so an embedded English word or number stays together.
//   - Every well-formed 2-byte or 3-byte UTF-8 sequence becomes one token.
//   - Sequences introduced by a 4-, 5- or 6-byte lead byte are skipped and
//     produce no token.
//   - Invalid bytes (a stray continuation byte 0x80-0xBF, or 0xFE/0xFF) and
//     truncated/malformed multi-byte sequences are skipped and produce no token.
//
// Parameters:
//   pszSentence  NUL-terminated input; NULL is treated like an empty string.
//   vec          Output; cleared first, then filled with the tokens in order.
//
// The scan never reads beyond the terminating NUL, even when the input ends
// in the middle of a multi-byte sequence.
void fnReadCharactersUTF8( const char* pszSentence, std::vector<std::string>& vec )
{
    vec.clear();

    if ( pszSentence == NULL )
    {
        return;
    }

    const char* p = pszSentence;

    while ( *p != '\0' )
    {
        const unsigned char lead = static_cast<unsigned char>( *p );

        if ( lead < 0x80 )
        {
            // Collect the whole ASCII run. The explicit NUL test is required
            // because the terminator (0x00) is itself below 0x80.
            const char* runEnd = p + 1;
            while ( *runEnd != '\0' && static_cast<unsigned char>( *runEnd ) < 0x80 )
            {
                ++runEnd;
            }

            vec.push_back( std::string( p, runEnd ) );
            p = runEnd;
            continue;
        }

        // Sequence length announced by the lead byte. Invalid lead bytes are
        // treated as a one-byte sequence so that exactly that byte is skipped.
        int expected = 1;
        if ( lead < 0xC0 )
        {
            expected = 1;   // stray continuation byte (10xxxxxx)
        }
        else if ( lead < 0xE0 )
        {
            expected = 2;   // 110xxxxx
        }
        else if ( lead < 0xF0 )
        {
            expected = 3;   // 1110xxxx
        }
        else if ( lead < 0xF8 )
        {
            expected = 4;   // 11110xxx
        }
        else if ( lead < 0xFC )
        {
            expected = 5;   // 111110xx
        }
        else if ( lead < 0xFE )
        {
            expected = 6;   // 1111110x
        }

        // Count the bytes that really belong to this sequence. The NUL
        // terminator is not a continuation byte, so the loop stops at or
        // before the end of the string.
        int available = 1;
        while ( available < expected
                && ( static_cast<unsigned char>( p[available] ) & 0xC0 ) == 0x80 )
        {
            ++available;
        }

        // Only complete 2- and 3-byte characters are emitted; everything
        // else (invalid, truncated, or 4/5/6-byte sequences) is dropped.
        if ( available == expected && ( expected == 2 || expected == 3 ) )
        {
            vec.push_back( std::string( p, p + available ) );
        }

        p += available;
    }
}
 
 // FILE END ///
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值