SDL入门教程（十）：4、用iconv获得正确的Unicode，使用FriBiDi实现UTF-8到Unicode的正确转换

最新推荐文章于 2024-08-15 09:48:21 发布

程序心声

最新推荐文章于 2024-08-15 09:48:21 发布

阅读量840

点赞数

分类专栏： SDL 文章标签： dst vector byte 算法 string linux

SDL 专栏收录该内容

40 篇文章 4 订阅

订阅专栏

作者：龙飞

4.1：为什么iconv不能完全正确的转换Unicode？

        我不是先知，教程里面是整理过的思路和逻辑顺序，在我研究这个问题的时候，头绪远远比教程里面乱得多。我完全是从Wesnoth的源代码去分析问题的，所以，为什么会扯上UTF-8和FriBidi，那也是因为在源代码中找到了线索。
        iconv不能完全正确的获得Unicode，也就是我们刚才遇到的纯汉字转换没问题，而有英文就不行了。我并不清楚这是win32下的问题，还是在Linux下也这样，我也不清楚具体的算法和问题的根本原因，我只是通过试验得到一个算是表面原因的结论：我们知道，GB2312和Unicode汉字都使用2个字节（在UTF-8中是3个字节），英文和数字等用1个字节。iconv在得到两个字节（unsigned char即一个字节大小）代码的时候可以正确的将GB2312转化为Unicode（或者UTF-8），但是只有1个字节的时候则在转化Unicode的时候终止了，幸运的是，如果是转化为UTF-8则可以正确的进行，并且也转化为1个字节的UTF-8（只限于英文，数字等）。
        所以，我们可以先通过iconv将原来的GB2312转化为UTF-8——汉字用3个字节（3个单位的unsigned char），英文、数字和基本符号用1个字节（1个单位的unsigned char）。然后，我们需要一个函数，将这种形式的UTF-8转换为SDL所需要的Uint16的Unicode。什么样的函数可以实现这种转换呢？

4.2：其它编码与Unicode之间的双向转换，GNU FriBidi
http://fribidi.freedesktop.org/wiki/
        FriBidi是GNU的一个开源项目，主要实现Unicode双向算法（Unicode Bidirectional Algorithm），同时也附带了Unicode与其它字符编码之间相互转换的函数；到目前为止，还是一个尚未完成的项目。我在研究Wesnoth源代码的时候看到这样的函数：fribidi_utf8_to_unicode()，所以，我想这个函数中应该包含了UTF-8到Unicode的转换算法——希望不要太复杂。在FriBidi项目中找到这个函数，它在文件fribidi_char_sets_utf8.c里面：

/*
 * Decode `len` bytes of UTF-8 starting at `s` into `us`, storing one
 * FriBidiChar per decoded sequence, then append a terminating 0.
 * Returns the number of characters stored (the terminator is not counted).
 *
 * The sequence length is chosen from the lead byte alone: <= 0x7f is one
 * byte, <= 0xdf is two bytes, and every larger value is handled as three
 * bytes. Continuation bytes at s + 1 / s + 2 are read without being
 * checked against `len`; only the lead byte position is tested by the loop.
 */
int
fribidi_utf8_to_unicode ( char * s, int len, FriBidiChar * us)
/* warning: the length of input string may exceed the length of the output */
{
   int length;
   char * t = s;   /* start of input, used to measure bytes consumed */

  length = 0 ;
   while (s - t < len)
    {
       if ( * (unsigned char * ) s <= 0x7f )     /* one byte */
    {
       * us ++ = * s ++ ;         /* expand with 0s */
    }
       else if ( * (unsigned char * ) s <= 0xdf )     /* 2 byte */
    {
       /* low 5 bits of the lead byte, then low 6 bits of the next byte */
       * us ++ =
        (( * (unsigned char * ) s & 0x1f ) << 6 ) +
        (( * (unsigned char * ) (s + 1 )) & 0x3f );
      s += 2 ;
    }
       else              /* 3 byte */
    {
       /* low 4 bits of the lead byte, then 6 bits from each of the next two bytes */
       * us ++ =
        (( int ) ( * (unsigned char * ) s & 0x0f ) << 12 ) +
        (( * (unsigned char * ) (s + 1 ) & 0x3f ) << 6 ) +
        ( * (unsigned char * ) (s + 2 ) & 0x3f );
      s += 3 ;
    }
      length ++ ;   /* one output character per decoded sequence */
    }
   * us = 0 ;   /* terminate the output */
   return (length);
}

其中，我们找到FriBidiChar的定义，类似Uint32的类型；另外，函数用char表示1字节的单位。根据我的试验，至少在VC2008下是有错误的，我们一直用的是unsigned char表示1字节的单位，所以，我们需要对这个函数做些修改：

/**
 * Decode a UTF-8 byte sequence into 16-bit Unicode code units.
 *
 * @param unicode  Output buffer. It must have room for at least len + 1
 *                 elements: at most one unit is written per input byte,
 *                 followed by a terminating 0.
 * @param utf8     Input bytes; they do not have to be NUL-terminated.
 * @param len      Number of bytes to read from utf8. Values <= 0 produce an
 *                 empty (only terminated) result.
 * @return Number of units written to unicode, not counting the terminator.
 *
 * One-, two- and three-byte sequences are decoded to their code point.
 * Input that cannot be represented or decoded is handled as follows:
 *   - four-byte sequences (code points above U+FFFF do not fit in a single
 *     Uint16) are consumed as a whole and replaced by U+FFFD;
 *   - a stray continuation byte (0x80-0xbf) or an invalid lead byte
 *     (0xf8-0xff) is replaced by U+FFFD and skipped on its own;
 *   - a multi-byte sequence cut off by len ends the decoding, so no byte
 *     beyond utf8 + len is ever read.
 */
int myUTF8_to_UNICODE(Uint16 * unicode, unsigned char * utf8, int len)
{
    const unsigned int REPLACEMENT_CHAR = 0xfffd;

    const unsigned char * cur = utf8;
    int remaining = len;
    int length = 0;

    while (remaining > 0) {
        const unsigned int lead = cur[0];
        int consumed = 1;
        unsigned int code = REPLACEMENT_CHAR;

        if (lead <= 0x7f) {
            // One byte: ASCII such as a, b, c, 1, 2, 3, etc.
            code = lead;
        }
        else if (lead >= 0xc0 && lead <= 0xdf) {
            // Two bytes: 5 payload bits in the lead, 6 in the continuation.
            if (remaining < 2) {
                break;
            }
            code = ((lead & 0x1f) << 6) | (cur[1] & 0x3f);
            consumed = 2;
        }
        else if (lead >= 0xe0 && lead <= 0xef) {
            // Three bytes: Chinese characters normally land here.
            if (remaining < 3) {
                break;
            }
            code = ((lead & 0x0f) << 12) |
                   ((cur[1] & 0x3f) << 6) |
                   (cur[2] & 0x3f);
            consumed = 3;
        }
        else if (lead >= 0xf0 && lead <= 0xf7) {
            // Four bytes: skip the whole sequence, keep the replacement char.
            if (remaining < 4) {
                break;
            }
            consumed = 4;
        }
        // Anything else is a stray continuation byte or an invalid lead:
        // emit the replacement character and resynchronise on the next byte.

        *unicode++ = static_cast<Uint16>(code);
        cur += consumed;
        remaining -= consumed;
        ++length;
    }

    *unicode = 0;

    return length;
}

4.3：将汉字，英文，数字和符号都正确的转换为16位的Unicode

有了iconv和上面这个函数，我们终于可以将GB2312的编码正确的转换为Unicode了。

// FileName: gb2312_to_Unicode.h
#ifndef GB2312_TO_UNICODE_H_
#define GB2312_TO_UNICODE_H_

#include < iostream >
#include < vector >
#include " GNU/iconv.h "
#include " SDL/SDL.h "

std::vector < Uint16 > getUnicode( const std:: string & str);

#endif

实现文件中包含我们上面写的从UTF-8到Unicode的函数：

#include " gb2312_to_Unicode.h "

int myUTF8_to_UNICODE(Uint16 * unicode, unsigned char * utf8, int len);

std::vector < Uint16 > getUnicode( const std:: string & str)
{
     const int CHAR_SIZE = 256 ;
     // GB2312 src
     const unsigned char * src = ( const unsigned char * )(str.c_str());
    size_t src_len = strlen(( char * )src);
     // Unicode dst to get
    unsigned char dst[CHAR_SIZE] = { 0 };
    size_t dst_len = sizeof (dst);
     // iconv arg
     const unsigned char * in = src;
    unsigned char * out = dst;

    iconv_t cd;
     // GB2312 to UTF-8
    cd = iconv_open( " UTF-8 " , " GB2312 " );
     if ((iconv_t) - 1 == cd){
        exit ( - 1 );
    }
     // conversion
    iconv(cd, ( const char ** ) & in , & src_len, ( char ** ) & out , & dst_len);

     // UTF-8 to Unicode
     int utf8Len = strlen(( char * )dst);
    Uint16 unicodeData[CHAR_SIZE] = { 0 };
     int unicodeLen = myUTF8_to_UNICODE(unicodeData, dst, utf8Len);
    std::vector < Uint16 > unicodeVectorArray;
     for ( int i = 0 ; i < unicodeLen; i ++ ) {
        unicodeVectorArray.push_back(unicodeData[i]);
    }

    iconv_close(cd);
     return unicodeVectorArray;
}

函数把一个std::string转换为Uint16的vector数组并返回，这正是SDL所需要的Unicode格式。

此文章来自于【http://blog.csdn.net/lf426/article/details/2243032】