Conversion Between Unicode-UCS-4 And UTF-8 (Unicode-UCS-4 与 UTF-8 之间的转换)

Note: parts of this article are based on RFC 2279, "UTF-8, a transformation format of ISO 10646".

I'm practising my C programming skills. I found that some friends had already written conversion code, but it was not very readable. So I learned the rules for converting between Unicode and UTF-8 and wrote the two functions shown below.

Please focus on the functions fnUnicode2UTF8() and fnUTF82Unicode() in the source code. Don't be confused by the output-formatting code in the main() function :-)

/********** Pure C Codes for Converting Between UTF8 & Unicode *************
*                                                                          *
* Author: Peter Lee (peterlee.com.cn <at> gmail.com)                       *
*   Date: 2008-11-21                                                       *
*                                                                          *
* Please keep this information while referencing the code below.           *
* Thanks so much!                                                          *
*                                                                          *
* Welcome To Peter Lee's Blog Website:                                     *
*         http://www.peterlee.com.cn                                       *
*         http://blog.peterlee.com.cn                                      *
*                                                                          *
***************************************************************************/

#include <stdio.h>
#include <string.h>
#define MAX 6 /* Max length of a UTF-8 character */

/* Encode one UCS-4 code point as a UTF-8 byte sequence (RFC 2279 layout,
 * up to six bytes).
 *
 * Unicode Range        : UTF-8
 * ---------------------:------------------------------------------------------
 * U00000000 - U0000007F: 0xxxxxxx
 * U00000080 - U000007FF: 110xxxxx 10xxxxxx
 * U00000800 - U0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * U00010000 - U001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U00200000 - U03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U04000000 - U7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Parameters:
 *   unicode - code point to encode, 0x00000000 <= unicode <= 0x7FFFFFFF.
 *   UTF8    - output buffer of at least MAX+1 chars. The encoded bytes are
 *             stored in UTF8[0..n-1] and the byte count n is stored in
 *             UTF8[MAX]. No NUL terminator is written.
 *
 * A value above 0x7FFFFFFF cannot be represented: in that case no sequence
 * bytes are written and UTF8[MAX] is set to 0, so the caller never sees a
 * stale length.
 */
void fnUnicode2UTF8(unsigned long unicode, char UTF8[])
{
    unsigned char lead; /* length marker bits of the first byte */
    int len;            /* total number of bytes in the sequence */
    int i;

    if ( unicode <= 0x0000007FUL )
    {
        /* Plain ASCII: the single byte is the code point itself. */
        UTF8[0]   = (char)(unicode);
        UTF8[MAX] = 1;
        return;
    }

    if      ( unicode <= 0x000007FFUL ) { len = 2; lead = 0xC0; }
    else if ( unicode <= 0x0000FFFFUL ) { len = 3; lead = 0xE0; }
    else if ( unicode <= 0x001FFFFFUL ) { len = 4; lead = 0xF0; }
    else if ( unicode <= 0x03FFFFFFUL ) { len = 5; lead = 0xF8; }
    else if ( unicode <= 0x7FFFFFFFUL ) { len = 6; lead = 0xFC; }
    else
    {
        /* Out of range: report "nothing encoded". */
        UTF8[MAX] = 0;
        return;
    }

    /* Fill the continuation bytes from last to first, 6 payload bits each. */
    for ( i = len - 1; i > 0; --i )
    {
        UTF8[i]   = (char)(0x80 | (unicode & 0x3F));
        unicode >>= 6;
    }

    /* Whatever bits remain fit in the payload area of the lead byte. */
    UTF8[0]   = (char)(lead | unicode);
    UTF8[MAX] = (char)len;
}

#ifndef MAX
#define MAX 6 /* Max length of a UTF-8 character (fallback; no-op if already defined) */
#endif

/* Decode one UTF-8 byte sequence (RFC 2279 layout, up to six bytes) into a
 * UCS-4 code point.
 *
 * UTF-8 lead byte : sequence length : payload bits in lead byte
 * ----------------:-----------------:--------------------------
 * 0xxxxxxx        : 1               : 7
 * 110xxxxx        : 2               : 5
 * 1110xxxx        : 3               : 4
 * 11110xxx        : 4               : 3
 * 111110xx        : 5               : 2
 * 1111110x        : 6               : 1
 * Every following byte must look like 10xxxxxx and carries 6 payload bits.
 *
 * Parameters:
 *   UTF8 - buffer of at least MAX+1 chars holding the sequence at UTF8[0].
 *          On return UTF8[MAX] holds the number of bytes consumed.
 *
 * Returns the decoded code point (0x00000000 .. 0x7FFFFFFF).
 *
 * Malformed input - a lead byte of the form 10xxxxxx, 0xFE or 0xFF, or a
 * following byte that is not 10xxxxxx - yields a return value of 0 with
 * UTF8[MAX] set to 0, which distinguishes it from a genuine U+0000
 * (return 0, UTF8[MAX] == 1). Overlong encodings are not rejected.
 */
unsigned long fnUTF82Unicode(char UTF8[])
{
    const unsigned char lead = (unsigned char)(UTF8[0]);
    unsigned long unicode;
    int len;
    int i;

    /* Classify the lead byte: sequence length and its payload bits. */
    if      ( lead < 0x80 )         { len = 1; unicode = lead;        }
    else if ( 0x06 == (lead >> 5) ) { len = 2; unicode = lead & 0x1F; }
    else if ( 0x0E == (lead >> 4) ) { len = 3; unicode = lead & 0x0F; }
    else if ( 0x1E == (lead >> 3) ) { len = 4; unicode = lead & 0x07; }
    else if ( 0x3E == (lead >> 2) ) { len = 5; unicode = lead & 0x03; }
    else if ( 0x7E == (lead >> 1) ) { len = 6; unicode = lead & 0x01; }
    else
    {
        /* Stray continuation byte (10xxxxxx) or 0xFE/0xFF. */
        UTF8[MAX] = 0;
        return 0;
    }

    /* Append 6 bits per continuation byte, checking each one first. This
       also stops at a NUL terminator instead of reading past it. */
    for ( i = 1; i < len; ++i )
    {
        const unsigned char next = (unsigned char)(UTF8[i]);

        if ( (next & 0xC0) != 0x80 )
        {
            UTF8[MAX] = 0;
            return 0;
        }
        unicode = (unicode << 6) | (unsigned long)(next & 0x3F);
    }

    UTF8[MAX] = (char)len;
    return unicode;
}

/* Lookup table from a hex digit to its 4-character binary string, indexed
   by (digit - '0'). Assumes an ASCII-compatible character set: '0'..'9'
   occupy indices 0..9, the seven characters between '9' and 'A' map to
   empty strings, and 'A'..'F' occupy indices 17..22. */
char Hex2Bin[23][5] = {"0000", "0001", "0010", "0011",
                       "0100", "0101", "0110", "0111",
                       "1000", "1001",
                       "", "", "", "", "", "", "",
                       "1010", "1011",
                       "1100", "1101", "1110", "1111"};

/* Convert a NUL-terminated hexadecimal string to its binary-digit string.
 *
 * Parameters:
 *   hex - input string of hex digits; both upper- and lowercase a-f are
 *         accepted. Characters that are not hex digits contribute nothing
 *         to the output.
 *   bin - output buffer; must have room for 4 chars per input character
 *         plus the terminating NUL.
 */
void fnHex2Bin(char hex[], char bin[])
{
    const char *in;
    const char *nibble;
    char *out = bin;
    size_t n;
    int c;

    for ( in = hex; *in != '\0'; ++in )
    {
        c = (unsigned char)(*in);

        /* Fold lowercase digits onto the uppercase table entries. */
        if ( 'a' <= c && c <= 'f' )
            c -= 'a' - 'A';

        /* Anything outside '0'..'F' would index past the table. */
        if ( c < '0' || 'F' < c )
            continue;

        /* Append at the running end instead of rescanning with strcat. */
        nibble = Hex2Bin[c - '0'];
        n      = strlen ( nibble );
        memcpy ( out, nibble, n );
        out   += n;
    }

    *out = '\0';
}

/* Demo driver: encodes the sample code point U+4F60 to UTF-8, prints both
   forms in hex and binary, then decodes the bytes back and prints again. */
int main(int argc, char* argv[])
{
    /* hex must hold either an unsigned long printed with %lX (up to 16
       digits where long is 64-bit) or the 2*MAX hex digits of a UTF-8
       sequence; bin needs four binary digits per hex digit. */
    enum { HEX_DIGITS = 16 };
    int i;
    int len;
    char UTF8[MAX+1] = {0};
    char hex[HEX_DIGITS+1];
    char bin[4*HEX_DIGITS+1];
    unsigned long unicode = 0x4F60;
    /* Testing for the Han (Chinese) character:
       Unicode: 4F60; "ni3", means "you". */

    (void)argc;
    (void)argv;

/* Unicode 2 UTF8 */
    printf ("Unicode 2 UTF8:\n");

    /* unicode is an unsigned long, so the conversion needs the l modifier. */
    snprintf ( hex, sizeof hex, "%lX", unicode );
    fnHex2Bin ( hex, bin );
    printf ( "Unicode Hex: %s\n", hex );
    printf ( "Unicode Bin: %s\n", bin );

    fnUnicode2UTF8 ( unicode, UTF8 );
    len = UTF8[MAX]; /* the encoder stores the byte count in the last slot */

    /* Make the hex string for UTF8[] */
    for ( i = 0; i < len; ++i )
        snprintf ( hex+2*i, sizeof hex - (size_t)(2*i), "%02X",
                   (unsigned)(unsigned char)(UTF8[i]) );
    hex[2*len] = 0;

    fnHex2Bin ( hex, bin );

    /* hex is already exactly 2*len characters, so no field width is needed. */
    printf ( "    UTF Hex: %s\n", hex );
    printf ( "    UTF Bin: %s\n", bin );
/* Unicode 2 UTF8 */


    printf ("\n");


/* UTF8 2 Unicode */
    printf ("UTF8 2 Unicode:\n");

    printf ( "    UTF Hex: %s\n", hex );
    printf ( "    UTF Bin: %s\n", bin );

    unicode = fnUTF82Unicode ( UTF8 );

    snprintf ( hex, sizeof hex, "%lX", unicode );
    fnHex2Bin ( hex, bin );
    printf ( "Unicode Hex: %s\n", hex );
    printf ( "Unicode Bin: %s\n", bin );
/* UTF8 2 Unicode */

    return 0;
}
/* Output:
Unicode 2 UTF8:
Unicode Hex: 4F60
Unicode Bin: 0100111101100000
    UTF Hex: E4BDA0
    UTF Bin: 111001001011110110100000

UTF8 2 Unicode:
    UTF Hex: E4BDA0
    UTF Bin: 111001001011110110100000
Unicode Hex: 4F60
Unicode Bin: 0100111101100000
*/
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值