utf8与unicode转换

最新推荐文章于 2024-09-04 08:35:12 发布

pinggle

最新推荐文章于 2024-09-04 08:35:12 发布

阅读量995

点赞数

分类专栏： VC编程基础 Linux编程基础 C++ 编程基础；文章标签： UNICODE Unicode unicode UTF8 utf8

C++ 同时被 3 个专栏收录

50 篇文章 0 订阅

订阅专栏

VC编程基础

44 篇文章 0 订阅

订阅专栏

Linux编程基础

20 篇文章 0 订阅

订阅专栏

1. 不用VC库函数转换:

/**
 * Stateless helper that groups hand-written converters between wide
 * (wchar_t) strings and UTF-8 byte strings. Both members are static, so no
 * instance is required.
 */
class CStrConvertor
{
public:
    /**
     * Encodes a wide-character array as UTF-8.
     *
     * @param cBuf   destination byte buffer
     * @param iCBuf  capacity of cBuf in bytes
     * @param uBuf   source wide-character array
     * @param iUBuf  number of elements in uBuf
     * @return number of bytes stored in cBuf, or -1 on failure
     */
    static int Unicode2Utf8(LPSTR cBuf, int& iCBuf, LPCWSTR uBuf, int iUBuf);

    /**
     * Decodes a UTF-8 byte string into wide characters.
     *
     * NOTE(review): the out-of-line definition spells the return and size
     * types as QINT; presumably a typedef for int -- confirm.
     *
     * @param pDst     destination wide-character buffer
     * @param nDstLen  capacity of pDst in wchar_t elements
     * @param pSrc     source UTF-8 bytes
     * @param nSrcLen  size of pSrc in bytes
     * @return number of wchar_t elements stored in pDst, or -1 on failure
     */
    static int Utf82Unicode(LPWSTR pDst, int nDstLen, LPSTR pSrc, int nSrcLen);
};

/**
  * Reads one Unicode code point from a wide-character array.
  *
  * A UTF-16 high surrogate that is immediately followed by a low surrogate is
  * combined into the supplementary-plane code point it represents; any other
  * element is returned as-is.
  *
  * @param in        source wide-character array
  * @param inLength  number of elements in in
  * @param pos       in/out: index of the next unread element, advanced by 1 or 2
  * @return the code point; a negative wchar_t value comes back as a number
  *         larger than 0x10FFFF so that callers treat it as not encodable
  */
static unsigned long StrConvNextCodePoint(const wchar_t *in, int inLength, int *pos)
{
    unsigned long cp = (unsigned long)in[*pos];
    ++*pos;
    if (cp >= 0xD800UL && cp <= 0xDBFFUL && *pos < inLength)
    {
        const unsigned long low = (unsigned long)in[*pos];
        if (low >= 0xDC00UL && low <= 0xDFFFUL)
        {
            ++*pos;
            cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (low - 0xDC00UL);
        }
    }
    return cp;
}

/**
  * Returns how many UTF-8 bytes a code point needs (1 to 4), or 0 when the
  * value is above 0x10FFFF and therefore cannot be encoded.
  */
static int StrConvUtf8Length(unsigned long cp)
{
    if (cp <= 0x7FUL)     return 1;
    if (cp <= 0x7FFUL)    return 2;
    if (cp <= 0xFFFFUL)   return 3;
    if (cp <= 0x10FFFFUL) return 4;
    return 0;
}

/**
  * Converts a wide-character (Unicode) array to UTF-8.
  *
  * The output is not NUL-terminated. Elements that cannot be encoded (values
  * above 0x10FFFF, or negative where wchar_t is signed) are skipped without
  * producing output. UTF-16 surrogate pairs are merged and written as a
  * single 4-byte sequence; an unpaired surrogate is written in the 3-byte
  * form.
  *
  * @param out        destination byte buffer
  * @param outLength  capacity of out in bytes; not modified by this function
  * @param in         source wide-character array
  * @param inLength   number of elements in in
  * @return number of bytes written to out, or -1 if a pointer is NULL,
  *         inLength is negative, or out is too small for the whole result
  */
int CStrConvertor::Unicode2Utf8( char* out, int& outLength,const wchar_t * in, int inLength )
{
    if (out == NULL || in == NULL || inLength < 0)
    {
        return -1;
    }

    // Pass 1: measure the encoded size so the buffer is either large enough
    // for everything or left untouched.
    int required = 0;
    for (int i = 0; i < inLength; )
    {
        required += StrConvUtf8Length(StrConvNextCodePoint(in, inLength, &i));
    }
    if (outLength < required)
    {
        return -1;
    }

    // Pass 2: encode. Bytes are built as unsigned values so that lead and
    // continuation bytes (>= 0x80) do not depend on the signedness of char.
    unsigned char *dst = (unsigned char *)out;
    for (int i = 0; i < inLength; )
    {
        const unsigned long cp = StrConvNextCodePoint(in, inLength, &i);
        switch (StrConvUtf8Length(cp))
        {
        case 1:
            *dst++ = (unsigned char)cp;
            break;
        case 2:
            *dst++ = (unsigned char)(0xC0 | (cp >> 6));
            *dst++ = (unsigned char)(0x80 | (cp & 0x3F));
            break;
        case 3:
            *dst++ = (unsigned char)(0xE0 | (cp >> 12));
            // Every continuation byte carries exactly six payload bits; a
            // wider mask would let bits 12-13 corrupt the 10xxxxxx marker.
            *dst++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *dst++ = (unsigned char)(0x80 | (cp & 0x3F));
            break;
        case 4:
            *dst++ = (unsigned char)(0xF0 | (cp >> 18));
            *dst++ = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
            *dst++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *dst++ = (unsigned char)(0x80 | (cp & 0x3F));
            break;
        default:
            // Not encodable: skipped.
            break;
        }
    }
    return required;
}
//-------------------------------------------------------
/**
  * Decodes one UTF-8 sequence.
  *
  * @param src    first byte of the sequence
  * @param avail  number of readable bytes starting at src (at least 1)
  * @param cp     receives the decoded code point, or a value above 0x10FFFF
  *               when the byte at src does not start a usable sequence
  * @return number of bytes consumed: the sequence length on success, 1 when
  *         the byte has to be skipped (stray continuation byte, unsupported
  *         lead byte, truncated sequence, or result above 0x10FFFF)
  */
static int StrConvDecodeUtf8(const unsigned char *src, int avail, unsigned long *cp)
{
    const unsigned long kInvalid = 0xFFFFFFFFUL;
    const unsigned char lead = src[0];
    unsigned long value = 0;
    int extra = 0;

    if (lead <= 0x7F)                 // 0xxxxxxx: single byte
    {
        *cp = lead;
        return 1;
    }
    else if ((lead & 0xE0) == 0xC0)   // 110xxxxx: one continuation byte
    {
        extra = 1;
        value = lead & 0x1F;
    }
    else if ((lead & 0xF0) == 0xE0)   // 1110xxxx: two continuation bytes
    {
        extra = 2;
        value = lead & 0x0F;
    }
    else if ((lead & 0xF8) == 0xF0)   // 11110xxx: three continuation bytes
    {
        extra = 3;
        value = lead & 0x07;
    }
    else
    {
        *cp = kInvalid;
        return 1;
    }

    if (extra >= avail)               // sequence would run past the input
    {
        *cp = kInvalid;
        return 1;
    }
    for (int k = 1; k <= extra; ++k)
    {
        // A NUL or any other non-10xxxxxx byte ends the sequence early.
        if ((src[k] & 0xC0) != 0x80)
        {
            *cp = kInvalid;
            return 1;
        }
        value = (value << 6) | (unsigned long)(src[k] & 0x3F);
    }
    if (value > 0x10FFFFUL)
    {
        *cp = kInvalid;
        return 1;
    }
    *cp = value;
    return extra + 1;
}

/**
  * Returns how many wchar_t elements a decoded code point occupies: 0 for a
  * value above 0x10FFFF (skipped), 2 for a supplementary-plane code point when
  * wchar_t is 16 bits wide (UTF-16 surrogate pair), otherwise 1.
  */
static int StrConvWideUnits(unsigned long cp)
{
    if (cp > 0x10FFFFUL)
    {
        return 0;
    }
    if (cp > 0xFFFFUL && sizeof(wchar_t) == 2)
    {
        return 2;
    }
    return 1;
}

/**
  * Converts a UTF-8 byte string to wide characters (Unicode).
  *
  * Conversion stops at the first NUL byte or after insize bytes, whichever
  * comes first. Bytes that do not form a usable sequence are skipped without
  * producing output. The result is not NUL-terminated. Overlong encodings are
  * not rejected.
  *
  * The return value counts wchar_t elements, not bytes.
  *
  * @param out      destination wide-character buffer
  * @param outsize  capacity of out in wchar_t elements
  * @param in       source UTF-8 bytes
  * @param insize   size of in in bytes
  * @return number of wchar_t elements written to out, or -1 if a pointer is
  *         NULL, insize is negative, or out is too small for the whole result
  */
QINT  CStrConvertor::Utf82Unicode(LPWSTR out,  QINT outsize , LPSTR in,QINT insize)
{
    if (out == NULL || in == NULL || insize < 0)
    {
        return -1;
    }

    // Work on unsigned bytes so the lead-byte tests do not depend on whether
    // plain char is signed on the target platform.
    const unsigned char *src = (const unsigned char *)in;

    // Pass 1: count the wchar_t elements needed, never reading past insize.
    int required = 0;
    for (int pos = 0; pos < insize && src[pos] != 0; )
    {
        unsigned long cp = 0;
        pos += StrConvDecodeUtf8(src + pos, insize - pos, &cp);
        required += StrConvWideUnits(cp);
    }
    if (outsize < required)
    {
        return -1;
    }

    // Pass 2: store whole wchar_t values rather than individual bytes, so the
    // result is correct for any wchar_t width and byte order.
    int written = 0;
    for (int pos = 0; pos < insize && src[pos] != 0; )
    {
        unsigned long cp = 0;
        pos += StrConvDecodeUtf8(src + pos, insize - pos, &cp);

        const int units = StrConvWideUnits(cp);
        if (units == 1)
        {
            out[written++] = (wchar_t)cp;
        }
        else if (units == 2)
        {
            const unsigned long v = cp - 0x10000UL;
            out[written++] = (wchar_t)(0xD800UL + (v >> 10));
            out[written++] = (wchar_t)(0xDC00UL + (v & 0x3FFUL));
        }
    }
    return written;
}

转载自: http://blog.csdn.net/nrc_douningbo/article/details/5880602

2. 使用VC库函数转换:

/**
 * Converts a NUL-terminated wide string to a newly allocated UTF-8 string
 * using the Win32 WideCharToMultiByte API.
 *
 * The returned buffer is allocated with new[]; the caller owns it and must
 * release it with delete[].
 *
 * @param unicode  NUL-terminated wide-character source string
 * @return pointer to a NUL-terminated UTF-8 string
 */
char* UNICODEtoUTF8(const wchar_t* unicode)
{
	// First call: no destination buffer, so only the required byte count is
	// returned (it includes the terminator because the source length is -1).
	const int byteCount = WideCharToMultiByte(CP_UTF8, 0, unicode, -1, NULL, 0, NULL, NULL);

	// One spare byte, fully zeroed, so the result is terminated even if the
	// conversion below writes nothing.
	const int bufferSize = byteCount + 1;
	char* utf8 = new char[bufferSize];
	memset(utf8, 0, bufferSize);

	// Second call: perform the conversion into the buffer.
	WideCharToMultiByte(CP_UTF8, 0, unicode, -1, utf8, byteCount, NULL, NULL);

	utf8[byteCount] = '\0';
	return utf8;
}

/**
 * Converts a NUL-terminated UTF-8 string to a newly allocated wide string
 * using the Win32 MultiByteToWideChar API.
 *
 * The returned buffer is allocated with new[]; the caller owns it and must
 * release it with delete[].
 *
 * @param utf8  NUL-terminated UTF-8 source string
 * @return pointer to a NUL-terminated wide-character string
 */
wchar_t* UTF8toUNICODE(const char* utf8)
{
	// First call: no destination buffer, so only the required size in wchar_t
	// elements is returned (it includes the terminator because the source
	// length is -1).
	int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
	wchar_t* wstr = new wchar_t[len+1];
	// memset counts bytes, so the element count must be scaled by
	// sizeof(wchar_t) to clear the whole buffer.
	memset(wstr, 0, (len+1) * sizeof(wchar_t));
	// Second call: perform the conversion into the buffer.
	MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wstr, len);
	wstr[len]= L'\0';
	return wstr;
}