Ubuntu下实现UTF8编码转为Unicode编码 C程序

最新推荐文章于 2024-08-12 10:48:20 发布

_那个谁

最新推荐文章于 2024-08-12 10:48:20 发布

阅读量2.3k

点赞数 1

分类专栏： Linux 文章标签： unicode ubuntu makefile

本文链接：https://blog.csdn.net/xingkong886/article/details/54971191

版权

Linux 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

参考博客：http://blog.csdn.net/tge7618291/article/details/7599902
本文单字转化代码来自上面的博客。
Ubuntu下实现UTF8编码转为Unicode编码 C程序
unicode.c

#include <stdio.h>
#include<string.h>
#include"unicode.h"
/*
 * Scratch buffer for the most recently decoded character.
 * one_utf8_to_unicode() clears it and then stores the decoded value here,
 * least significant byte first; utf8_to_unicode() reads output[0] and
 * output[1] after each call.
 */
unsigned char output[4];
/*
 * Count the consecutive 1 bits at the top of a byte.
 *
 * For the first byte of a UTF-8 sequence this count describes the sequence:
 * 0 means a single-byte (ASCII) character, 2..6 is the total number of bytes
 * in a multi-byte sequence. A count of 1 is a continuation byte (10xxxxxx);
 * 7 and 8 are produced by 0xFE and 0xFF.
 *
 * Input - the byte to inspect.
 * Returns the number of leading 1 bits, 0..8.
 */
int get_utf8_size(unsigned char Input)
{
    unsigned int mask;
    int ones = 0;

    /* Walk from the top bit downwards and stop at the first 0 bit. */
    for (mask = 0x80u; (Input & mask) != 0; mask >>= 1) {
        ++ones;
    }

    return ones;
}
/*
 * Decode one UTF-8 sequence into the file-scope buffer `output`.
 *
 * input    - points at the first byte of the sequence; when utfbytes is
 *            2..6 that many bytes are read, otherwise at most one.
 * utfbytes - leading-1-bit count of input[0] as returned by
 *            get_utf8_size(): 0 for a single-byte (ASCII) character,
 *            2..6 for a multi-byte sequence.
 *
 * On success the decoded value is stored in output[0..3], least significant
 * byte first, and the number of input bytes consumed is returned
 * (1 for ASCII, otherwise utfbytes).
 * Returns 0 and leaves output all zero when utfbytes is not 0 or 2..6, or
 * when a trailing byte is not a continuation byte of the form 10xxxxxx.
 */
int one_utf8_to_unicode(unsigned char* input, int utfbytes)
{
    unsigned long value;
    int i;

    /* memset from <string.h> replaces the legacy bzero(). */
    memset(output, 0, sizeof output);

    if (utfbytes == 0) {
        /* 0xxxxxxx: the byte is the code point. */
        output[0] = input[0];
        return 1;
    }

    if (utfbytes < 2 || utfbytes > 6) {
        return 0;
    }

    /*
     * The lead byte is utfbytes 1 bits, one 0 bit, then (7 - utfbytes)
     * payload bits; every following byte contributes its low 6 bits.
     */
    value = input[0] & (0xFFu >> (utfbytes + 1));
    for (i = 1; i < utfbytes; i++) {
        if ((input[i] & 0xC0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (input[i] & 0x3Fu);
    }

    output[0] = (unsigned char)(value & 0xFFu);
    output[1] = (unsigned char)((value >> 8) & 0xFFu);
    output[2] = (unsigned char)((value >> 16) & 0xFFu);
    output[3] = (unsigned char)((value >> 24) & 0xFFu);

    return utfbytes;
}
/*
 * Convert a UTF-8 byte string into 16-bit units, high byte first.
 *
 * inputs  - UTF-8 bytes to convert.
 * outputs - receives two bytes per character; every loop pass consumes at
 *           least one input byte, so 2 * inbyte bytes is always enough room.
 * inbyte  - number of bytes readable at inputs; nothing is converted when
 *           it is zero or negative.
 *
 * Returns the number of 2-byte units written to outputs.
 *
 * Only the two low bytes of each decoded value are stored, so values above
 * 0xFFFF are truncated. Input that cannot be decoded produces the unit
 * 0x0000: a stray continuation byte or a 0xFE/0xFF byte consumes one input
 * byte, a sequence cut off by the end of the input consumes the rest of the
 * input, and a sequence with a bad continuation byte consumes its announced
 * length.
 */
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte)
{
    int consumed = 0;
    int units = 0;

    while (consumed < inbyte) {
        int remaining = inbyte - consumed;
        int size = get_utf8_size(inputs[consumed]);
        int step = (size == 0) ? 1 : size;
        unsigned char high = 0;
        unsigned char low = 0;

        if (size == 1 || size > 6) {
            /* Not a valid first byte: skip just this byte. */
            step = 1;
        } else if (step > remaining) {
            /* Sequence is cut off: never read past the end of inputs. */
            step = remaining;
        } else if (one_utf8_to_unicode(inputs + consumed, size) != 0) {
            high = output[1];
            low = output[0];
        }

        outputs[2 * units] = high;
        outputs[2 * units + 1] = low;
        units++;
        consumed += step;
    }

    return units;
}

unicode.h文件：

#ifndef UNICODE_H
#define UNICODE_H

/*
 * Return the number of leading 1 bits of Input (0..8). For the first byte
 * of a UTF-8 sequence, 0 means a single-byte character and 2..6 is the
 * length of the sequence in bytes.
 */
int get_utf8_size(unsigned char Input);

/*
 * Decode the single UTF-8 sequence at input, where utfbytes is the value
 * get_utf8_size() returned for its first byte. The result is kept in a
 * buffer inside unicode.c. Returns the number of bytes consumed, or 0 when
 * the sequence is rejected.
 */
int one_utf8_to_unicode(unsigned char *input,int utfbytes);

/*
 * Convert inbyte bytes of UTF-8 at inputs into two bytes per character at
 * outputs, high byte first. Returns the number of characters written.
 */
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte);

#endif /* UNICODE_H */

main.c文件：

#include <stdio.h>
#include<string.h>

#include"unicode.h"
/* Destination for utf8_to_unicode(): two bytes per character, high byte first. */
unsigned char out[1024];

/*
 * Demo driver: converts a fixed UTF-8 sample and prints each converted
 * character as four hex digits followed by its 1-based position.
 */
int main(void)
{
    int len=0;              /* UTF-8 bytes        expected 16-bit value */
    unsigned char pss[]={
                         0x74,              /* 0074 */
                         0xe7,0x9f,0xa5,    /* 77e5 */
                         0xe4,0xb9,0x8e,    /* 4e4e */
                         0x49,              /* 0049 */
                         0x4e,              /* 004e  N */
                         0xe6,0x97,0xa5,    /* 65e5 */
                         0xe6,0x8a,0xa5,    /* 62a5 */
                         0xE8,0xBF,0x85,    /* 8fc5 */
                         0xE8,0x83,0xBD,    /* 80fd */
                         0x58               /* 0058  X */
                         };
    int j=0;
    int k;

    /*
     * NOTE: pss holds 22 bytes, but only the first 18 (the first eight
     * characters) are converted here. Pass sizeof pss instead of 18 to
     * convert the whole array.
     */
    len=utf8_to_unicode(pss,out,18);

    /* Two bytes per character: finish the line after every second byte. */
    for(k=0;k<len*2;k++)
    {
        printf("%.2x",out[k]);
        if(k%2>0)
            printf("    -%d\n",++j);
    }

    return 0;
}

makefile文件：

# Source files compiled into the demo program.
mainFile = main.c  unicode.c

# Name of the executable to build.
object = unicode

# all and clean are commands, not files.
.PHONY: all clean

all:$(object)

# unicode.h is a prerequisite so that editing the header triggers a rebuild.
# Recipe lines must begin with a tab character.
$(object):$(mainFile) unicode.h
	gcc -o $(object) $(mainFile) -lm -pthread -lrt

# -f keeps clean from failing when the executable does not exist yet.
clean:
	rm -f $(object)

代码完成~!
在 Ubuntu 下打开终端，进入上述文件所在的目录，输入 make 并回车完成编译，生成可执行文件 unicode；然后输入 ./unicode 并回车运行。
执行结果（main.c 中传给 utf8_to_unicode 的长度是 18 字节，因此只转换并输出了 pss 中的前 8 个字符）：

snail@ubuntu:~/桌面/c/utf8-unicode$ make
gcc -o unicode main.c  unicode.c -lm -pthread -lrt
snail@ubuntu:~/桌面/c/utf8-unicode$ ./unicode
0074    -1
77e5    -2
4e4e    -3
0049    -4
004e    -5
65e5    -6
62a5    -7
8fc5    -8
snail@ubuntu:~/桌面/c/utf8-unicode$