linux c 字符串编码转换,linux c 怎么样进行字符编码转换

#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ALIVE() printf("still alive here [%s]:[%d]\n",__FUNCTION__, __LINE__)

/*
 * Extract the low 8 bits (bits 0-7) of a 16-bit value.
 * The argument is parenthesized so that expressions built from operators
 * with lower precedence than '&' (such as a | b) are masked as a whole.
 */
#define LOWER_SHORT(num) (0x0ff & (num))

/*
 * Extract the high 8 bits (bits 8-15) of a 16-bit value, shifted down into
 * the range 0..255. The argument is parenthesized for the same reason.
 */
#define HIGH_SHORT(num) ((0xff00 & (num)) >> 8)

/*
 * Build-time switches, both off by default:
 *   _BIG_ENDIAN - swaps which byte of each pair char_to_char16() and
 *                 char16_to_char() treat as the high byte.
 *   DEBUG       - enables the hex dumps in utf16_to_char().
 * NOTE(review): names beginning with an underscore and an upper-case letter
 * are reserved for the implementation, and some toolchains predefine
 * _BIG_ENDIAN - confirm before enabling it here.
 */
//#define _BIG_ENDIAN

//#define DEBUG

/*
 * One code unit as used by the conversion helpers in this file. The type is
 * an unsigned int, but the helpers only ever store or read its low 16 bits.
 */
typedef unsigned int char_16;

/*
 * Count the char_16 units that come before the first zero unit.
 *
 * in_char: array of char_16 units terminated by a unit equal to 0.
 * Returns the number of units preceding that terminator.
 */
static int char16len(char_16 *in_char)
{
    const char_16 *cursor = in_char;
    int count = 0;

    for (; *cursor != 0; ++cursor) {
        ++count;
    }

    return count;
}

/*
 * Convert a GB2312 byte sequence to UCS-2 using iconv.
 *
 * inchar   : input bytes in GB2312.
 * in_size  : on entry, number of input bytes; on return, the number of input
 *            bytes iconv() left unconverted.
 * outchar  : buffer that receives the UCS-2 bytes.
 * out_size : on entry, capacity of outchar in bytes; on return, the number
 *            of bytes still free in it.
 *
 * Prints every produced byte to stdout as "%2x:".
 *
 * Returns 0 on success; -1 if any argument is NULL, if the converter cannot
 * be opened, or if iconv() reports an error (the latter two are also
 * reported through perror()).
 */
static int gb2312_to_utf16(char *inchar, size_t *in_size, char *outchar, size_t *out_size)
{
    /* iconv() advances the output pointer, so remember where output starts. */
    const unsigned char *out_begin = (const unsigned char *)outchar;
    size_t out_capacity;
    size_t written;
    size_t i;
    iconv_t cd;
    int rc = -1;

    if (NULL == inchar || NULL == in_size || NULL == outchar || NULL == out_size)
    {
        return -1;
    }

    out_capacity = *out_size;

    if ((iconv_t)-1 == (cd = iconv_open("UCS-2", "GB2312")))
    {
        perror("iconv_open() error");
        return -1;
    }

    if ((size_t)-1 == iconv(cd, &inchar, in_size, &outchar, out_size))
    {
        perror("iconv() error");
        goto done;
    }

    /*
     * Dump exactly the bytes that were produced. Scanning for a zero byte
     * is not usable here: UCS-2 output contains zero bytes, and the bytes
     * after the converted data were never written.
     */
    written = out_capacity - *out_size;
    for (i = 0; i < written; i++)
    {
        printf("%2x:", (unsigned int)out_begin[i]);
    }

    rc = 0;

done:
    iconv_close(cd);
    return rc;
}

/*
 * Pack a byte buffer into char_16 units, two bytes per unit.
 *
 * in       : input bytes.
 * in_size  : number of bytes in `in`; a trailing odd byte is ignored.
 * out      : receives in_size / 2 units.
 * out_size : capacity of `out`, in units.
 *
 * Without _BIG_ENDIAN the first byte of each pair becomes the high 8 bits of
 * the unit and the second byte the low 8 bits; with _BIG_ENDIAN defined the
 * roles are swapped.
 *
 * Returns 0 on success, -1 when in_size exceeds 2 * out_size.
 */
static int char_to_char16(char *in, size_t in_size, char_16 out[], size_t out_size)
{
    size_t i;
    unsigned short high, low;

    /* %zu is the conversion specifier that matches size_t. */
    printf("insize is [%zu], out size is [%zu]\n", in_size, out_size);

    if (in_size > 2 * out_size)
    {
        return -1;
    }

    ALIVE();

    /*
     * NOTE(review): the loop header was truncated in the source listing.
     * The bound in_size / 2 is reconstructed from the two-bytes-per-unit
     * indexing below and the capacity check above.
     */
    for (i = 0; i < in_size / 2; i++)
    {
#ifdef _BIG_ENDIAN
        high = in[2 * i + 1] & 0xff;
        low  = in[2 * i] & 0xff;
#else
        high = in[2 * i] & 0xff;
        low  = in[2 * i + 1] & 0xff;
#endif
        out[i] = (high << 8) | low;
    }

    return 0;
}

/*
 * Unpack char_16 units into a byte buffer, two bytes per unit.
 *
 * in       : input units.
 * size_in  : number of units in `in`.
 * out      : receives 2 * size_in bytes.
 * size_out : capacity of `out`, in bytes.
 *
 * Without _BIG_ENDIAN each unit is written high byte first, then low byte;
 * with _BIG_ENDIAN defined the low byte is written first.
 *
 * Returns 0 on success, -1 when size_out is smaller than 2 * size_in.
 */
static int char16_to_char(char_16 in[], size_t size_in, char *out, size_t size_out)
{
    char *dst = out;
    int unit = 0;

    if (size_out < 2*size_in)
    {
        return -1;
    }

    while (unit < size_in)
    {
#ifdef _BIG_ENDIAN
        *dst++ = LOWER_SHORT(in[unit]);
        *dst++ = HIGH_SHORT(in[unit]);
#else
        *dst++ = HIGH_SHORT(in[unit]);
        *dst++ = LOWER_SHORT(in[unit]);
#endif
        unit++;
    }

    return 0;
}

/*
 * Convert a UCS-2 byte sequence to GB2312 using iconv.
 *
 * inchar   : input bytes in UCS-2.
 * in_size  : on entry, number of input bytes; updated by iconv().
 * outchar  : buffer that receives the GB2312 bytes.
 * out_size : on entry, capacity of outchar in bytes; updated by iconv().
 *
 * Returns 0 on success, -1 when the converter cannot be opened or iconv()
 * reports an error; both failures are reported through perror().
 */
static int utf16_to_gb2312(char *inchar, size_t *in_size, char *outchar, size_t *out_size)
{
    int status = 0;
    iconv_t conv = iconv_open("GB2312", "UCS-2");

    if (conv == (iconv_t)-1)
    {
        perror("iconv_open() error");
        return -1;
    }

    if (iconv(conv, &inchar, in_size, &outchar, out_size) == (size_t)-1)
    {
        perror("iconv() error");
        status = -1;
    }

    /* The descriptor is released on both the success and the error path. */
    iconv_close(conv);
    return status;
}

/*
 * Convert a NUL-terminated GB2312 string into an array of char_16 units.
 *
 * in_char   : NUL-terminated GB2312 string.
 * out_wchar : receives the converted units; the terminating NUL of in_char
 *             is converted too.
 * out_size  : capacity of out_wchar, in units.
 *
 * The string is first converted with gb2312_to_utf16() into a temporary
 * buffer of two bytes per input byte, then packed into units with
 * char_to_char16().
 *
 * Returns 0 on success; -1 if an argument is NULL, if the input length
 * exceeds 2 * out_size, if the temporary buffer cannot be allocated, or if
 * either conversion step fails.
 */
extern int char_to_utf16(char *in_char, char_16 *out_wchar, size_t out_size)
{
    size_t in_len;
    size_t tmp_len;
    size_t tmp_len_org;
    char *tmp;
    int rc = -1;

    /* strlen() on a null pointer is undefined behaviour. */
    if (NULL == in_char || NULL == out_wchar)
    {
        return -1;
    }

    in_len = strlen(in_char) + 1;   /* +1: convert the terminator as well */
    tmp_len = 2 * in_len;

    /* %zu is the conversion specifier that matches size_t. */
    printf("inlen is [%zu], out size is [%zu]\n", in_len, out_size);

    /* Early rejection; char_to_char16() re-checks against the real byte count. */
    if (in_len > 2 * out_size)
    {
        return -1;
    }

    if (NULL == (tmp = malloc(tmp_len)))
    {
        return -1;
    }

    /* gb2312_to_utf16() decrements tmp_len, so keep the capacity. */
    tmp_len_org = tmp_len;

    if (0 != gb2312_to_utf16(in_char, &in_len, tmp, &tmp_len))
    {
        goto done;
    }

    printf("tmp length is [%zu]:[%zu]\n", tmp_len, in_len);

    /* tmp_len_org - tmp_len is the number of bytes actually written to tmp. */
    if (0 != char_to_char16(tmp, tmp_len_org - tmp_len, out_wchar, out_size))
    {
        goto done;
    }

    ALIVE();
    rc = 0;

done:
    free(tmp);
    return rc;
}

/*
 * Convert a zero-terminated array of char_16 units into a GB2312 string.
 *
 * in_char  : units terminated by a unit equal to 0; the terminator is
 *            converted too.
 * out_char : receives the GB2312 bytes.
 * out_size : capacity of out_char, in bytes.
 *
 * The units are first unpacked into a temporary byte buffer with
 * char16_to_char(), then converted with utf16_to_gb2312().
 *
 * Returns 0 on success; -1 if an argument is NULL, if the temporary buffer
 * cannot be allocated, or if either conversion step fails.
 */
extern int utf16_to_char(char_16 *in_char, char *out_char, size_t out_size)
{
    size_t in_size;
    size_t in_len;
    size_t out_len;
    char *tmp;
    int rc = -1;
#ifdef DEBUG
    size_t j;
#endif

    if (NULL == in_char || NULL == out_char)
    {
        return -1;
    }

    in_size = (size_t)char16len(in_char) + 1;   /* +1: include the terminator */

#ifdef DEBUG
    /*
     * NOTE(review): this loop header was truncated in the source listing.
     * The bound in_size is reconstructed from the number of units in in_char.
     */
    for (j = 0; j < in_size; j++)
    {
        fprintf(stderr, "[%4x]:",in_char[j]);
    }
    fprintf(stderr, "\n");
#endif

    /* Two zero-initialized bytes per unit. */
    if (NULL == (tmp = calloc(in_size, 2)))
    {
        return -1;
    }

    if (0 != char16_to_char(in_char, in_size, tmp, 2 * in_size))
    {
        goto done;
    }

#ifdef DEBUG
    for (j = 0; j < 2 * in_size; j++)
    {
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        fprintf(stderr, "[%2x]:", (unsigned int)(unsigned char)tmp[j]);
    }
    fprintf(stderr, "\n");
#endif

    in_len = in_size * 2;
    out_len = out_size;

    if (0 != utf16_to_gb2312(tmp, &in_len, out_char, &out_len))
    {
        goto done;
    }

    rc = 0;

done:
    free(tmp);
    return rc;
}

1、什么是UTF-8?它与UNICODE是什么关系?

解答:

Unicode的最初目标,是用一个16位的编码为超过65000个字符提供映射。但这还不够:它不能覆盖历史上的全部文字,也不能解决传输与实现上的难题(implementation headaches),尤其是在那些基于网络的应用中——已有的软件必须做大量的修改才能处理16位的数据。

因此,Unicode用一些基本的保留字符制定了三套编码方式,分别是UTF-8、UTF-16和UTF-32。正如名字所示,在UTF-8中,字符是以8位序列来编码的,用一个或几个字节来表示一个字符。这种方式的最大好处,是UTF-8保留了ASCII字符的编码作为它的一部分,例如,在UTF-8和ASCII中,“A”的编码都是0x41。UTF-16和UTF-32分别是Unicode的16位和32位编码方式。考虑到最初的目的,通常说的Unicode就是指UTF-16。

2、unicode的参考网站

.

3、unicode三种编码之间的转换

4、也可以通过查表的方式转换gb2312码和unicode码。表文件见附件gb2312.txt,文件中有使用说明。

// Fragment: converts s_len bytes starting at `s` from GBK to UTF-8, writing
// the result into the fixed-size buffer `buff`.
// `s` and `len` are declared outside this fragment.
// The functional cast iconv_t(-1) and the commented-out cout lines make this
// C++ rather than C.
char buff[1024];

size_t buff_len = 1024;

size_t s_len = len;

// iconv_open takes the target encoding first, then the source encoding.
iconv_t cd = iconv_open("utf-8", "gbk");

// iconv_open signals failure by returning (iconv_t)-1; nothing is done in
// that case.
if (cd != iconv_t(-1))

{

char* ptr = &buff[0];

size_t remain = buff_len;

// iconv updates s, s_len, ptr and remain through the pointers passed in.
// NOTE(review): the return value stored in d_len is never checked here;
// confirm against the iconv() specification what it reports before using it
// as a length.
size_t d_len = iconv(cd, &s, &s_len, &ptr, &remain);

// NOTE(review): the loop header below was truncated in the source listing;
// the bound (bytes written = buff_len - remain) is a guess - confirm.
//for (size_t i=0; i<buff_len-remain; i++)
//      cout << buff[i];

//cout << '\n';

iconv_close(cd);

}

用它可以在Linux下实现GBK、GB2312与Unicode之间的转换

可用在Clucene上

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值