utf8 string linux, Linux string conversion from UTF-8 to UNICODE, UCS-2LE, UCS-4LE

最新推荐文章于 2024-04-25 18:07:29 发布

半个科创史学先生

最新推荐文章于 2024-04-25 18:07:29 发布

阅读量205

点赞数

文章标签： utf8 string linux

Linux string conversion from UTF-8 to UNICODE, UCS-2LE, UCS-4LE.

It may surprise Windows developers that Linux differs from Windows in two distinct ways when it comes to character sets.

1. A standard char * string is UTF-8 encoded by default, so strlen may return more than 2 × (number of Chinese characters) + (number of English characters).

2. Each wchar_t character of a UNICODE string is 4 bytes wide.

See the following example, run on Ubuntu 16.04:

#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h> // wprintf is declared here

#include <iostream>

using namespace std;

/*
 * Writes the first iLen bytes starting at str to stdout, each as a two-digit
 * lowercase hexadecimal value followed by a single space, and then ends the
 * line. Nothing but the newline is written when iLen is zero or negative.
 */
void PrintHexData( unsigned char *str, int iLen )
{
    const unsigned char *cursor = str;
    int remaining = iLen;

    while ( remaining > 0 )
    {
        printf( "%02x ", *cursor );
        ++cursor;
        --remaining;
    }

    fputs( "\n", stdout );
}

/*
 * Demonstrates how a UTF-8 encoded char string converts, via iconv, to the
 * encodings "UCS-2LE", "UCS-4LE" and "UNICODE", and compares the result with
 * the in-memory bytes of the equivalent wchar_t literal.
 *
 * For each target encoding the consumed/produced byte counts and a hex dump
 * of the converted bytes are printed. After all conversions the raw bytes of
 * the wchar_t string are dumped as the reference. The program then waits for
 * one character on stdin and returns 0.
 *
 * Failures of iconv_open() or iconv() are reported on stderr and the
 * affected encoding is skipped; the remaining encodings are still processed.
 */
int main()
{
    char szName[128] = "Linux-7字符串转换测试";
    wchar_t wzName[128] = L"Linux-7字符串转换测试";
    wchar_t szTransName[128];
    const char *strCharSet[] = { "UCS-2LE", "UCS-4LE", "UNICODE" };
    const size_t nCharSets = sizeof(strCharSet) / sizeof(strCharSet[0]);

    memset(szTransName, 0, sizeof(szTransName));

    cout << "char size : " << sizeof(char) << " Bytes" << endl;
    cout << "wchar_t size : " << sizeof(wchar_t) << " Bytes" << endl;
    cout << "RAND_MAX : " << RAND_MAX << endl;

    // The UTF-8 byte length exceeds 2 bytes per character for non-English text.
    const size_t iNameByteLen = strlen(szName);
    cout << "String: " << szName << endl;
    cout << " length of strlen: " << iNameByteLen << endl;

    const size_t wzlen = wcslen(wzName);
    printf(" length wcslen of wchar_t %zu \n", wzlen);

    cout << "string: " << szName << " from utf-8 conversion result:" << endl;

    for (size_t iset = 0; iset < nCharSets; iset++)
    {
        cout << "dst char set: " << strCharSet[iset] << endl;

        iconv_t ct = iconv_open(strCharSet[iset], "utf-8");
        if (ct == (iconv_t)-1)
        {
            // Report an unsupported conversion instead of skipping it silently.
            fprintf(stderr, "iconv_open(%s) failed: %s\n",
                    strCharSet[iset], strerror(errno));
            continue;
        }

        // iconv advances the pointers and decrements the remaining byte
        // counts, so the starting sizes are kept separately for reporting.
        char *s_in = szName;
        char *s_out = (char *)szTransName;
        size_t iInLen = strlen(szName);
        size_t iOutLen = sizeof(szTransName);
        const size_t iInLen1 = iInLen;
        const size_t iOutLen1 = iOutLen;

        // Pre-fill with 0xff so bytes iconv did not write can be told apart
        // from genuine zero bytes in the converted output.
        memset(szTransName, 0xff, iOutLen);

        const size_t iconved = iconv(ct, &s_in, &iInLen, &s_out, &iOutLen);
        const int iconvErrno = errno; // saved before iconv_close can overwrite it
        iconv_close(ct);

        // iconv is a C API: it signals failure with (size_t)-1 and errno,
        // never with a C++ exception.
        if (iconved == (size_t)-1)
        {
            fprintf(stderr, "iconv to %s failed: %s\n",
                    strCharSet[iset], strerror(iconvErrno));
            continue;
        }

        const size_t iConvBytes = iOutLen1 - iOutLen;
        printf("ICONV in len: %zu=>%zu out len %zu=>%zu (%zu) conv ret: %zu\n",
               iInLen1, iInLen, iOutLen1, iOutLen, iConvBytes, iconved);

        PrintHexData((unsigned char *)szTransName, (int)iConvBytes);
    }

    printf("expected Unicode linux string : \n");
    PrintHexData((unsigned char *)wzName, (int)(wcslen(wzName) * sizeof(wchar_t)));

    getchar();
    return 0;
}

running result:

char size : 1 Bytes

wchar_t size : 4 Bytes

RAND_MAX : 2147483647

String: Linux-7字符串转换测试

length of strlen: 28

length wcslen of wchar_t 14

string: Linux-7字符串转换测试 from utf-8 conversion result:

dst char set: UCS-2LE

ICONV in len: 28=>0 out len 512=>484 (28) conv ret: 0

4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b

dst char set: UCS-4LE

ICONV in len: 28=>0 out len 512=>456 (56) conv ret: 0

4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00

dst char set: UNICODE

ICONV in len: 28=>0 out len 512=>482 (30) conv ret: 0

ff fe 4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b

expected Unicode linux string :

4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00