字符串编码问题学习

103 篇文章 0 订阅
5 篇文章 0 订阅

我们native层配置文件用了两套编码:unicode和gbk(兼容gb2312),都是用两个字节表示一个中文字符。现在要统一采用gbk编码,顺便熟悉一下gbk。

python3.3的测试

为方便起见,使用python做测试。python3默认的字符串类str采用unicode,s = '中文' 等价于 s = u'中文'。

dp_gbk_bs将字符串转化成gbk编码,并输出每个中文对应的unsigned short。

wd_2_gbk_val 将单个字符转化成gbk编码的unsigned short值。

gbk_val_2_wd 将gbk编码下的一个short数值还原成中文字符串。

# function 0
# Print the GBK code value of every character in a string.
def dp_gbk_bs(name):
    '''
    Dump a string as GBK code values, one unsigned integer per character.

    The whole GBK-encoded byte string is printed first, followed by one
    "<char>  =  <value>" line per character.  Each character is encoded on
    its own and its bytes are read as a little-endian unsigned integer, so
    a two-byte character such as '中' (bytes d6 d0) prints 53462 (0xd0d6)
    and a one-byte ASCII character prints its ASCII code.

    name -- the str to dump.

    Returns None.  Raises UnicodeEncodeError when a character cannot be
    represented in GBK.
    '''
    gbk_bs = str.encode(name, 'gbk')
    print( name, "(gbk2312) = ", gbk_bs)

    # Encode character by character rather than unpacking len(name)
    # 16-bit fields from the whole buffer: GBK uses one byte for ASCII and
    # two bytes for Chinese characters, so the buffer length is not always
    # 2 * len(name).  The byte order is fixed to little-endian so the values
    # do not depend on the host and match what gbk_val_2_wd expects.
    for ch in name:
        print(ch, " = ", int.from_bytes(str.encode(ch, 'gbk'), 'little'))

# function 1
def wd_2_gbk_val(wd):
    '''
    Return the GBK code value of a single character as an unsigned integer.

    The GBK bytes of the character are interpreted in little-endian order,
    i.e. the first encoded byte becomes the low byte of the result:
        中  =  53462 ,  0xd0d6   (GBK bytes d6 d0)
    A one-byte (ASCII) character yields its ASCII code.

    wd -- the str to convert; its GBK encoding must be one or two bytes.

    Raises ValueError when the encoding is empty or longer than two bytes,
    and UnicodeEncodeError when wd cannot be represented in GBK.
    '''
    gbk_bt = str.encode(wd, 'gbk')
    if not 1 <= len(gbk_bt) <= 2:
        raise ValueError(
            'expected 1 or 2 GBK bytes, got %d for %r' % (len(gbk_bt), wd))
    # Explicit little-endian keeps the value host-independent and makes it
    # the exact inverse of gbk_val_2_wd, which always reverses the bytes.
    return int.from_bytes(gbk_bt, 'little')

# Demo: print the GBK-encoded bytes of a sample string and the code value
# of each of its characters.
ss = u'中国技术交易大厦'
dp_gbk_bs(ss)

# function 2
# Turn the integer into its little-endian bytes, then decode them as GBK.
def gbk_val_2_wd(us_v):
    '''
    Transform an unsigned integer GBK value back into its character(s).
        53462 ,  0xd0d6  => 中

    The value is laid out in little-endian order (low byte first), which
    gives the byte sequence GBK expects: 0xd0d6 -> d6 d0.  The hex form,
    the byte-swapped hex form and the resulting bytes are printed as a
    trace before the decoded string is returned.

    us_v -- non-negative int, e.g. a value produced by wd_2_gbk_val.

    Returns the decoded str.  Raises ValueError for a negative value and
    UnicodeDecodeError when the bytes are not valid GBK.
    '''
    if us_v < 0:
        raise ValueError('us_v must be non-negative, got %r' % (us_v,))
    print('hex str = ', hex(us_v))

    # Build the bytes from the integer itself instead of slicing the text
    # returned by hex(): hex() drops leading zeros, so small values such as
    # 10 ('0xa') produce an odd number of hex digits that bytes.fromhex
    # rejects.  At least one byte is always emitted so 0 works too.
    n_bytes = max(1, (us_v.bit_length() + 7) // 8)
    bys = us_v.to_bytes(n_bytes, 'little')

    print('reversed hex str = ', ''.join('%02x' % b for b in bys))
    print('bytes = ', bys)
    return bys.decode('gbk')
    
# Round-trip demo: convert every character of the sample string to its GBK
# value, show the values, then convert each value back to a character.
string = '中国技术交易大厦'
val_l = [wd_2_gbk_val(ch) for ch in string]
for ss, val in zip(string, val_l):
    print( ss, " = ", val)

print(val_l)

for val in val_l:
    print('------------------')
    print(val, " = ", gbk_val_2_wd(val))
输出结果:

中  =  53462
国  =  64185
技  =  48316
术  =  62922
交  =  48061
易  =  55250
大  =  62388
厦  =  50127
[53462, 64185, 48316, 62922, 48061, 55250, 62388, 50127]
------------------
hex str =  0xd0d6
reversed hex str =  d6d0
bytes =  b'\xd6\xd0'
53462  =  中
------------------
hex str =  0xfab9
reversed hex str =  b9fa
bytes =  b'\xb9\xfa'
64185  =  国
------------------
hex str =  0xbcbc
reversed hex str =  bcbc
bytes =  b'\xbc\xbc'
48316  =  技
------------------
hex str =  0xf5ca
reversed hex str =  caf5
bytes =  b'\xca\xf5'
62922  =  术
------------------
hex str =  0xbbbd
reversed hex str =  bdbb
bytes =  b'\xbd\xbb'
48061  =  交
------------------
hex str =  0xd7d2
reversed hex str =  d2d7
bytes =  b'\xd2\xd7'
55250  =  易
------------------
hex str =  0xf3b4
reversed hex str =  b4f3
bytes =  b'\xb4\xf3'
62388  =  大
------------------
hex str =  0xc3cf
reversed hex str =  cfc3
bytes =  b'\xcf\xc3'
50127  =  厦

C的测试

	char buf[2] = {0};
	// NOTE(review): 53462 does not fit in a signed short if short is 16 bits; unsigned short looks intended -- confirm.
	short word = 53462;	// '中'				// 0xd0d6 
	memcpy(buf, &word, 2);					// [-42, -48]: 0xd0 is the two's complement of -48, 0xd6 is the two's complement of -42 (assumes char is signed)
											// 0xd6	 0xd0
	unsigned char* byte_ptr = (unsigned char*)&word;	// little-endian storage
	unsigned char byte0 = byte_ptr[0];		// 214	0xd6, the plain unsigned value of the low byte
	unsigned char byte1 = byte_ptr[1];		// 208	0xd0, the plain unsigned value of the high byte

变量word存储汉字‘中’的gbk码值(53462,即0xd0d6;该值超出了16位short的范围,严格来说应声明为unsigned short)。把它的内容拷贝到char buf数组后,在char为有符号类型的小端平台上,buf中的值为负数:buf[0]为word的低字节,buf[1]为高字节。通过unsigned char类型的byte_ptr指针取得的内容则为正数。

0xd0d6,word变量内存结构:

内存高地址  |  内存 低地址

        d0         |       d6

        -48        |     -42           //  signed char,对应的负数补码

       208        |      214         // unsigned char 对应的正数原码

buf数组内存结构

   buf[0]       |      buf[1]

取低地址,  取高地址

    -42          |         -48

Unicode2GBK函数

/**
  * Convert a NUL-terminated wide-character (Unicode) string to a multi-byte
  * string using Windows code page 54936 (GB18030).
  *
  * NOTE(review): the function name says GBK, but the code page used is
  * GB18030 (54936), which can also emit four-byte sequences -- confirm that
  * this is intended.
  *
  * @param pUnicode  NUL-terminated source string; must not be null.
  * @param ppDest    receives a NUL-terminated buffer allocated with new[];
  *                  the caller releases it with delete[]. Left untouched
  *                  when the function fails. Must not be null.
  * @return 0 on success, -1 on failure (null argument or conversion error).
  */
int Unicode2GBK( wchar_t *pUnicode, char** ppDest)
{
#ifndef CODE_PAGE_GB18030
#define CODE_PAGE_GB18030 54936
#endif
	// Reject null arguments up front instead of crashing inside the API
	// call or on the final store through ppDest.
	if ( pUnicode == 0 || ppDest == 0 )
	{
		return -1;
	}

	// First pass with no output buffer: returns the number of bytes needed,
	// including the terminating NUL because the input length is -1.
	const int size = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0/* you can do more for it*/,
		pUnicode, -1, 0, 0, 0, 0 );
	if ( size == 0 )
	{
		return -1;
	}

	char* pDestString = new char[size + 2];
	// Zero the whole allocation. sizeof(pDestString) would be the size of
	// the pointer, not of the buffer it points to.
	::memset( pDestString, 0, size + 2 );

	// Second pass: perform the actual conversion into the buffer.
	int ret = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0, pUnicode, -1, pDestString, size, 0, 0 );
	if ( ret == 0 )
	{
		// Memory from new[] must be released with delete[].
		delete[] pDestString;
		return -1;
	}

	*ppDest = pDestString;
	return 0;
}

refer:windows下 unicode转化成gbk: http://hi.baidu.com/zhangweijiqn/item/e2ca4c1acfcb42d4bf904284



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值