Android NDK中字符串的相互转换

最新推荐文章于 2023-08-26 15:18:38 发布

xuhuan_wh

最新推荐文章于 2023-08-26 15:18:38 发布

阅读量3.4k

点赞数

分类专栏： android 文章标签： android android ndk 字符转换 UTF8 utf-8

android 专栏收录该内容

37 篇文章 0 订阅

订阅专栏

在Android NDK中gb2312字符串，unicode字符串，utf-8字符串相互转换

前提说明：在Android NDK中，一个wchar_t是4个字节，也就是说Android NDK下，unicode字符是采用ucs4的，而在windows系统下，unicode是采用ucs2，即每个unicode字符占用两个字节，可以用sizeof(wchar_t)来验证这一点。如果在Android NDK下，想要强制指定一个wchar_t是2个字节，即跟windows系统下一样采用ucs2，需要在Android.mk中添加一句LOCAL_CFLAGS := -fshort-wchar即可。但是如果指定wchar_t为两个字节的话，所有用于处理unicode字符串的函数（如wcscpy，wcslen，wcscmp，wcscat）都不再适用，需要你自己重新写这些函数，当然这都很容易处理。我写的例子中还是使用默认的ucs4来处理unicode字符串的。见文件as_base_fun.cpp。

在Android NDK中还没有现成的函数来处理字符串的相互转换，但是Android底层有个libicuuc.so库文件，在目录/system/lib/下可以找到这个文件。在这个文件中有个函数ucnv_convert，我们可以从这个库中找到这个函数，然后利用这个函数来转换字符串。在不同的Android版本中，这个函数的名字可能会有点不一样，比如在Android 2.3下函数名为：ucnv_convert_44。

在Android NDK中，一个char *的字符串默认是utf-8编码的。比如：

char * str = "字符串示例"; 字符串str在内存中应该占用15个字节（不含结尾的'\0'），每个汉字占用3个字节。

如要将一个gb2312编码的字符串转换为ucs4的unicode，则目标缓冲区的大小应该至少为strlen(source)+2个wchar_t,注意不可以是strlen(source)+1，其中的source为要转换的字符串。

在gb2312编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占两个字节

在utf-8编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占3个字节

as_base_fun.cpp源码：

/* Wide-character type used by the conversion helpers in this file. */
typedef wchar_t TCHAR;
/*
 * Pointer to ICU's ucnv_convert, resolved at runtime; 0 until it is bound.
 * Argument order as used by the callers in this file: target encoding name,
 * source encoding name, output buffer, output capacity in bytes, input
 * buffer, input length in bytes, status out-parameter.
 * NOTE(review): ICU declares ucnv_convert as returning int32_t; this pointer
 * type discards that return value - confirm before relying on it.
 */
void (*ucnv_convert)(const char *, const char *, char * , int32_t , const char *, int32_t,int32_t*) = 0;

//在libicuuc.so中找到函数ucnv_convert
/*
 * Open /system/lib/libicuuc.so and bind the global ucnv_convert pointer to
 * the library's exported conversion routine.
 *
 * The exported symbol carries an ICU version suffix, so several names are
 * probed in order: "ucnv_convert_3_8", "ucnv_convert_4_2" (the name the
 * original author reports for Android 2.2), then "ucnv_convert_40" through
 * "ucnv_convert_49" (the author reports "ucnv_convert_44" on Android 2.3).
 * Probing stops at the first name that resolves.
 *
 * Returns the dlopen handle, or NULL if the library could not be opened.
 * A non-NULL handle does not mean a symbol was found: check that
 * ucnv_convert is non-zero before converting.
 */
void * init()
{
	/* Local alias so the long function-pointer cast is spelled only once. */
	typedef void (*convert_fn)(const char *, const char *, char * , int32_t , const char *, int32_t,int32_t*);
	char symbol_name[256];
	int minor;

	void* pDL = dlopen("/system/lib/libicuuc.so", RTLD_LAZY);
	if (0 == pDL)
	{
		return pDL;
	}

	ucnv_convert = (convert_fn)dlsym(pDL, "ucnv_convert_3_8");
	if (0 == ucnv_convert)
	{
		/* Underscore-separated suffix; the numeric loop below cannot produce it. */
		ucnv_convert = (convert_fn)dlsym(pDL, "ucnv_convert_4_2");
	}
	for (minor = 0; (0 == ucnv_convert) && (minor <= 9); minor++)
	{
		snprintf(symbol_name, sizeof(symbol_name), "ucnv_convert_4%d", minor);
		ucnv_convert = (convert_fn)dlsym(pDL, symbol_name);
	}
	return pDL;
}

/*
 * Close a dynamic-library handle with dlclose(); a NULL handle is ignored.
 *
 * The handle is passed by value, so the caller's own copy is not cleared
 * here and must not be used again after this call.
 * NOTE(review): the global ucnv_convert pointer is not reset by this
 * function; if pDL is the handle returned by init(), that pointer may be
 * left dangling once the library is unloaded - confirm with the callers.
 */
void close_pDL(void * pDL)
{
	if (pDL)
	{
		dlclose(pDL);
	}
}

//下面的6个函数功能就是gb2312,utf-8,ucs4字符串的相互转换
//参数dest为目标字符串缓冲，即用来存放转换出来的字符串
//参数dest_len为目标缓冲的大小，要保证目标缓冲足够大，能够用来存放目标字符串
//如要将一个gb2312编码的字符串转换为ucs4的unicode，则目标缓冲区的大小应该至少为strlen(source)+2个wchar_t,注意不可以是strlen(source)+1.
//在gb2312编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占两个字节
//在utf-8编码的字符串中，一个英文字母或者是数字占用一个字节，汉字占3个字节
//参数source为要转换的字符串
//所有函数均经过测试，可以正常运行
/*
 * Convert a NUL-terminated wide string (passed to the converter under the
 * encoding name "ucs4") into "gb2312" bytes.
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  size of dest in bytes
 * source    wide string to convert; its length is taken with lstrlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int unicode2gb2312(char * dest,int dest_len,const TCHAR * source)
{
	int32_t err_code = 0;

	/* A negative length would turn into a huge size_t inside memset. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len);
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("gb2312","ucs4"
			,dest
			,dest_len
			,(const char *)source
			,(int32_t)(lstrlen(source)*sizeof(TCHAR))
			,&err_code);
	return err_code;
}

/*
 * Convert a NUL-terminated wide string (passed to the converter under the
 * encoding name "ucs4") into "utf-8" bytes.
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  size of dest in bytes
 * source    wide string to convert; its length is taken with lstrlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int unicode2utf8(char * dest,int dest_len,const TCHAR * source)
{
	int32_t err_code = 0;

	/* A negative length would turn into a huge size_t inside memset. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len);
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("utf-8","ucs4"
				,dest
				,dest_len
				,(const char *)source
				,(int32_t)(lstrlen(source)*sizeof(TCHAR))
				,&err_code);
	return err_code;
}

/*
 * Convert a NUL-terminated "gb2312" byte string into wide characters
 * (requested from the converter under the encoding name "ucs4").
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  capacity of dest counted in TCHAR elements (not bytes)
 * source    byte string to convert; its length is taken with strlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int gb23122unicode(TCHAR * dest,int dest_len,const char * source)
{
	int32_t err_code = 0;

	/* A negative count times sizeof(TCHAR) would wrap to a huge size_t. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len*sizeof(TCHAR));
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("ucs4","gb2312"
				,(char *)dest
				,(int32_t)(dest_len*sizeof(TCHAR))
				,source
				,(int32_t)strlen(source)
				,&err_code);
	return err_code;
}

/*
 * Convert a NUL-terminated "utf-8" byte string into wide characters
 * (requested from the converter under the encoding name "ucs4").
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  capacity of dest counted in TCHAR elements (not bytes)
 * source    byte string to convert; its length is taken with strlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int utf82unicode(TCHAR * dest,int dest_len,const char * source)
{
	int32_t err_code = 0;

	/* A negative count times sizeof(TCHAR) would wrap to a huge size_t. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len*sizeof(TCHAR));
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("ucs4","utf-8"
				,(char *)dest
				,(int32_t)(dest_len*sizeof(TCHAR))
				,source
				,(int32_t)strlen(source)
				,&err_code);
	return err_code;
}

/*
 * Convert a NUL-terminated "gb2312" byte string into "utf-8" bytes.
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  size of dest in bytes
 * source    byte string to convert; its length is taken with strlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int gb23122utf8(char * dest,int dest_len,const char * source)
{
	int32_t err_code = 0;

	/* A negative length would turn into a huge size_t inside memset. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len);
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("utf-8","gb2312"
				,dest
				,dest_len
				,source
				,(int32_t)strlen(source)
				,&err_code);
	return err_code;
}

/*
 * Convert a NUL-terminated "utf-8" byte string into "gb2312" bytes.
 *
 * dest      output buffer; zero-filled before the conversion runs
 * dest_len  size of dest in bytes
 * source    byte string to convert; its length is taken with strlen()
 *
 * Returns the status value ucnv_convert stores through its last argument
 * (0 if it leaves it untouched). 0 is also returned, without converting,
 * when dest or source is NULL, when dest_len is negative, or when
 * ucnv_convert has not been resolved - so 0 alone does not prove that a
 * conversion took place.
 */
int utf82gb2312(char * dest,int dest_len,const char * source)
{
	int32_t err_code = 0;

	/* A negative length would turn into a huge size_t inside memset. */
	if ((NULL == dest)||(NULL == source)||(dest_len < 0))
	{
		return 0;
	}
	memset(dest,0,dest_len);
	if (!ucnv_convert)
	{
		return 0;
	}
	ucnv_convert("gb2312","utf-8"
				,dest
				,dest_len
				,source
				,(int32_t)strlen(source)
				,&err_code);
	return err_code;
}