对于json格式的字符串解析为json对象处理的时候,cJSON会自动处理为utf-8格式的字符串。
比如json字符串为:{“text”:"\u6295\u8d44\u7406\u8d22"}
经过cJSON_Parse解析后的对象再输出的值为:(cJSON已经进行了内部转换处理)
{“text”:“投资理财”}
cJSON解析函数:
CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value);
相关的内部处理函数:
static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer)
utf16_literal_to_utf8源代码:
/* converts a UTF-16 literal to UTF-8
* A literal can be one or two sequences of the form \uXXXX */
static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer)
{
long unsigned int codepoint = 0;
unsigned int first_code = 0;
const unsigned char *first_sequence = input_pointer;
unsigned char utf8_length = 0;
unsigned char utf8_position = 0;
unsigned char sequence_length = 0;
unsigned char first_byte_mark = 0;
if ((input_end - first_sequence) < 6)
{
/* input ends unexpectedly */
goto fail;
}
/* get the first utf16 sequence */
first_code = parse_hex4(first_sequence + 2);
/* check that the code is valid */
if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)))
{
goto fail;
}
/* UTF16 surrogate pair */
if ((first_code >= 0xD800) && (first_code <= 0xDBFF))
{
const unsigned char *second_sequence = first_sequence + 6;
unsigned int second_code = 0;
sequence_length = 12; /* \uXXXX\uXXXX */
if ((input_end - second_sequence) < 6)
{
/* input ends unexpectedly */
goto fail;
}
if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u'))
{
/* missing second half of the surrogate pair */
goto fail;
}
/* get the second utf16 sequence */
second_code = parse_hex4(second_sequence + 2);
/* check that the code is valid */
if ((second_code < 0xDC00) || (second_code > 0xDFFF))
{
/* invalid second half of the surrogate pair */
goto fail;
}
/* calculate the unicode codepoint from the surrogate pair */
codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF));
}
else
{
sequence_length = 6; /* \uXXXX */
codepoint = first_code;
}
/* encode as UTF-8
* takes at maximum 4 bytes to encode:
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
if (codepoint < 0x80)
{
/* normal ascii, encoding 0xxxxxxx */
utf8_length = 1;
}
else if (codepoint < 0x800)
{
/* two bytes, encoding 110xxxxx 10xxxxxx */
utf8_length = 2;
first_byte_mark = 0xC0; /* 11000000 */
}
else if (codepoint < 0x10000)
{
/* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
utf8_length = 3;
first_byte_mark = 0xE0; /* 11100000 */
}
else if (codepoint <= 0x10FFFF)
{
/* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */
utf8_length = 4;
first_byte_mark = 0xF0; /* 11110000 */
}
else
{
/* invalid unicode codepoint */
goto fail;
}
/* encode as utf8 */
for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--)
{
/* 10xxxxxx */
(*output_pointer)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
codepoint >>= 6;
}
/* encode first byte */
if (utf8_length > 1)
{
(*output_pointer)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
}
else
{
(*output_pointer)[0] = (unsigned char)(codepoint & 0x7F);
}
*output_pointer += utf8_length;
return sequence_length;
fail:
return 0;
}