解析函数
对于字符串,调用cJSON_Parse解析为一个cJSON对象
/* Parse the string `value` into a cJSON object; returns the parsed item, or NULL on failure. */
CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value);
parse_buffer
为了保存待解析的字符串以及当前的解析位置,使用parse_buffer结构体。
// Parse buffer: the input text together with the parser's current position and state.
typedef struct
{
const unsigned char *content; // the input bytes being parsed
size_t length; // number of bytes in content (cJSON_ParseWithOpts sets it to strlen + 1, counting the terminator)
size_t offset; // current read position within content
size_t depth; // how deeply nested in arrays/objects the current position is
internal_hooks hooks; // hooks in use; parse_string allocates/deallocates through them
} parse_buffer;
parse_buffer常用函数的宏定义:
/* True when `size` bytes starting at the current offset lie inside the buffer:
 * the buffer pointer is non-NULL and offset + size does not exceed length.
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. `a ? b : c`) are not re-associated with the surrounding operators. */
#define can_read(buffer, size) (((buffer) != NULL) && (((buffer)->offset + (size)) <= (buffer)->length))
/* True when the byte at position offset + index lies inside the buffer. */
#define can_access_at_index(buffer, index) (((buffer) != NULL) && (((buffer)->offset + (index)) < (buffer)->length))
/* Negation of can_access_at_index. */
#define cannot_access_at_index(buffer, index) (!can_access_at_index(buffer, index))
/* Pointer to the byte at the current offset; performs no NULL or bounds check. */
#define buffer_at_offset(buffer) ((buffer)->content + (buffer)->offset)
具体解析过程
/* Default-options entry point: forwards to cJSON_ParseWithOpts without asking
 * for the parse-end pointer and without requiring a terminated document. */
CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value)
{
    cJSON *parsed = cJSON_ParseWithOpts(value, NULL, false);

    return parsed;
}
/* Parse the NUL-terminated text `value` into a newly created cJSON item.
 *
 * value                   - text to parse; NULL yields NULL.
 * return_parse_end        - when non-NULL, receives the position just after the
 *                           parsed value on success, or the error position on
 *                           failure.
 * require_null_terminated - when non-zero, only whitespace may follow the value
 *                           before the '\0' terminator.
 *
 * Returns the parsed item, or NULL on failure. The global error record is
 * reset on entry and filled in with the failure position when parsing fails. */
CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated)
{
    parse_buffer input = { 0, 0, 0, 0, { 0, 0, 0 } };
    cJSON *root = NULL;

    /* forget any error left over from an earlier parse */
    global_error.json = NULL;
    global_error.position = 0;

    if (value == NULL)
    {
        /* nothing to parse and no error position to report */
        return NULL;
    }

    /* describe the input; the length counts the terminating '\0' as well */
    input.content = (const unsigned char*)value;
    input.length = strlen((const char*)value) + sizeof("");
    input.offset = 0;
    input.hooks = global_hooks;

    root = cJSON_New_Item(&global_hooks);
    if (root != NULL)
    {
        /* drop a UTF-8 BOM, then leading whitespace, then parse one value */
        cJSON_bool parsed = parse_value(root, buffer_skip_whitespace(skip_utf8_bom(&input)));

        if (parsed && require_null_terminated)
        {
            /* only whitespace may sit between the value and the terminator */
            buffer_skip_whitespace(&input);
            parsed = (input.offset < input.length) && (buffer_at_offset(&input)[0] == '\0');
        }

        if (parsed)
        {
            if (return_parse_end)
            {
                *return_parse_end = (const char*)buffer_at_offset(&input);
            }
            return root;
        }

        cJSON_Delete(root);
    }

    /* failure (allocation or parse): record where parsing stopped */
    {
        error failure;

        failure.json = (const unsigned char*)value;
        failure.position = 0;
        if (input.offset < input.length)
        {
            failure.position = input.offset;
        }
        else if (input.length > 0)
        {
            /* clamp to the last byte of the input */
            failure.position = input.length - 1;
        }

        if (return_parse_end != NULL)
        {
            *return_parse_end = (const char*)failure.json + failure.position;
        }

        global_error = failure;
    }

    return NULL;
}
/* Advance the buffer's offset past leading whitespace; every byte with a
 * value <= 32 counts as whitespace. If that runs the offset up to the very
 * end of the buffer, it is moved back by one so it stays inside the buffer.
 * Returns the buffer, or NULL when the buffer or its content is NULL. */
static parse_buffer *buffer_skip_whitespace(parse_buffer * const buffer)
{
    if ((buffer != NULL) && (buffer->content != NULL))
    {
        for (;;)
        {
            if (cannot_access_at_index(buffer, 0))
            {
                break;
            }
            if (buffer_at_offset(buffer)[0] > 32)
            {
                break;
            }
            buffer->offset++;
        }

        if (buffer->offset == buffer->length)
        {
            buffer->offset--;
        }

        return buffer;
    }

    return NULL;
}
/* Skip a UTF-8 byte order mark ("\xEF\xBB\xBF") at the very start of the
 * buffer. Returns the buffer, or NULL when the buffer or its content is NULL
 * or the offset is not 0. */
static parse_buffer *skip_utf8_bom(parse_buffer * const buffer)
{
    static const char utf8_bom[] = "\xEF\xBB\xBF";

    if (buffer == NULL)
    {
        return NULL;
    }
    if ((buffer->content == NULL) || (buffer->offset != 0))
    {
        return NULL;
    }

    if (can_access_at_index(buffer, 4))
    {
        if (strncmp((const char*)buffer_at_offset(buffer), utf8_bom, 3) == 0)
        {
            buffer->offset += 3;
        }
    }

    return buffer;
}
parse_value
在cJSON_ParseWithOpts中完成parse_buffer的初始化等准备工作后,主要的解析在parse_value函数中完成。
/*
 * Parser core: read one JSON value at the current offset of input_buffer and
 * store it in item.
 *
 * The literals null / false / true are handled here directly; strings,
 * numbers, arrays and objects are recognised by their first character and
 * handed to parse_string, parse_number, parse_array and parse_object.
 * Returns true on success, false when there is no input or nothing matches.
 */
static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer)
{
    if ((input_buffer == NULL) || (input_buffer->content == NULL))
    {
        return false; /* no input */
    }

    /* literal: null */
    if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "null", 4) == 0))
    {
        item->type = cJSON_NULL;
        input_buffer->offset += 4;
        return true;
    }

    /* literal: false */
    if (can_read(input_buffer, 5) && (strncmp((const char*)buffer_at_offset(input_buffer), "false", 5) == 0))
    {
        item->type = cJSON_False;
        input_buffer->offset += 5;
        return true;
    }

    /* literal: true */
    if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "true", 4) == 0))
    {
        item->type = cJSON_True;
        item->valueint = 1;
        input_buffer->offset += 4;
        return true;
    }

    /* everything else is selected by its first character */
    if (cannot_access_at_index(input_buffer, 0))
    {
        return false;
    }

    switch (buffer_at_offset(input_buffer)[0])
    {
        case '\"':
            return parse_string(item, input_buffer);

        case '-':
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            return parse_number(item, input_buffer);

        case '[':
            return parse_array(item, input_buffer);

        case '{':
            return parse_object(item, input_buffer);

        default:
            return false;
    }
}
字符串解析
调用parse_string解析字符串。
/* Parse the JSON string literal at the current offset of input_buffer into item.
 * On success: item->type is cJSON_String, item->valuestring points at the
 * decoded, '\0'-terminated text (obtained from input_buffer->hooks.allocate),
 * the offset is moved just past the closing quote, and true is returned.
 * On failure: the output buffer (if any) is released through
 * input_buffer->hooks.deallocate, the offset is set to where input_pointer
 * stopped, and false is returned. */
static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_buffer)
{
const unsigned char *input_pointer = buffer_at_offset(input_buffer) + 1; // skip the opening quote
const unsigned char *input_end = buffer_at_offset(input_buffer) + 1;// skip the opening quote
unsigned char *output_pointer = NULL;
unsigned char *output = NULL;
/* not a string */
// 1. The input does not start with '"', so it is not a string: fail
if (buffer_at_offset(input_buffer)[0] != '\"')
{
goto fail;
}
{
/* calculate approximate size of the output (overestimate) */
// 2. Work out how much space the decoded string needs
size_t allocation_length = 0;
size_t skipped_bytes = 0;
// 2.1 Scan forward to the closing quote; afterwards input_end points at it
while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"'))
{
/* is escape sequence */
if (input_end[0] == '\\') // a backslash starts an escape sequence
{ // error case: the backslash is the last byte of the input
if ((size_t)(input_end + 1 - input_buffer->content) >= input_buffer->length)
{
/* prevent buffer overflow when last input character is a backslash */
goto fail;
}
skipped_bytes++; // one more byte that will not appear in the output
input_end++; // step over the backslash so the escaped character is not examined
}
input_end++; // move on to the next character
}
// 2.2 Make sure the scan really stopped on a closing quote inside the buffer
if (((size_t)(input_end - input_buffer->content) >= input_buffer->length) || (*input_end != '\"'))
{
goto fail; /* string ended unexpectedly */
}
/* This is at most how much we need for the output */
// 2.3 Upper bound on the decoded length: the scanned span minus the skipped backslashes
allocation_length = (size_t) (input_end - buffer_at_offset(input_buffer)) - skipped_bytes;
// 2.4 Allocate the output buffer (plus one byte for the terminator)
output = (unsigned char*)input_buffer->hooks.allocate(allocation_length + sizeof(""));
if (output == NULL)
{
goto fail; /* allocation failure */
}
}
// 3. Start writing at the beginning of the output buffer
output_pointer = output;
/* loop through the string literal */
// 4. Decode the string contents
while (input_pointer < input_end)
{
// 4.1 Not an escape: copy the byte through unchanged
if (*input_pointer != '\\')
{
*output_pointer++ = *input_pointer++;
}
/* escape sequence */
else // 4.2 Handle an escape sequence
{
unsigned char sequence_length = 2; // 4.2.1 input bytes consumed by this escape (2 unless it is \u)
// e.g. for "\t": input_pointer is at the backslash, input_end at the closing quote
if ((input_end - input_pointer) < 1)
{
goto fail;
}
// 4.2.2 Dispatch on the character after the backslash
switch (input_pointer[1])
{
// single-character escapes
case 'b':
*output_pointer++ = '\b';
break;
case 'f':
*output_pointer++ = '\f';
break;
case 'n':
*output_pointer++ = '\n';
break;
case 'r':
*output_pointer++ = '\r';
break;
case 't':
*output_pointer++ = '\t';
break;
case '\"':
case '\\':
case '/':
*output_pointer++ = input_pointer[1];
break;
/* UTF-16 literal */
// \uXXXX: converted to UTF-8; the helper advances output_pointer itself
case 'u':
sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
if (sequence_length == 0)
{
/* failed to convert UTF16-literal to UTF-8 */
goto fail;
}
break;
default:
goto fail;
}
// advance the input by the number of bytes this escape occupied
input_pointer += sequence_length;
}
}
/* zero terminate the output */
// 5. All input consumed: append the terminator
*output_pointer = '\0';
// 6. Fill in the item and move the offset past the closing quote
item->type = cJSON_String;
item->valuestring = (char*)output;
input_buffer->offset = (size_t) (input_end - input_buffer->content);
input_buffer->offset++;
return true;
fail:
// Failure: release the output buffer and leave the offset where decoding stopped
if (output != NULL)
{
input_buffer->hooks.deallocate(output);
}
if (input_pointer != NULL)
{
input_buffer->offset = (size_t)(input_pointer - input_buffer->content);
}
return false;
}
UTF-16转UTF-8
UTF-8 的编码单元是 8 位的字节,UTF-16 的编码单元为 16 位。JSON字符串中的 \uXXXX 以 16 进制表示码点 U+0000 至 U+FFFF。如果第一个码点在 U+D800 至 U+DBFF 之间,我们便知道它是代理对(surrogate pair)的高代理项(high surrogate),之后应该伴随一个 U+DC00 至 U+DFFF 的低代理项(low surrogate)。然后,我们用下列公式把代理对 (H, L) 变换成真实的码点:
codepoint = 0x10000 + (H − 0xD800) × 0x400 + (L − 0xDC00)
| 码点范围 | 码点位数 | 字节1 | 字节2 | 字节3 | 字节4 |
|---|---|---|---|---|---|
| U+0000~U+007F | 7 | 0xxxxxxx | | | |
| U+0080~U+07FF | 11 | 110xxxxx | 10xxxxxx | | |
| U+0800~U+FFFF | 16 | 1110xxxx | 10xxxxxx | 10xxxxxx | |
| U+10000~U+10FFFF | 21 | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
/* Read exactly four hexadecimal digits (either case) from input and return
 * their numeric value. Returns 0 as soon as a non-hex character is met, so
 * an invalid sequence cannot be told apart from "0000". */
static unsigned parse_hex4(const unsigned char * const input)
{
    unsigned int value = 0;
    size_t index = 0;

    for (index = 0; index < 4; index++)
    {
        const unsigned char c = input[index];
        unsigned int nibble = 0;

        if ((c >= '0') && (c <= '9'))
        {
            nibble = (unsigned int)(c - '0');
        }
        else if ((c >= 'A') && (c <= 'F'))
        {
            nibble = 10u + (unsigned int)(c - 'A');
        }
        else if ((c >= 'a') && (c <= 'f'))
        {
            nibble = 10u + (unsigned int)(c - 'a');
        }
        else
        {
            /* not a hexadecimal digit */
            return 0;
        }

        /* make room for the new nibble, then append it */
        value = (value << 4) + nibble;
    }

    return value;
}
/* converts a UTF-16 literal to UTF-8
* A literal can be one or two sequences of the form \uXXXX */
// input_pointer points at the backslash of the first \uXXXX, input_end marks the
// end of the string contents. The UTF-8 bytes (1 to 4) are written at
// *output_pointer, which is then advanced past them.
// Returns the number of input bytes consumed (6, or 12 for a surrogate pair),
// or 0 on failure.
// NOTE(review): parse_hex4 returns 0 for invalid digits, so a malformed first
// sequence is treated here like \u0000 rather than rejected.
static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer)
{
long unsigned int codepoint = 0;
unsigned int first_code = 0;
const unsigned char *first_sequence = input_pointer;
unsigned char utf8_length = 0;
unsigned char utf8_position = 0;
unsigned char sequence_length = 0;
unsigned char first_byte_mark = 0;
// a \uXXXX sequence needs at least 6 characters
if ((input_end - first_sequence) < 6)
{
/* input ends unexpectedly */
goto fail;
}
/* get the first utf16 sequence */
first_code = parse_hex4(first_sequence + 2);
/* check that the code is valid */
// 0xDC00~0xDFFF is the low-surrogate range; it must not appear first
if (((first_code >= 0xDC00) && (first_code <= 0xDFFF))) // invalid: lone low surrogate
{
goto fail;
}
/* UTF16 surrogate pair */
// a first code in U+D800~U+DBFF is a high surrogate and must be followed by a low surrogate
if ((first_code >= 0xD800) && (first_code <= 0xDBFF))
{
const unsigned char *second_sequence = first_sequence + 6;
unsigned int second_code = 0;
sequence_length = 12; /* \uXXXX\uXXXX */
if ((input_end - second_sequence) < 6)
{
/* input ends unexpectedly */
goto fail;
}
if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u'))
{
/* missing second half of the surrogate pair */
goto fail;
}
/* get the second utf16 sequence */
second_code = parse_hex4(second_sequence + 2);
/* check that the code is valid */
if ((second_code < 0xDC00) || (second_code > 0xDFFF))
{
/* invalid second half of the surrogate pair */
goto fail;
}
/* calculate the unicode codepoint from the surrogate pair */
// surrogate pair formula: codepoint = 0x10000 + (H - 0xD800) * 0x400 + (L - 0xDC00)
codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF));
}
else
{
// a single \uXXXX sequence: the code is the codepoint
sequence_length = 6; /* \uXXXX */
codepoint = first_code;
}
/* encode as UTF-8
* takes at maximum 4 bytes to encode:
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
if (codepoint < 0x80) //U+0000~U+007F
{
/* normal ascii, encoding 0xxxxxxx */
utf8_length = 1;
}
else if (codepoint < 0x800) //U+0080~U+07FF
{
/* two bytes, encoding 110xxxxx 10xxxxxx */
utf8_length = 2;
first_byte_mark = 0xC0; /* 11000000 */
}
else if (codepoint < 0x10000) //U+0800~U+FFFF
{
/* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
utf8_length = 3;
first_byte_mark = 0xE0; /* 11100000 */
}
else if (codepoint <= 0x10FFFF)//U+10000~U+10FFFF
{
/* four bytes, encoding 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
utf8_length = 4;
first_byte_mark = 0xF0; /* 11110000 */
}
else
{
/* invalid unicode codepoint */
goto fail;
}
/* encode as utf8 */
// continuation bytes are filled from last to first: set bit 7 (0x80) and clear bit 6 (mask 0xBF)
for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--)
{
/* 10xxxxxx */
(*output_pointer)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
codepoint >>= 6;
}
/* encode first byte */
if (utf8_length > 1)
{
(*output_pointer)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
}
else
{
(*output_pointer)[0] = (unsigned char)(codepoint & 0x7F);
}
*output_pointer += utf8_length;
return sequence_length;
fail:
return 0;
}
解析数字
解析数字时,先把数字部分的字符复制到一个临时缓冲区中,再调用strtod函数将其转换为double。
/* Parse a JSON number at the current offset of input_buffer into item.
 *
 * The characters that can belong to a number (digits, '+', '-', 'e', 'E',
 * '.') are copied into a local buffer of 64 bytes, with '.' replaced by the
 * value returned from get_decimal_point(), and the copy is converted with
 * strtod. On success item->valuedouble holds the value, item->valueint holds
 * it converted to int (saturated to INT_MAX / INT_MIN), item->type is
 * cJSON_Number and the offset advances by the number of characters strtod
 * consumed. Returns false when there is no input or strtod consumed nothing. */
static cJSON_bool parse_number(cJSON * const item, parse_buffer * const input_buffer)
{
    double parsed = 0;
    unsigned char *stop = NULL;
    unsigned char text[64];
    unsigned char decimal_point = get_decimal_point();
    size_t length = 0;

    if ((input_buffer == NULL) || (input_buffer->content == NULL))
    {
        return false;
    }

    /* copy the leading run of number characters, leaving room for '\0' */
    while ((length < (sizeof(text) - 1)) && can_access_at_index(input_buffer, length))
    {
        const unsigned char c = buffer_at_offset(input_buffer)[length];

        if (((c >= '0') && (c <= '9')) || (c == '+') || (c == '-') || (c == 'e') || (c == 'E'))
        {
            text[length] = c;
        }
        else if (c == '.')
        {
            text[length] = decimal_point;
        }
        else
        {
            break;
        }
        length++;
    }
    text[length] = '\0';

    /* stop ends up at the first character strtod did not consume,
     * e.g. at 'a' for "11.22abc" */
    parsed = strtod((const char*)text, (char**)&stop);
    if (stop == text)
    {
        return false; /* parse_error */
    }

    item->valuedouble = parsed;

    /* use saturation in case of overflow */
    if (parsed >= INT_MAX)
    {
        item->valueint = INT_MAX;
    }
    else if (parsed <= (double)INT_MIN)
    {
        item->valueint = INT_MIN;
    }
    else
    {
        item->valueint = (int)parsed;
    }

    item->type = cJSON_Number;
    input_buffer->offset += (size_t)(stop - text);

    return true;
}
解析数组
解析数组比较简单:遇到`[`表示数组开始,之后每个以`,`分隔的部分就是一个item,遇到`]`解析完毕。
/* Parse a JSON array at the current offset of input_buffer into item.
 *
 * Each element is parsed with parse_value into a freshly created cJSON item;
 * the elements are chained through next/prev. On success item->type is
 * cJSON_Array, item->child is the first element (NULL for an empty array),
 * the nesting depth is restored and the offset is moved past the closing ']'.
 * On failure every element created so far is deleted and false is returned.
 * Arrays nested CJSON_NESTING_LIMIT levels deep or more are rejected. */
static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer)
{
    cJSON *head = NULL; /* head of the linked list */
    cJSON *current_item = NULL;

    if (input_buffer->depth >= CJSON_NESTING_LIMIT)
    {
        return false; /* too deeply nested */
    }
    input_buffer->depth++; /* 1. one level deeper */

    /* Bounds-check before reading the first byte, the same guard
     * parse_object applies before it looks for '{'. */
    if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '['))
    {
        /* not an array */
        goto fail;
    }

    input_buffer->offset++;
    buffer_skip_whitespace(input_buffer); /* 2. skip whitespace after '[' */
    if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ']'))
    {
        /* 3.1 empty array */
        goto success;
    }

    /* check if we skipped to the end of the buffer */
    if (cannot_access_at_index(input_buffer, 0))
    {
        input_buffer->offset--;
        goto fail;
    }

    /* step back to character in front of the first element, so that the
     * loop below can uniformly step over one separator ('[' or ',') */
    input_buffer->offset--;
    /* 3.2 loop through the comma separated array elements */
    do
    {
        /* 3.2.1 allocate next item */
        cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks));
        if (new_item == NULL)
        {
            goto fail; /* allocation failure */
        }

        /* 3.2.2 attach next item to list */
        if (head == NULL)
        {
            /* start the linked list */
            current_item = head = new_item;
        }
        else
        {
            /* add to the end and advance */
            current_item->next = new_item;
            new_item->prev = current_item;
            current_item = new_item;
        }

        /* 3.2.3 parse next value */
        input_buffer->offset++;
        buffer_skip_whitespace(input_buffer);
        if (!parse_value(current_item, input_buffer))
        {
            goto fail; /* failed to parse value */
        }
        buffer_skip_whitespace(input_buffer); /* 3.2.4 skip whitespace after the value */
    }
    while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ','));

    /* 4. no more elements: the array must close with ']' */
    if (cannot_access_at_index(input_buffer, 0) || buffer_at_offset(input_buffer)[0] != ']')
    {
        goto fail; /* expected end of array */
    }

success:
    /* 5.1 success: fill in the item and step past ']' */
    input_buffer->depth--;
    item->type = cJSON_Array;
    item->child = head;
    input_buffer->offset++;
    return true;

fail:
    /* 5.2 failure: release every element created so far */
    if (head != NULL)
    {
        cJSON_Delete(head);
    }
    return false;
}
解析对象
解析对象和解析数组类似。
/* Parse a JSON object at the current offset of input_buffer into item.
 * Each member is stored in a freshly created cJSON item: the name is parsed
 * with parse_string and moved into ->string, the value is parsed with
 * parse_value; members are chained through next/prev.
 * On success item->type is cJSON_Object, item->child is the first member
 * (NULL for an empty object), the nesting depth is restored and the offset is
 * moved past the closing '}'. On failure every member created so far is
 * deleted and false is returned. */
static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer)
{
cJSON *head = NULL; /* linked list head */
cJSON *current_item = NULL;
if (input_buffer->depth >= CJSON_NESTING_LIMIT)
{
return false; /* too deeply nested */
}
input_buffer->depth++;
if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '{'))
{
goto fail; /* not an object */
}
input_buffer->offset++;
buffer_skip_whitespace(input_buffer);
if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '}'))
{
goto success; /* empty object */
}
/* check if we skipped to the end of the buffer */
if (cannot_access_at_index(input_buffer, 0))
{
input_buffer->offset--;
goto fail;
}
/* step back to character in front of the first element, so the loop below
* can uniformly step over one separator ('{' or ',') */
input_buffer->offset--;
/* loop through the comma separated object members */
do
{
/* allocate next item */
cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks));
if (new_item == NULL)
{
goto fail; /* allocation failure */
}
/* attach next item to list */
if (head == NULL)
{
/* start the linked list */
current_item = head = new_item;
}
else
{
/* add to the end and advance */
current_item->next = new_item;
new_item->prev = current_item;
current_item = new_item;
}
/* parse the name of the child */
input_buffer->offset++;
buffer_skip_whitespace(input_buffer);
if (!parse_string(current_item, input_buffer))
{
goto fail; /* failed to parse name */
}
buffer_skip_whitespace(input_buffer);
/* move valuestring into string, because what we parsed was the name */
current_item->string = current_item->valuestring;
current_item->valuestring = NULL;
if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != ':'))
{
goto fail; /* invalid object */
}
/* parse the value */
input_buffer->offset++;
buffer_skip_whitespace(input_buffer);
if (!parse_value(current_item, input_buffer))
{
goto fail; /* failed to parse value */
}
buffer_skip_whitespace(input_buffer);
}
while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ','));
if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '}'))
{
goto fail; /* expected end of object */
}
success:
/* fill in the item and step past '}' */
input_buffer->depth--;
item->type = cJSON_Object;
item->child = head;
input_buffer->offset++;
return true;
fail:
/* release every member created so far */
if (head != NULL)
{
cJSON_Delete(head);
}
return false;
}