头歌：第1关：实现json解析器中数字和字符串的解析

最新推荐文章于 2024-04-19 13:45:00 发布

MSY～学习日记分享

最新推荐文章于 2024-04-19 13:45:00 发布

阅读量1.1k

点赞数 2

分类专栏： python 文章标签： json python 开发语言

本文链接：https://blog.csdn.net/weixin_54570972/article/details/129768238

版权

python 专栏收录该内容

28 篇文章 23 订阅

订阅专栏

代码解释在注释部分，有其他疑问可以评论区提问

本文内容原创，请勿转载！

需要手动敲的代码部分：

（一）处理数字

（二）处理字符串

源码：

from typing import List
from enum import Enum

"""
全局标量定义来表示符合 JSON 所规定的数据类型
（学生可以使用字典结构表示此结构）
其中：
BEGIN_OBJECT（{）
END_OBJECT（}）
BEGIN_ARRAY（[）
END_ARRAY（]）
NULL（null）
NUMBER（数字）
STRING（字符串）
BOOLEAN（true/false）
SEP_COLON（:）
SEP_COMMA（,）
"""

# Signal token
BEGIN_OBJECT = 1
BEGIN_ARRAY = 2
END_OBJECT = 4
END_ARRAY = 8

# variable token
NULL_TOKEN = 16
NUMBER_TOKEN = 32
STRING_TOKEN = 64
BOOL_TOKEN = 128

# separator token
COLON_TOKEN = 256
COMMA_TOKEN = 512

# end signal
END_JSON = 65536

# json index
json_index = 0

def token_parse(json_str: str, json_index: int) -> (tuple, int):
    """
    完成词法解析，返回token
    :param json_str: 输入的json字符串
    :param json_index: json字符串的位置
    :return: 返回已处理好的token和json字符串的位置
    """
    def read_num(json_index:int): #处理数字
        ##           begin           ##
        begin = end = json_index #当前json字符串的位置
        end_str = '\n\t\r,}]' #数字结束的字符串（换行符，制表符，回车符，逗号，右括号）

        while json_str[end] not in end_str:#不到结束符就继续循环读取
            end+=1
        number = json_str[begin:end] #提取出begin到end的字符，赋值给number
        #进行转换
        if '.' in number or 'e' in number or 'E' in number:#根据number中是否含有小数点，e,E，来判断是整数 or 浮点数
            res = float(number) #将变量中存储的字符串转换为浮点数类型
        else:
            res = int(number) #整数
        json_index = end #不可变对象，需重新赋值
        return(NUMBER_TOKEN,res),json_index #返回处理数字后的token序列


    def read_str(json_index: int): #处理字符串
        ##         begin       ##
        json_index +=1 #json字符串的当前位置
       
        while json_index <len(json_str) and json_str[json_index] in '\n\t\r':#跳过空白，换行，制表符tab，移动到下一个非空白字符的位置
             json_index = json_index+1
        begin = end = json_index

        #找到string的范围
        while json_str[end] != '"': #不是双引号
            if json_str[end]=='\\': #反斜杠：向后移动一位
                end+=1
                if json_str[end] not in '"\\/bfnrtu': #双引号，反斜杠，斜杠，退格，换行，回车，Unicode
                    print
            end+=1 #向后移动
        json_index = end+1 #下一个token位置的字符
        rem = json_str[begin:end] #begin到end 提取出字符串
        return(STRING_TOKEN,rem),json_index #返回处理字符串后的token序列
        
    def read_null():
        """
        处理null
        :return: 返回处理null后的token序列
        """
        rem = json_str[json_index: json_index + 4]
        return (NULL_TOKEN, rem), json_index + 4

    def read_bool(s: str):
        """
        处理true，false
        :param s: json字符串
        :return: 返回处理true，false后的token序列
        """
        if s == 't':
            rem = json_str[json_index: json_index + 4]
            return (BOOL_TOKEN, rem), json_index + 4
        else:
            rem = json_str[json_index: json_index + 5]
            return (BOOL_TOKEN, rem), json_index + 5


    if json_index == len(json_str):
        return (END_JSON, None), json_index
    elif json_str[json_index] == '{':
        return (BEGIN_OBJECT, json_str[json_index]), json_index + 1
    elif json_str[json_index] == '}':
        return (END_OBJECT, json_str[json_index]), json_index + 1
    elif json_str[json_index] == '[':
        return (BEGIN_ARRAY, json_str[json_index]), json_index + 1
    elif json_str[json_index] == ']':
        return (END_ARRAY, json_str[json_index]), json_index + 1
    elif json_str[json_index] == ',':
        return (COMMA_TOKEN, json_str[json_index]), json_index + 1
    elif json_str[json_index] == ':':
        return (COLON_TOKEN, json_str[json_index]), json_index + 1
    elif json_str[json_index] == 'n':
        return read_null()
    elif json_str[json_index] == 't' or json_str[json_index] == 'f':
        return read_bool(json_str[json_index])
    elif json_str[json_index] == '"':
        return read_str(json_index)
    if json_str[json_index].isdigit():
        return read_num(json_index)


def tokenizer(json_str: str) -> list:
    """
    生成token序列
    :param json_str:
    :return:
    """
    json_index = 0
    tk, cur_index = token_parse(json_str, json_index)
    token_list = []
    generate_tokenlist(token_list, tk)
    while tk[0] != END_JSON:
        tk, cur_index = token_parse(json_str, cur_index)
        generate_tokenlist(token_list, tk)
    return token_list


def generate_token(tokentype: int, tokenvalue: str) -> tuple:
    """
    生成token结构
    :param tokentype: token的类型
    :param tokenvalue: token的值
    :return: 返回token
    """
    token = (tokentype, tokenvalue)
    return token


def generate_tokenlist(tokenlist: list, token: tuple) -> list:

    tokenlist.append(token)
    return tokenlist


def parse_json(tokenlist: list):

    def check_token(expected: int, actual: int):
        if expected & actual == 0:
            raise Exception('Unexpected Token at position %d' % json_index)

    def parse_json_array():
        """
        处理array对象
        :return: 处理json中的array对象
        """
        global json_index
        expected = BEGIN_ARRAY | END_ARRAY | BEGIN_OBJECT | END_OBJECT | NULL_TOKEN | NUMBER_TOKEN | BOOL_TOKEN | STRING_TOKEN

        while json_index != len(tokenlist):
            json_index += 1
            token = tokenlist[json_index]
            # token_type -> TokenEnum
            token_type = token[0]
            token_value = token[1]
            check_token(expected, token_type)

            # check through each condition
            if token_type == BEGIN_OBJECT:
                array.append(parse_json_object())
                expected = COMMA_TOKEN | END_ARRAY
            elif token_type == BEGIN_ARRAY:
                array.append(parse_json_array())
                expected = COMMA_TOKEN | END_ARRAY
            elif token_type == END_ARRAY:
                return array
            elif token_type == NULL_TOKEN:
                array.append(None)
                expected = COMMA_TOKEN | END_ARRAY
            elif token_type == NUMBER_TOKEN:
                array.append(int(token_value))
                expected = COMMA_TOKEN | END_ARRAY
            elif token_type == STRING_TOKEN:
                # print("array-------------array")
                array.append(token_value)
                expected = COMMA_TOKEN | END_ARRAY
            elif token_type == BOOL_TOKEN:
                token_value = token_value.lower().capitalize()
                array.append({'True': True, 'False': False}[token_value])
                expected = COMMA_TOKEN | END_ARRAY
            elif COMMA_TOKEN:
                expected = BEGIN_ARRAY | BEGIN_OBJECT | STRING_TOKEN | BOOL_TOKEN | NULL_TOKEN | NUMBER_TOKEN
            elif END_JSON:
                return array
            else:
                raise Exception('Unexpected Token at position %d' % json_index)

    def parse_json_object():
        """
        处理json对象
        :return:处理json中的json对象
        """
        global json_index
        expected = STRING_TOKEN | END_OBJECT
        key = None
        while json_index != len(tokenlist):
            json_index += 1
            token = tokenlist[json_index]
            token_type = token[0]
            token_value = token[1]
            # print("expected: ", expected, "token_type: ", token_type, "token_value: ", token_value)
            check_token(expected, token_type)
            if token_type == BEGIN_OBJECT:
                obj.update({key: parse_json_object()})
                expected = COMMA_TOKEN | END_OBJECT
            elif token_type == END_OBJECT:
                return obj
            elif token_type == BEGIN_ARRAY:
                # print("join array")
                obj.update({key: parse_json_array()})
                expected = COMMA_TOKEN | END_OBJECT | STRING_TOKEN
            elif token_type == NULL_TOKEN:
                obj.update({key: None})
                expected = COMMA_TOKEN | END_OBJECT
            elif token_type == STRING_TOKEN:
                pre_token = tokenlist[json_index - 1]
                pre_token_value = pre_token[0]
                # print(pre_token_value)
                if pre_token_value == COLON_TOKEN:
                    value = token[1]
                    obj.update({key: value})
               #      print("----------")
                    expected = COMMA_TOKEN | END_OBJECT
                else:
                    key = token[1]
                    expected = COLON_TOKEN
               #     print("+++++++++")

            elif token_type == NUMBER_TOKEN:
                obj.update({key: int(token_value)})
                expected = COMMA_TOKEN | END_OBJECT
            elif token_type == BOOL_TOKEN:
                token_value = token_value.lower().capitalize()
                obj.update({key: {'True': True, 'False': False}[token_value]})
                expected = COMMA_TOKEN | END_OBJECT
            elif token_type == COLON_TOKEN:
                expected = NULL_TOKEN | NUMBER_TOKEN | BOOL_TOKEN | STRING_TOKEN | BEGIN_ARRAY | BEGIN_OBJECT
            elif token_type == COMMA_TOKEN:
                expected = STRING_TOKEN
            elif token_type == END_JSON:
                return obj
            else:
                raise Exception('Unexpected Token at position %d' % json_index)
    array = []
    obj = {}
    global json_index
    if tokenlist[0][0] == BEGIN_OBJECT:
        return parse_json_object()
    elif tokenlist[0][0] == BEGIN_ARRAY:
        return parse_json_array()
    else:
        raise Exception('Illegal Token at position %d' % json_index)


if __name__ == "__main__":
    raw_data = input()
    jlist = tokenizer(raw_data)
    try:
        jdict = parse_json(jlist)
        print(jdict)
    except BaseException as result:
        print(result)