[python] json文件以层级形式输出

Markdown目录生成:[TOC]

问题

如何将json文件以层级形式输出?

现有文档级关系抽取的数据集DocRED的metadata——char2id.json:

{"BLANK": 0, "UNK": 1, "t": 2, "h": 3, "e": 4, "w": 5, "o": 6, "r": 7, "k": 8, "-": 9, "p": 10, "a": 11, "s": 12, "n": 13, "d": 14, "g": 15, "u": 16, "(": 17, ";": 18, ")": 19, ",": 20, "l": 21, "'": 22, "m": 23, "i": 24, "y": 25, "f": 26, "c": 27, ".": 28, "v": 29, "b": 30, "j": 31, "1": 32, "4": 33, "9": 34, "5": 35, "2": 36, "0": 37, "6": 38, "z": 39, "/": 40, "q": 41, "3": 42, "x": 43, "8": 44, ":": 45, "7": 46, "\u2014": 47, "\u00e9": 48, "\"": 49, "\u014d": 50, "\u1e63": 51, "\u0101": 52, "%": 53, "\u0645": 54, "\u0648": 55, "\u0627": 56, "\u0631": 57, "\u00a3": 58, "$": 59, "\u0142": 60, "\u015b": 61, "\u0119": 62, "\u2019": 63, "&": 64, "\u00e8": 65, "\u00ed": 66, "\u00f1": 67, "\u00e1": 68, "\u0131": 69, "\u00f6": 70, "\u00f3": 71, "!": 72, "\u00e4": 73, "\u017c": 74, "?": 75, "\u00e5": 76, "\u00e6": 77, "\u0161": 78, "\u010d": 79, "\u00fd": 80, "\u2212": 81, "\u00f4": 82, "\u00b0": 83, "\u201c": 84, "\u201d": 85, "\u00f8": 86, "[": 87, "]": 88, "\u00df": 89, "_": 90, "#": 91, "\u017e": 92, "+": 93, "\u00fe": 94, "\u00fc": 95, "\u011f": 96, "\u00ee": 97, "\u0307": 98, "\u00e7": 99, "\u00e0": 100, "\u0111": 101, "\u00ec": 102, "\u1ebf": 103, "\u0117": 104, "\u062d": 105, "\u0644": 106, "\u064a": 107, "\u0629": 108, "\u1e25": 109, "\u00ea": 110, "\u00fa": 111, "\u017a": 112, "\u021b": 113, "\u0107": 114, "\u2018": 115, "\u00f2": 116, "\u20ac": 117, "\u016b": 118, "\u0159": 119, "\u0105": 120, "\u0430": 121, "\u043c": 122, "\u0431": 123, "\u043d": 124, "\u0628": 125, "\u0646": 126, "\u00e3": 127, "\u00d7": 128, "\u4e2d": 129, "\u5c71": 130, "\u00b2": 131, "\u00f9": 132, "\u01b0": 133, "\u01a1": 134, "\u00eb": 135, "\u012b": 136, "\u02bb": 137, "\u0144": 138, "\u00fb": 139, "`": 140, "\u00e2": 141, "\u5927": 142, "\u011b": 143, "\u0151": 144, "\u0639": 145, "\u062f": 146, "\u0647": 147, "\u00ef": 148, "\u0219": 149, "\u0642": 150, "\u00ab": 151, "\u00bb": 152, "\u03c0": 153, "\u03b9": 154, "\u03bf": 155, "\u03c3": 156, "\u03c2": 157, "\u03bc": 158, "\u03ad": 
159, "\u03c1": 160, "\u03bd": 161, "\u2026": 162, "\u02bf": 163, "\u0e23": 164, "\u0e32": 165, "\u0113": 166, "\u0146": 167, "\u738b": 168, "\u30fc": 169, "\u05d9": 170, "\u05dc": 171, "\u05d0": 172, "\u05d5": 173, "\u05e8": 174, "\u05d4": 175, "\u00f5": 176, "\u03bb": 177, "\u00bd": 178, "\u1e6d": 179, "\u0641": 180, "\u0633": 181, "\u06cc": 182, "=": 183, "\u1ec5": 184, "\u062c": 185, "\u03c7": 186, "\u03b1": 187, "\u03ac": 188, "\u03c4": 189, "\u03b7": 190, "\u015f": 191, "\u0440": 192, "\u0433": 193, "\u0443": 194, "\u0442": 195, "\u0438": 196, "\u0441": 197, "\u043a": 198, "\u044f": 199, "\u0434": 200, "\u043e": 201, "\u043b": 202, "\u1ea1": 203, "\u1ed3": 204, "\u1ec7": 205, "\u2032": 206, "*": 207, "\u03b5": 208, "\u0148": 209, "\u00f0": 210, "\u093e": 211, "\u0930": 212, "\u0940": 213, "\u0163": 214, "\u0103": 215, "\u1ecb": 216, "\u0623": 217, "\u0435": 218, "\u0432": 219, "\u043f": 220, "\u044b": 221, "\u093f": 222, "\u1ea7": 223, "\u1ea3": 224, "\u05d1": 225, "\u05ea": 226, "|": 227, "\u00b4": 228, "\u0632": 229, "\u0153": 230, "\u0445": 231, "\u03ba": 232, "\u03af": 233, "\u03b4": 234, "\u03cc": 235, "\u03c5": 236, "\u0437": 237, "\u0447": 238, "\u0439": 239, "\u0456": 240, "\u0259": 241, "\u03b8": 242, "\u0301": 243, "\u0634": 244, "\u0436": 245, "\u03b3": 246, "\u0448": 247, "\u062a": 248, "\u09be": 249, "~": 250, "\u03c9": 251, "\u044c": 252, "\u0444": 253, "\u094d": 254, "\u0446": 255, "\u0643": 256, "\\": 257, "\u02c8": 258, "\u0bcd": 259, "\u05de": 260, "\u014f": 261, "\u2033": 262, "\u0561": 263}

编写python脚本使char2id.json以层级形式进行输出,方便直观地观察文本。

初步分析

char2id.json共有一行,读取之后可以转变为一个字典格式的数据,这个字典深度为一层,共有264项数据,每一项的key为字符(或字符的Unicode编码形式),value为序列号。其中key包含“BLANK”、“UNK”、a-z、0-9,以及 - ( ) ; , ' . / : " % $ & ! ? [ ] _ # + ` = * | ~ \ 这26个符号,共64项ASCII字符,其余200项为以Unicode编码(\uXXXX)表示的字符。

第一次尝试

从char2id.json文件中读取内容后,直接转为字典形式,并按照每项数据一行的形式进行保存。

import json

# Read the raw JSON text. The context manager closes the file even if reading fails.
with open("./prepro_data/char2id.json", "r", encoding="utf-8") as file_read:
    js = file_read.read()
# Parse the JSON text into a dict (string-to-dict method 1).
dic = json.loads(js)
# String-to-dict method 2 would be eval(js); avoid it, because eval executes
# whatever expression the file happens to contain.
print(dic)
# Save str(dic), i.e. the Python repr of the dict, not json.dumps output.
# Mode 'w' creates the file when it does not exist yet.
with open("./hightlight_data/char2id_1.json", mode='w', encoding='utf-8') as file_save:
    file_save.write(str(dic))

结果:

在这里插入图片描述
在这里插入图片描述

保存为一行,并且key由原来的双引号包括变为单引号包括(在pycharm中双引号包括不会标红,但单引号包括会标红),最重要的是Unicode编码全变成了对应的字符。

第二次尝试

解决Unicode编码全变成了对应字符的问题。文件中出现的Unicode编码位于“\u00a3”到“\u738b”之间,而常用中文字符的编码范围是“\u4e00”到“\u9fa5”,因此参考[1]的方法并扩大判断范围进行处理。

[1]的初始代码:

def unicode_to_str(unicode_str):
    r"""Decode literal escape sequences such as ``\u4f60`` in *unicode_str*.

    Args:
        unicode_str: Text that may contain backslash escapes written out as
            plain characters (a backslash, ``u`` and four hex digits, ...).

    Returns:
        The text with every escape sequence replaced by the character it
        denotes. Characters that are already non-ASCII are kept as they are.
    """
    # Encoding with the default UTF-8 would turn a real non-ASCII character into
    # several bytes, which 'unicode_escape' then decodes byte by byte as Latin-1
    # (mojibake). With 'latin-1' + 'backslashreplace', characters up to U+00FF
    # stay single bytes and anything above becomes an escape sequence itself,
    # so both survive the round trip.
    return unicode_str.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def str_to_unicode(string):
    r"""Spell out the characters U+4E00..U+9FFF of *string* as ``\uXXXX`` text.

    Characters outside that range are copied to the result unchanged.
    """
    def _spell(ch):
        # Only code points inside the range are rewritten.
        if ch < '\u4e00' or ch > '\u9fff':
            return ch
        return '\\u' + format(ord(ch), 'x')

    return ''.join(_spell(ch) for ch in string)


if __name__ == "__main__":
    # Demo: escape two Chinese characters, then decode an escaped string again.
    escaped = str_to_unicode('你好')

    # Prints the escapes as plain text: \u4f60\u597d
    print(escaped)
    # repr() shows each backslash doubled: '\\u4f60\\u597d'
    print(repr(escaped))
    # Decoding the escapes prints the two original characters.
    restored = unicode_to_str('\\u4f60\\u597d')
    print(restored)

改编之后的代码:

import json

def str_to_unicode(string):
    r"""Return *string* with selected non-ASCII characters written as ``\uXXXX``.

    Every character from U+00A3 to U+738B (inclusive) is replaced by a
    backslash, the letter ``u`` and its code point as four lowercase hex
    digits. Every other character is copied unchanged.

    Args:
        string: The text to convert; it may have any length.

    Returns:
        The converted text.
    """
    pieces = []
    for ch in string:
        if '\u00a3' <= ch <= '\u738b':
            # Pad each code point on its own. Padding based on the length of
            # the whole result is only right for one-character input.
            pieces.append("\\u{:04x}".format(ord(ch)))
        else:
            pieces.append(ch)
    return "".join(pieces)


def json2str(json_file, save_path):
    """Re-save a flat JSON object with its non-ASCII keys spelled as escapes.

    Args:
        json_file: Path of the JSON file to read.
        save_path: Path of the file to write; it is created when missing.

    The parsed dict is printed, every key is passed through
    ``str_to_unicode``, and the Python ``str()`` of the resulting dict
    (not ``json.dumps`` output) is written to *save_path* as UTF-8.
    """
    # Context managers close both files even when parsing or writing fails.
    with open(json_file, "r", encoding="utf-8") as file_read:
        js = file_read.read()
    # Parse the JSON text into a dict; eval(js) would also work on this data
    # but executes arbitrary code, so json.loads is used.
    dic = json.loads(js)
    print(dic)
    # A dict comprehension keeps the original key order.
    escaped = {str_to_unicode(key): value for key, value in dic.items()}
    with open(save_path, mode='w', encoding='utf-8') as file_save:
        file_save.write(str(escaped))


if __name__ == "__main__":
    # Source metadata file, then the destination of the converted copy.
    file_path = "./prepro_data/char2id.json"
    save_path = "./hightlight_data/char2id_1.json"
    json2str(json_file=file_path, save_path=save_path)

Unicode编码不再变为对应的字符:

在这里插入图片描述

再用[2]中的方法以层级方式进行输出保存,需要注意json2str()中对双引号(")和反斜杠(\)这两个特殊字符key的转义处理:

import json
from collections import abc

def str_to_unicode(string):
    r"""Return *string* with selected non-ASCII characters written as ``\uXXXX``.

    Every character from U+00A3 to U+738B (inclusive) is replaced by a
    backslash, the letter ``u`` and its code point as four lowercase hex
    digits. Every other character is copied unchanged.

    Args:
        string: The text to convert; it may have any length.

    Returns:
        The converted text.
    """
    pieces = []
    for ch in string:
        # The reference implementation only covered U+4E00..U+9FFF; the range
        # here is widened to U+00A3..U+738B.
        if '\u00a3' <= ch <= '\u738b':
            # Pad each code point on its own. Padding based on the length of
            # the whole result is only right for one-character input.
            pieces.append("\\u{:04x}".format(ord(ch)))
        else:
            pieces.append(ch)
    return "".join(pieces)

def mapping2str_v2(mapping: abc.Mapping, *, prefix: str = "    ", lvl: int = 0, max_lvl: int = 3) -> str:
    """Render *mapping* as indented, brace-delimited text with one item per line.

    Args:
        mapping: The mapping to render. Keys must be strings; they are wrapped
            in double quotes exactly as given, without any escaping.
        prefix: Indentation unit, repeated once per nesting level.
        lvl: Nesting level of *mapping* itself (0 for the outermost call).
        max_lvl: Level at which expansion stops; a mapping at this level is
            rendered on a single line with ``str()``.

    Returns:
        The rendered text. Values that are not mappings are rendered with
        ``str()``.
    """
    if lvl == max_lvl:
        # Deepest level allowed: emit the whole mapping on one line.
        return str(mapping)

    sub_lvl = lvl + 1
    sub_prefix = prefix * sub_lvl
    last_index = len(mapping) - 1

    lines = ["{"]
    for i, (k, v) in enumerate(mapping.items()):
        line = sub_prefix + '"' + k + '"' + ": "
        if isinstance(v, abc.Mapping):
            line += mapping2str_v2(v, prefix=prefix, lvl=sub_lvl, max_lvl=max_lvl)
        else:
            # Non-mapping items of any level are written via str().
            line += str(v)
        # Every item except the last one is followed by a comma.
        if i != last_index:
            line += ','
        lines.append(line)
    lines.append(prefix * lvl + "}")
    return "\n".join(lines)

def json2str(json_file, save_path):
    """Re-save a flat JSON object with one ``"key": value`` pair per line.

    Args:
        json_file: Path of the JSON file to read.
        save_path: Path of the file to write; it is created when missing.

    Keys are converted back to JSON source form: backslashes and double
    quotes are escaped, then ``str_to_unicode`` spells out the non-ASCII
    characters it covers. The list of converted pairs is printed, and the
    text produced by ``mapping2str_v2(..., max_lvl=1)`` is written to
    *save_path* as UTF-8.
    """
    # 1. Read the file; the context manager closes it even on error.
    with open(json_file, "r", encoding="utf-8") as file_read:
        js = file_read.read()
    # 2. Parse the JSON text into a dict; eval(js) would also work on this
    #    data but executes arbitrary code, so json.loads is used.
    dic = json.loads(js)
    # 3. Rebuild the pairs in their original order with escaped keys.
    key_value_list = []
    for key, value in dic.items():
        # Escape the backslash first, so the backslashes added for quotes and
        # for \u escapes are not doubled afterwards. Doing this for every key
        # replaces patching fixed positions, which only fits one file.
        json_key = key.replace('\\', '\\\\').replace('"', '\\"')
        key_value_list.append([str_to_unicode(json_key), value])
    print(key_value_list)
    dic = dict(key_value_list)
    # 4. Render the dict one item per line and save it.
    text = mapping2str_v2(dic, max_lvl=1)
    with open(save_path, mode='w', encoding='utf-8') as file_save:
        file_save.write(text)


if __name__ == "__main__":
    # Input metadata file and the output file for the layered rendering.
    file_path = "./prepro_data/char2id.json"
    save_path = "./hightlight_data/char2id_1.json"
    json2str(json_file=file_path, save_path=save_path)

输出结果:

在这里插入图片描述

第三次尝试

第二次尝试中先将json文件读取为字典,再使用str_to_unicode()函数将字符变为对应的Unicode编码,最后将字典以层级形式输出。这里尝试不再先将json文件解析为字典,而是直接把文件内容作为字符串读入,再对字符串进行切分处理。由于切分时会用到“,”和“:”,当key本身就是这两个字符时需要进行特殊处理。同样地,第二次尝试中需要特殊处理的情况,对于其他文本可能不止两种。

from collections import abc

def mapping2str(mapping: abc.Mapping, *, prefix: str = "    ", lvl: int = 0, max_lvl: int = 3) -> str:
    """Render *mapping* as indented, brace-delimited text with one item per line.

    Keys are written exactly as given (no quotes are added), so they must be
    strings. *prefix* is the indentation unit, *lvl* the nesting level of
    *mapping*, and a mapping located at *max_lvl* is emitted on one line via
    ``str()``. Values that are not mappings are rendered with ``str()``.
    """
    if lvl == max_lvl:
        # Deepest level allowed: the whole mapping goes on a single line.
        return str(mapping)

    child_lvl = lvl + 1
    indent = prefix * child_lvl
    final_index = len(mapping) - 1

    rendered = ["{"]
    for index, (key, value) in enumerate(mapping.items()):
        head = indent + key + ": "
        if isinstance(value, abc.Mapping):
            body = mapping2str(value, prefix=prefix, lvl=child_lvl, max_lvl=max_lvl)
        else:
            # Plain (non-mapping) items are written via str().
            body = str(value)
        # A comma follows every item but the last.
        tail = "" if index == final_index else ","
        rendered.append(head + body + tail)
    rendered.append(prefix * lvl + "}")
    return "\n".join(rendered)



def json2str(json_file, save_path):
    """Re-save a flat JSON object one pair per line without parsing it as JSON.

    Args:
        json_file: Path of the JSON file to read.
        save_path: Path of the file to write; it is created when missing.

    The file is treated as plain text. Each key is kept exactly as written in
    the source (quotes and escape sequences included) and each value is kept
    as its source text. The extracted pairs are printed, then rendered with
    ``mapping2str(..., max_lvl=1)`` and written to *save_path*.

    Values are expected to be scalars written without whitespace or commas
    (numbers, for instance); nested objects and string values containing
    those characters are not supported.
    """
    import re

    # 1. Read the file named by the caller (not a fixed path).
    with open(json_file, "r", encoding="utf-8") as file_read:
        js = file_read.read()
    # 2. Extract the pairs. The key pattern matches a complete double-quoted
    #    string, so keys that are "," or ":" (or contain spaces or escaped
    #    quotes) stay intact. Plain splitting on those characters would need
    #    fixing up at positions that differ from file to file.
    pair_pattern = re.compile(r'("(?:[^"\\]|\\.)*")\s*:\s*([^\s,{}]+)')
    js = [[key, value] for key, value in pair_pattern.findall(js)]
    print(js)
    dic = dict(js)
    # 3. Render the pairs one per line.
    dic = mapping2str(dic, max_lvl=1)
    # 4. Save the result; the context manager closes the file even on error.
    with open(save_path, mode='w', encoding="utf-8") as file_save:
        file_save.write(str(dic))

if __name__ == "__main__":
    # Input metadata file and the output file for the layered rendering.
    file_path = "./prepro_data/char2id.json"
    save_path = "./hightlight_data/char2id.json"
    json2str(json_file=file_path, save_path=save_path)

最终结果:

{
    "BLANK": 0,
    "UNK": 1,
    "t": 2,
    "h": 3,
    "e": 4,
    "w": 5,
    "o": 6,
    "r": 7,
    "k": 8,
    "-": 9,
    "p": 10,
    "a": 11,
    "s": 12,
    "n": 13,
    "d": 14,
    "g": 15,
    "u": 16,
    "(": 17,
    ";": 18,
    ")": 19,
    ",": 20,
    "l": 21,
    "'": 22,
    "m": 23,
    "i": 24,
    "y": 25,
    "f": 26,
    "c": 27,
    ".": 28,
    "v": 29,
    "b": 30,
    "j": 31,
    "1": 32,
    "4": 33,
    "9": 34,
    "5": 35,
    "2": 36,
    "0": 37,
    "6": 38,
    "z": 39,
    "/": 40,
    "q": 41,
    "3": 42,
    "x": 43,
    "8": 44,
    ":": 45,
    "7": 46,
    "\u2014": 47,
    "\u00e9": 48,
    "\"": 49,
    "\u014d": 50,
    "\u1e63": 51,
    "\u0101": 52,
    "%": 53,
    "\u0645": 54,
    "\u0648": 55,
    "\u0627": 56,
    "\u0631": 57,
    "\u00a3": 58,
    "$": 59,
    "\u0142": 60,
    "\u015b": 61,
    "\u0119": 62,
    "\u2019": 63,
    "&": 64,
    "\u00e8": 65,
    "\u00ed": 66,
    "\u00f1": 67,
    "\u00e1": 68,
    "\u0131": 69,
    "\u00f6": 70,
    "\u00f3": 71,
    "!": 72,
    "\u00e4": 73,
    "\u017c": 74,
    "?": 75,
    "\u00e5": 76,
    "\u00e6": 77,
    "\u0161": 78,
    "\u010d": 79,
    "\u00fd": 80,
    "\u2212": 81,
    "\u00f4": 82,
    "\u00b0": 83,
    "\u201c": 84,
    "\u201d": 85,
    "\u00f8": 86,
    "[": 87,
    "]": 88,
    "\u00df": 89,
    "_": 90,
    "#": 91,
    "\u017e": 92,
    "+": 93,
    "\u00fe": 94,
    "\u00fc": 95,
    "\u011f": 96,
    "\u00ee": 97,
    "\u0307": 98,
    "\u00e7": 99,
    "\u00e0": 100,
    "\u0111": 101,
    "\u00ec": 102,
    "\u1ebf": 103,
    "\u0117": 104,
    "\u062d": 105,
    "\u0644": 106,
    "\u064a": 107,
    "\u0629": 108,
    "\u1e25": 109,
    "\u00ea": 110,
    "\u00fa": 111,
    "\u017a": 112,
    "\u021b": 113,
    "\u0107": 114,
    "\u2018": 115,
    "\u00f2": 116,
    "\u20ac": 117,
    "\u016b": 118,
    "\u0159": 119,
    "\u0105": 120,
    "\u0430": 121,
    "\u043c": 122,
    "\u0431": 123,
    "\u043d": 124,
    "\u0628": 125,
    "\u0646": 126,
    "\u00e3": 127,
    "\u00d7": 128,
    "\u4e2d": 129,
    "\u5c71": 130,
    "\u00b2": 131,
    "\u00f9": 132,
    "\u01b0": 133,
    "\u01a1": 134,
    "\u00eb": 135,
    "\u012b": 136,
    "\u02bb": 137,
    "\u0144": 138,
    "\u00fb": 139,
    "`": 140,
    "\u00e2": 141,
    "\u5927": 142,
    "\u011b": 143,
    "\u0151": 144,
    "\u0639": 145,
    "\u062f": 146,
    "\u0647": 147,
    "\u00ef": 148,
    "\u0219": 149,
    "\u0642": 150,
    "\u00ab": 151,
    "\u00bb": 152,
    "\u03c0": 153,
    "\u03b9": 154,
    "\u03bf": 155,
    "\u03c3": 156,
    "\u03c2": 157,
    "\u03bc": 158,
    "\u03ad": 159,
    "\u03c1": 160,
    "\u03bd": 161,
    "\u2026": 162,
    "\u02bf": 163,
    "\u0e23": 164,
    "\u0e32": 165,
    "\u0113": 166,
    "\u0146": 167,
    "\u738b": 168,
    "\u30fc": 169,
    "\u05d9": 170,
    "\u05dc": 171,
    "\u05d0": 172,
    "\u05d5": 173,
    "\u05e8": 174,
    "\u05d4": 175,
    "\u00f5": 176,
    "\u03bb": 177,
    "\u00bd": 178,
    "\u1e6d": 179,
    "\u0641": 180,
    "\u0633": 181,
    "\u06cc": 182,
    "=": 183,
    "\u1ec5": 184,
    "\u062c": 185,
    "\u03c7": 186,
    "\u03b1": 187,
    "\u03ac": 188,
    "\u03c4": 189,
    "\u03b7": 190,
    "\u015f": 191,
    "\u0440": 192,
    "\u0433": 193,
    "\u0443": 194,
    "\u0442": 195,
    "\u0438": 196,
    "\u0441": 197,
    "\u043a": 198,
    "\u044f": 199,
    "\u0434": 200,
    "\u043e": 201,
    "\u043b": 202,
    "\u1ea1": 203,
    "\u1ed3": 204,
    "\u1ec7": 205,
    "\u2032": 206,
    "*": 207,
    "\u03b5": 208,
    "\u0148": 209,
    "\u00f0": 210,
    "\u093e": 211,
    "\u0930": 212,
    "\u0940": 213,
    "\u0163": 214,
    "\u0103": 215,
    "\u1ecb": 216,
    "\u0623": 217,
    "\u0435": 218,
    "\u0432": 219,
    "\u043f": 220,
    "\u044b": 221,
    "\u093f": 222,
    "\u1ea7": 223,
    "\u1ea3": 224,
    "\u05d1": 225,
    "\u05ea": 226,
    "|": 227,
    "\u00b4": 228,
    "\u0632": 229,
    "\u0153": 230,
    "\u0445": 231,
    "\u03ba": 232,
    "\u03af": 233,
    "\u03b4": 234,
    "\u03cc": 235,
    "\u03c5": 236,
    "\u0437": 237,
    "\u0447": 238,
    "\u0439": 239,
    "\u0456": 240,
    "\u0259": 241,
    "\u03b8": 242,
    "\u0301": 243,
    "\u0634": 244,
    "\u0436": 245,
    "\u03b3": 246,
    "\u0448": 247,
    "\u062a": 248,
    "\u09be": 249,
    "~": 250,
    "\u03c9": 251,
    "\u044c": 252,
    "\u0444": 253,
    "\u094d": 254,
    "\u0446": 255,
    "\u0643": 256,
    "\\": 257,
    "\u02c8": 258,
    "\u0bcd": 259,
    "\u05de": 260,
    "\u014f": 261,
    "\u2033": 262,
    "\u0561": 263
}

参考资料

  1. 【Python】字符串与unicode字符之间的转换
  2. Python之字典以层级形式输出
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值