Markdown目录生成:[TOC]
问题
如何将json文件以层级形式输出?
现有文档级关系抽取的数据集DocRED的metadata——char2id.json
:
{"BLANK": 0, "UNK": 1, "t": 2, "h": 3, "e": 4, "w": 5, "o": 6, "r": 7, "k": 8, "-": 9, "p": 10, "a": 11, "s": 12, "n": 13, "d": 14, "g": 15, "u": 16, "(": 17, ";": 18, ")": 19, ",": 20, "l": 21, "'": 22, "m": 23, "i": 24, "y": 25, "f": 26, "c": 27, ".": 28, "v": 29, "b": 30, "j": 31, "1": 32, "4": 33, "9": 34, "5": 35, "2": 36, "0": 37, "6": 38, "z": 39, "/": 40, "q": 41, "3": 42, "x": 43, "8": 44, ":": 45, "7": 46, "\u2014": 47, "\u00e9": 48, "\"": 49, "\u014d": 50, "\u1e63": 51, "\u0101": 52, "%": 53, "\u0645": 54, "\u0648": 55, "\u0627": 56, "\u0631": 57, "\u00a3": 58, "$": 59, "\u0142": 60, "\u015b": 61, "\u0119": 62, "\u2019": 63, "&": 64, "\u00e8": 65, "\u00ed": 66, "\u00f1": 67, "\u00e1": 68, "\u0131": 69, "\u00f6": 70, "\u00f3": 71, "!": 72, "\u00e4": 73, "\u017c": 74, "?": 75, "\u00e5": 76, "\u00e6": 77, "\u0161": 78, "\u010d": 79, "\u00fd": 80, "\u2212": 81, "\u00f4": 82, "\u00b0": 83, "\u201c": 84, "\u201d": 85, "\u00f8": 86, "[": 87, "]": 88, "\u00df": 89, "_": 90, "#": 91, "\u017e": 92, "+": 93, "\u00fe": 94, "\u00fc": 95, "\u011f": 96, "\u00ee": 97, "\u0307": 98, "\u00e7": 99, "\u00e0": 100, "\u0111": 101, "\u00ec": 102, "\u1ebf": 103, "\u0117": 104, "\u062d": 105, "\u0644": 106, "\u064a": 107, "\u0629": 108, "\u1e25": 109, "\u00ea": 110, "\u00fa": 111, "\u017a": 112, "\u021b": 113, "\u0107": 114, "\u2018": 115, "\u00f2": 116, "\u20ac": 117, "\u016b": 118, "\u0159": 119, "\u0105": 120, "\u0430": 121, "\u043c": 122, "\u0431": 123, "\u043d": 124, "\u0628": 125, "\u0646": 126, "\u00e3": 127, "\u00d7": 128, "\u4e2d": 129, "\u5c71": 130, "\u00b2": 131, "\u00f9": 132, "\u01b0": 133, "\u01a1": 134, "\u00eb": 135, "\u012b": 136, "\u02bb": 137, "\u0144": 138, "\u00fb": 139, "`": 140, "\u00e2": 141, "\u5927": 142, "\u011b": 143, "\u0151": 144, "\u0639": 145, "\u062f": 146, "\u0647": 147, "\u00ef": 148, "\u0219": 149, "\u0642": 150, "\u00ab": 151, "\u00bb": 152, "\u03c0": 153, "\u03b9": 154, "\u03bf": 155, "\u03c3": 156, "\u03c2": 157, "\u03bc": 158, "\u03ad": 
159, "\u03c1": 160, "\u03bd": 161, "\u2026": 162, "\u02bf": 163, "\u0e23": 164, "\u0e32": 165, "\u0113": 166, "\u0146": 167, "\u738b": 168, "\u30fc": 169, "\u05d9": 170, "\u05dc": 171, "\u05d0": 172, "\u05d5": 173, "\u05e8": 174, "\u05d4": 175, "\u00f5": 176, "\u03bb": 177, "\u00bd": 178, "\u1e6d": 179, "\u0641": 180, "\u0633": 181, "\u06cc": 182, "=": 183, "\u1ec5": 184, "\u062c": 185, "\u03c7": 186, "\u03b1": 187, "\u03ac": 188, "\u03c4": 189, "\u03b7": 190, "\u015f": 191, "\u0440": 192, "\u0433": 193, "\u0443": 194, "\u0442": 195, "\u0438": 196, "\u0441": 197, "\u043a": 198, "\u044f": 199, "\u0434": 200, "\u043e": 201, "\u043b": 202, "\u1ea1": 203, "\u1ed3": 204, "\u1ec7": 205, "\u2032": 206, "*": 207, "\u03b5": 208, "\u0148": 209, "\u00f0": 210, "\u093e": 211, "\u0930": 212, "\u0940": 213, "\u0163": 214, "\u0103": 215, "\u1ecb": 216, "\u0623": 217, "\u0435": 218, "\u0432": 219, "\u043f": 220, "\u044b": 221, "\u093f": 222, "\u1ea7": 223, "\u1ea3": 224, "\u05d1": 225, "\u05ea": 226, "|": 227, "\u00b4": 228, "\u0632": 229, "\u0153": 230, "\u0445": 231, "\u03ba": 232, "\u03af": 233, "\u03b4": 234, "\u03cc": 235, "\u03c5": 236, "\u0437": 237, "\u0447": 238, "\u0439": 239, "\u0456": 240, "\u0259": 241, "\u03b8": 242, "\u0301": 243, "\u0634": 244, "\u0436": 245, "\u03b3": 246, "\u0448": 247, "\u062a": 248, "\u09be": 249, "~": 250, "\u03c9": 251, "\u044c": 252, "\u0444": 253, "\u094d": 254, "\u0446": 255, "\u0643": 256, "\\": 257, "\u02c8": 258, "\u0bcd": 259, "\u05de": 260, "\u014f": 261, "\u2033": 262, "\u0561": 263}
编写python脚本使char2id.json
以层级形式进行输出,方便直观地观察文本。
初步分析
char2id.json
共有一行,读取之后可以转变为一个字典格式的数据,这个字典深度为一层,共有264项数据,每一项的key为编码形式,value为序列号。其中key包含“BLANK”、“UNK”、a-z、0-9、“-”、“(”、“)”、“;”、“,”、“'”、“.”、“/”、“:”、“"”、“%”、“$”、“&”、“!”、“?”、“[”、“]”、“_”、“#”、“+”、“`”、“=”、“*”、“|”、“~”、“\”共64个字符和200个字符的Unicode编码。
第一次尝试
从char2id.json
文件中读取内容后,直接转为字典形式,并按照每项数据一行的形式进行保存。
import json
# Read the source file. The context manager closes the handle even if reading
# fails, and the explicit encoding avoids depending on the platform default.
with open("./prepro_data/char2id.json", "r", encoding="utf-8") as file_read:
    js = file_read.read()
# Convert the JSON text into a dict.
dic = json.loads(js)  # string -> dict, option 1
# dic = eval(js)  # string -> dict, option 2 (eval is unsafe on untrusted files)
print(dic)
# Save the result. Mode 'w' creates the file when it does not exist yet.
# NOTE: str(dic) writes the Python repr of the dict on a single line, not JSON.
with open("./hightlight_data/char2id_1.json", mode='w', encoding='utf-8') as file_save:
    file_save.write(str(dic))
结果:
保存为一行,并且key由原来的双引号包括变为单引号包括(在pycharm中双引号包括不会标红,但单引号包括会标红),最重要的是Unicode编码全变成了对应的字符。
第二次尝试
解决Unicode编码全变成了对应的字符的问题。文件中Unicode编码从“\u00a3”到“\u738b”之间,中文编码是“\u4e00”到“\u9fa5”之间,用[1]的方法进行处理。
[1]的初始代码:
def unicode_to_str(unicode_str):
    r"""Decode literal backslash escapes such as ``\u4f60`` inside *unicode_str*.

    Characters that are already plain (non-escaped) text, including non-ASCII
    ones, are returned unchanged.

    Args:
        unicode_str: Text that may contain escape sequences written out as
            literal backslash sequences.

    Returns:
        The text with every escape sequence replaced by the character it names.
    """
    # 'unicode_escape' interprets raw bytes as latin-1, so the input must be
    # encoded as latin-1 as well. Characters outside latin-1 are emitted as
    # backslash escapes by 'backslashreplace' and decoded back by the codec.
    # A plain .encode() (UTF-8) would turn such characters into mojibake.
    return unicode_str.encode('latin-1', 'backslashreplace').decode('unicode_escape')
def str_to_unicode(string):
    r"""Replace CJK characters (U+4E00..U+9FFF) in *string* with ``\uXXXX`` text.

    Args:
        string: Text to convert.

    Returns:
        A copy of *string* in which every character inside the range is written
        as a backslash, the letter ``u`` and four lowercase hex digits; all
        other characters are copied unchanged.
    """
    # Build the result with a single join instead of repeated string +=.
    return ''.join(
        f'\\u{ord(ch):04x}' if '\u4e00' <= ch <= '\u9fff' else ch
        for ch in string
    )
if __name__ == '__main__':
    # Round-trip demo: escape two CJK characters, then decode the escapes again.
    escaped = str_to_unicode('你好')
    print(escaped)  # \u4f60\u597d
    print(repr(escaped))  # '\\u4f60\\u597d'
    restored = unicode_to_str('\\u4f60\\u597d')
    print(restored)  # 你好
改编之后的代码:
import json
def str_to_unicode(string):
    r"""Replace characters in U+00A3..U+738B with four-digit ``\uXXXX`` text.

    The range matches the code points that occur in char2id.json. Characters
    outside it are copied unchanged.

    Args:
        string: Text to convert; may contain any number of characters.

    Returns:
        The converted text, with lowercase hex digits zero-padded to width 4.
    """
    pieces = []
    for ch in string:
        if '\u00a3' <= ch <= '\u738b':
            # Pad each escape on its own. Padding based on the length of the
            # whole accumulated string only works for one-character inputs.
            pieces.append(f"\\u{ord(ch):04x}")
        else:
            pieces.append(ch)
    return "".join(pieces)
def json2str(json_file, save_path):
    """Load a JSON object, escape its keys and save the dict's text form.

    Args:
        json_file: Path of the JSON file to read.
        save_path: Path of the output file; created when it does not exist.

    Note:
        The output is ``str(dict)``, i.e. the Python repr on one line, not JSON.
    """
    # Read and parse the file; the context manager closes it on any error.
    with open(json_file, "r", encoding="utf-8") as file_read:
        dic = json.load(file_read)
    print(dic)
    # Rewrite every key through str_to_unicode. A dict keeps insertion order,
    # so the entries stay in the order of the source file.
    dic = {str_to_unicode(key): value for key, value in dic.items()}
    # Save the file (mode 'w' creates it when missing).
    with open(save_path, mode='w', encoding='utf-8') as file_save:
        file_save.write(str(dic))
if __name__ == '__main__':
    # Source JSON and the destination of the converted copy.
    file_path, save_path = "./prepro_data/char2id.json", "./hightlight_data/char2id_1.json"
    json2str(file_path, save_path)
Unicode编码不再变为对应的字符:
再用[2]中的方法以层级方式进行输出保存,需要注意json2str函数中对特殊字符(双引号“"”和反斜杠“\”)的处理:
import json
from collections import abc
def str_to_unicode(string):
    r"""Replace characters in U+00A3..U+738B with four-digit ``\uXXXX`` text.

    The range matches the code points that occur in char2id.json. Characters
    outside it are copied unchanged.

    Args:
        string: Text to convert; may contain any number of characters.

    Returns:
        The converted text, with lowercase hex digits zero-padded to width 4.
    """
    pieces = []
    for ch in string:
        if '\u00a3' <= ch <= '\u738b':  # a CJK-only variant would use '\u4e00' <= ch <= '\u9fff'
            # Pad each escape on its own. Padding based on the length of the
            # whole accumulated string only works for one-character inputs.
            pieces.append(f"\\u{ord(ch):04x}")
        else:
            pieces.append(ch)
    return "".join(pieces)
def mapping2str_v2(mapping: abc.Mapping, *, prefix: str = " ", lvl: int = 0, max_lvl: int = 3) -> str:
    """Render *mapping* as indented text, one ``"key": value`` entry per line.

    Keys are wrapped in double quotes but not escaped, so the caller must
    escape them beforehand. Non-mapping values are rendered with ``str()``.

    Args:
        mapping: The mapping to render.
        prefix: Indentation unit, repeated once per nesting level.
        lvl: Nesting level of *mapping* itself (0 for the outermost call).
        max_lvl: Level at which a mapping is no longer expanded and is
            rendered on one line with ``str()``.

    Returns:
        The rendered text. An empty mapping yields an opening and a closing
        brace on separate lines.
    """
    if lvl == max_lvl:
        # Deepest level allowed: emit the mapping as a single-line string.
        return str(mapping)

    sub_lvl = lvl + 1
    cur_prefix = prefix * lvl
    sub_prefix = prefix * sub_lvl
    last = len(mapping) - 1  # index of the entry that gets no trailing comma

    lines = ["{"]
    for i, (k, v) in enumerate(mapping.items()):
        if isinstance(v, abc.Mapping):
            rendered = mapping2str_v2(v, prefix=prefix, lvl=sub_lvl, max_lvl=max_lvl)
        else:
            rendered = str(v)  # non-mapping items at any level
        # Formatting (instead of +) also accepts keys that are not strings.
        line = f'{sub_prefix}"{k}": {rendered}'
        if i != last:
            line += ','
        lines.append(line)
    lines.append(cur_prefix + "}")
    return "\n".join(lines)
def json2str(json_file, save_path):
    """Load a JSON object and save it with one ``"key": value`` entry per line.

    Args:
        json_file: Path of the JSON file to read.
        save_path: Path of the output file; created when it does not exist.
    """
    # 1. Read and parse the file; the context manager closes it on any error.
    with open(json_file, "r", encoding="utf-8") as file_read:
        dic = json.load(file_read)
    # 2. Escape the keys while keeping the order of the source file.
    key_value_list = []
    for key, value in dic.items():
        # json.dumps escapes quotes, backslashes and control characters in any
        # key, so no positions have to be hard-coded for a particular file.
        # ensure_ascii=False leaves non-ASCII characters for str_to_unicode,
        # and [1:-1] drops the surrounding quotes, which mapping2str_v2 adds.
        escaped_key = json.dumps(key, ensure_ascii=False)[1:-1]
        key_value_list.append([str_to_unicode(escaped_key), value])
    print(key_value_list)
    dic = dict(key_value_list)
    # 3. Render the dict in its hierarchical text form.
    text = mapping2str_v2(dic, max_lvl=1)
    # 4. Save the file (mode 'w' creates it when missing).
    with open(save_path, mode='w', encoding='utf-8') as file_save:
        file_save.write(text)
if __name__ == '__main__':
    # Source JSON and the destination of the hierarchical copy.
    file_path, save_path = "./prepro_data/char2id.json", "./hightlight_data/char2id_1.json"
    json2str(file_path, save_path)
输出结果:
第三次尝试
第二次尝试中先将json文件读取为字典,再使用str_to_unicode()函数将字符变为对应的Unicode编码,最后将字典以层级形式输出。这里不再先将json文件读为字典,而是将其读取为字符串后直接对字符串进行处理。由于字符串分割时会用到“,”和“:”,因此需要对key中出现的这两个字符进行特殊处理。另外,第二次尝试中需要特殊处理的情况,对于其他文本可能不止两种。
from collections import abc
def mapping2str(mapping: abc.Mapping, *, prefix: str = " ", lvl: int = 0, max_lvl: int = 3) -> str:
    """Render *mapping* as indented text, one ``key: value`` entry per line.

    Keys are emitted exactly as given, with no quotes added, so the caller
    supplies keys that already carry their quotes. Non-mapping values are
    rendered with ``str()``.

    Args:
        mapping: The mapping to render.
        prefix: Indentation unit, repeated once per nesting level.
        lvl: Nesting level of *mapping* itself (0 for the outermost call).
        max_lvl: Level at which a mapping is no longer expanded and is
            rendered on one line with ``str()``.

    Returns:
        The rendered text. An empty mapping yields an opening and a closing
        brace on separate lines.
    """
    if lvl == max_lvl:
        # Deepest level allowed: emit the mapping as a single-line string.
        return str(mapping)

    sub_lvl = lvl + 1
    cur_prefix = prefix * lvl
    sub_prefix = prefix * sub_lvl
    last = len(mapping) - 1  # index of the entry that gets no trailing comma

    lines = ["{"]
    for i, (k, v) in enumerate(mapping.items()):
        if isinstance(v, abc.Mapping):
            rendered = mapping2str(v, prefix=prefix, lvl=sub_lvl, max_lvl=max_lvl)
        else:
            rendered = str(v)  # non-mapping items at any level
        # Formatting (instead of +) also accepts keys that are not strings.
        line = f"{sub_prefix}{k}: {rendered}"
        if i != last:
            line += ','
        lines.append(line)
    lines.append(cur_prefix + "}")
    return "\n".join(lines)
def _split_top_level(text, sep):
    """Split *text* on *sep*, skipping separators inside strings or brackets.

    Double-quoted string literals are copied verbatim, quotes and backslash
    escapes included. Whitespace outside string literals is dropped.
    """
    parts = []
    current = []
    in_string = False
    escaped = False
    depth = 0
    for ch in text:
        if in_string:
            current.append(ch)
            if escaped:
                escaped = False  # the character after a backslash is literal
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch.isspace():
            continue
        if ch == sep and depth == 0:
            parts.append("".join(current))
            current = []
            continue
        if ch == '"':
            in_string = True
        elif ch in "[{":
            depth += 1
        elif ch in "]}":
            depth -= 1
        current.append(ch)
    parts.append("".join(current))
    return parts


def json2str(json_file, save_path):
    """Re-format a JSON object file as text, one entry per line, without parsing it.

    The file content is handled purely as a string, so escape sequences in
    keys stay exactly as written in the source.

    Args:
        json_file: Path of the JSON file to read; its top level must be an object.
        save_path: Path of the output file; created when it does not exist.
    """
    # 1. Read the file given by the caller.
    with open(json_file, "r", encoding="utf-8") as file_read:
        js = file_read.read()
    # 2. Strip the outer braces, then split into entries and key/value pairs.
    #    The splitter ignores ',' and ':' inside quoted strings, so keys such
    #    as "," or ":" need no per-file index fix-ups.
    body = js.strip()[1:-1]
    pairs = [_split_top_level(item, ":") for item in _split_top_level(body, ",") if item]
    print(pairs)
    dic = dict(pairs)
    # 3. Render the dict in its hierarchical text form.
    text = mapping2str(dic, max_lvl=1)
    # 4. Save the file (mode 'w' creates it when missing).
    with open(save_path, mode='w', encoding='utf-8') as file_save:
        file_save.write(text)
if __name__ == '__main__':
    # Source JSON and the destination of the hierarchical copy.
    file_path, save_path = "./prepro_data/char2id.json", "./hightlight_data/char2id.json"
    json2str(file_path, save_path)
最终结果:
{
"BLANK": 0,
"UNK": 1,
"t": 2,
"h": 3,
"e": 4,
"w": 5,
"o": 6,
"r": 7,
"k": 8,
"-": 9,
"p": 10,
"a": 11,
"s": 12,
"n": 13,
"d": 14,
"g": 15,
"u": 16,
"(": 17,
";": 18,
")": 19,
",": 20,
"l": 21,
"'": 22,
"m": 23,
"i": 24,
"y": 25,
"f": 26,
"c": 27,
".": 28,
"v": 29,
"b": 30,
"j": 31,
"1": 32,
"4": 33,
"9": 34,
"5": 35,
"2": 36,
"0": 37,
"6": 38,
"z": 39,
"/": 40,
"q": 41,
"3": 42,
"x": 43,
"8": 44,
":": 45,
"7": 46,
"\u2014": 47,
"\u00e9": 48,
"\"": 49,
"\u014d": 50,
"\u1e63": 51,
"\u0101": 52,
"%": 53,
"\u0645": 54,
"\u0648": 55,
"\u0627": 56,
"\u0631": 57,
"\u00a3": 58,
"$": 59,
"\u0142": 60,
"\u015b": 61,
"\u0119": 62,
"\u2019": 63,
"&": 64,
"\u00e8": 65,
"\u00ed": 66,
"\u00f1": 67,
"\u00e1": 68,
"\u0131": 69,
"\u00f6": 70,
"\u00f3": 71,
"!": 72,
"\u00e4": 73,
"\u017c": 74,
"?": 75,
"\u00e5": 76,
"\u00e6": 77,
"\u0161": 78,
"\u010d": 79,
"\u00fd": 80,
"\u2212": 81,
"\u00f4": 82,
"\u00b0": 83,
"\u201c": 84,
"\u201d": 85,
"\u00f8": 86,
"[": 87,
"]": 88,
"\u00df": 89,
"_": 90,
"#": 91,
"\u017e": 92,
"+": 93,
"\u00fe": 94,
"\u00fc": 95,
"\u011f": 96,
"\u00ee": 97,
"\u0307": 98,
"\u00e7": 99,
"\u00e0": 100,
"\u0111": 101,
"\u00ec": 102,
"\u1ebf": 103,
"\u0117": 104,
"\u062d": 105,
"\u0644": 106,
"\u064a": 107,
"\u0629": 108,
"\u1e25": 109,
"\u00ea": 110,
"\u00fa": 111,
"\u017a": 112,
"\u021b": 113,
"\u0107": 114,
"\u2018": 115,
"\u00f2": 116,
"\u20ac": 117,
"\u016b": 118,
"\u0159": 119,
"\u0105": 120,
"\u0430": 121,
"\u043c": 122,
"\u0431": 123,
"\u043d": 124,
"\u0628": 125,
"\u0646": 126,
"\u00e3": 127,
"\u00d7": 128,
"\u4e2d": 129,
"\u5c71": 130,
"\u00b2": 131,
"\u00f9": 132,
"\u01b0": 133,
"\u01a1": 134,
"\u00eb": 135,
"\u012b": 136,
"\u02bb": 137,
"\u0144": 138,
"\u00fb": 139,
"`": 140,
"\u00e2": 141,
"\u5927": 142,
"\u011b": 143,
"\u0151": 144,
"\u0639": 145,
"\u062f": 146,
"\u0647": 147,
"\u00ef": 148,
"\u0219": 149,
"\u0642": 150,
"\u00ab": 151,
"\u00bb": 152,
"\u03c0": 153,
"\u03b9": 154,
"\u03bf": 155,
"\u03c3": 156,
"\u03c2": 157,
"\u03bc": 158,
"\u03ad": 159,
"\u03c1": 160,
"\u03bd": 161,
"\u2026": 162,
"\u02bf": 163,
"\u0e23": 164,
"\u0e32": 165,
"\u0113": 166,
"\u0146": 167,
"\u738b": 168,
"\u30fc": 169,
"\u05d9": 170,
"\u05dc": 171,
"\u05d0": 172,
"\u05d5": 173,
"\u05e8": 174,
"\u05d4": 175,
"\u00f5": 176,
"\u03bb": 177,
"\u00bd": 178,
"\u1e6d": 179,
"\u0641": 180,
"\u0633": 181,
"\u06cc": 182,
"=": 183,
"\u1ec5": 184,
"\u062c": 185,
"\u03c7": 186,
"\u03b1": 187,
"\u03ac": 188,
"\u03c4": 189,
"\u03b7": 190,
"\u015f": 191,
"\u0440": 192,
"\u0433": 193,
"\u0443": 194,
"\u0442": 195,
"\u0438": 196,
"\u0441": 197,
"\u043a": 198,
"\u044f": 199,
"\u0434": 200,
"\u043e": 201,
"\u043b": 202,
"\u1ea1": 203,
"\u1ed3": 204,
"\u1ec7": 205,
"\u2032": 206,
"*": 207,
"\u03b5": 208,
"\u0148": 209,
"\u00f0": 210,
"\u093e": 211,
"\u0930": 212,
"\u0940": 213,
"\u0163": 214,
"\u0103": 215,
"\u1ecb": 216,
"\u0623": 217,
"\u0435": 218,
"\u0432": 219,
"\u043f": 220,
"\u044b": 221,
"\u093f": 222,
"\u1ea7": 223,
"\u1ea3": 224,
"\u05d1": 225,
"\u05ea": 226,
"|": 227,
"\u00b4": 228,
"\u0632": 229,
"\u0153": 230,
"\u0445": 231,
"\u03ba": 232,
"\u03af": 233,
"\u03b4": 234,
"\u03cc": 235,
"\u03c5": 236,
"\u0437": 237,
"\u0447": 238,
"\u0439": 239,
"\u0456": 240,
"\u0259": 241,
"\u03b8": 242,
"\u0301": 243,
"\u0634": 244,
"\u0436": 245,
"\u03b3": 246,
"\u0448": 247,
"\u062a": 248,
"\u09be": 249,
"~": 250,
"\u03c9": 251,
"\u044c": 252,
"\u0444": 253,
"\u094d": 254,
"\u0446": 255,
"\u0643": 256,
"\\": 257,
"\u02c8": 258,
"\u0bcd": 259,
"\u05de": 260,
"\u014f": 261,
"\u2033": 262,
"\u0561": 263
}