import pandas as pd
import pymysql
import json
conn = pymysql.connect(
)
#读取ht_contract_party表
sql = 'select * from ht_contract_party'
df = pd.read_sql(sql,conn)
sql = 'select * from ht_contract_party'
df = pd.read_sql(sql,conn)
data_json = df.iloc[0]['json']
list_keys = list(json.loads(data_json).keys())
# 展开key
def get_full_dict_list(df): # df存储了一个ht_contract_party二维表
# 空列表,用来储存key
list_keys_all = []
for i in range(len(df)):
# 读取行记录中的json
data_json = json.loads(df.iloc[i]['json']) # 从df的第i行的json列中提取JSON格式的字符串,将其解析为 Python 字典
list_keys = list(data_json.keys()) # 将python字典里的key存入list_keys
# 循环遍历key
for k in list_keys:
# 判断,如果key在列表中,跳过,没有的话加入
if k not in list_keys_all:
list_keys_all.append(k)
# 嵌套展开,如果嵌套为json(字典),展开,用一级加下划线加二级命名
if type(data_json[k]) == dict:
list_sub_keys = list(data_json[k].keys()) #将字典 data_json[k] 的所有键提取出来,并将它们转换为一个列表
for sk in list_sub_keys:
sub_key_full_name = k + '_' + sk
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
elif type(data_json[k]) == list:
for contact in data_json[k]:
for ck, cv in contact.items():
sub_key_full_name = k + '_' + ck
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
return (list_keys_all)
9.4上午更新
import pandas as pd
import pymysql
import json
import openpyxl as xl
conn = pymysql.connect(
)
# 读取ht_contract_party表
sql = 'select * from ht_contract'
df = pd.read_sql(sql, conn)
# 展开key
def get_full_dict_list(df, max_levels=5):
# 空列表,用来储存key
list_keys_all = []
list_keys=[]
for i in range(len(df)):
# 读取行记录中的json
try:
data_json = json.loads(df.iloc[i]['json']) # 从df的第i行的json列中提取JSON格式的字符串,将其解析为 Python 字典
if isinstance(data_json, dict):
list_keys = list(data_json.keys()) # 将python字典里的key存入list_keys
print(f"字典类型:{data_json}")
elif isinstance(data_json, str):
print(f"字符串类型:{data_json}")
except json.JSONDecodeError as e:
print(f"JSON 解析错误: {e}")
continue
# 循环遍历key
for k in list_keys:
# 判断,如果key在列表中,跳过,没有的话加入
if k not in list_keys_all:
list_keys_all.append(k)
# 嵌套展开,如果嵌套为json(字典),展开,用一级加下划线加二级命名
if type(data_json[k]) == dict and max_levels > 0:
list_sub_keys = list(data_json[k].keys()) # 将字典 data_json[k] 的所有键提取出来,并将它们转换为一个列表
for sk in list_sub_keys:
sub_key_full_name = k + '_' + sk
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
# 递归处理更深层级
if max_levels > 1:
sub_data = data_json[k][sk]
sub_keys = get_full_dict_list(pd.DataFrame([{'json': json.dumps(sub_data)}]), max_levels - 1)
for sub_key in sub_keys:
full_sub_key = k + '_' + sk + '_' + sub_key
if full_sub_key not in list_keys_all:
list_keys_all.append(full_sub_key)
elif type(data_json[k]) == list:
for contact in data_json[k]:
for ck, cv in contact.items():
sub_key_full_name = k + '_' + ck
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
return list_keys_all
9.4 15:30更新
import pandas as pd
import pymysql
import json
import openpyxl as xl
conn = pymysql.connect(
)
# 读取ht_contract_party表
sql = 'select * from ht_contract'
df = pd.read_sql(sql, conn)
# 展开key
def get_full_dict_list(df, max_levels=5):
# 空列表,用来储存key
list_keys_all = []
list_keys=[]
for i in range(len(df)):
# 读取行记录中的json
try:
data_json = json.loads(df.iloc[i]['json']) # 从df的第i行的json列中提取JSON格式的字符串,将其解析为 Python 字典
if isinstance(data_json, dict):
list_keys = list(data_json.keys()) # 将python字典里的key存入list_keys
# print(f"字典类型:{data_json}")
elif isinstance(data_json, str):
continue
# print(f"字符串类型:{data_json}")
except json.JSONDecodeError as e:
print(f"JSON 解析错误: {e}")
continue
# 循环遍历key
for k in list_keys:
# 判断,如果key在列表中,跳过,没有的话加入
if k not in list_keys_all:
list_keys_all.append(k)
# 嵌套展开,如果嵌套为json(字典),展开,用一级加下划线加二级命名
if type(data_json[k]) == dict and max_levels > 0:
list_sub_keys = list(data_json[k].keys()) # 将字典 data_json[k] 的所有键提取出来,并将它们转换为一个列表
for sk in list_sub_keys:
sub_key_full_name = k + '_' + sk
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
# 递归处理更深层级
if max_levels > 1:
sub_data = data_json[k][sk]
sub_keys = get_full_dict_list(pd.DataFrame([{'json': json.dumps(sub_data)}]), max_levels - 1)
for sub_key in sub_keys:
full_sub_key = k + '_' + sk + '_' + sub_key
if full_sub_key not in list_keys_all:
list_keys_all.append(full_sub_key)
elif type(data_json[k]) == list:
for contact in data_json[k]:
for ck, cv in contact.items():
sub_key_full_name = k + '_' + ck
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
return list_keys_all
# 按顺序获取最大长度和类型
def get_max_length(df, list_keys_all):
list_max_length = []
list_types = []
for list_key in list_keys_all:
max_len = 0
list_type_key = set()
if '_' in list_key and list_key[0] != '_':
keys = list_key.split('_')
key = keys[0]
sub_key = '_'.join(keys[1:])
for js in range(len(df)):
data_json = json.loads(df.iloc[js]['json'])
try:
if data_json[key] is not None and sub_key in data_json[key]:
value = data_json[key][sub_key]
else:
value = None # 或者设置为默认值
length = len(str(value))
value_type = str(type(value))[8:-2]
list_type_key.add(value_type)
except KeyError:
value = None # 没有找到子键,可以设置为默认值
length = 0
if length > max_len:
max_len = length
else:
key = list_key # 没有下划线的键,将其作为子键
for js in range(len(df)):
data_json = json.loads(df.iloc[js]['json'])
try:
if data_json[key] is not None:
value = data_json[key]
else:
value = None # 或者设置为默认值
length = len(str(value))
value_type = str(type(value))[8:-2]
list_type_key.add(value_type)
except KeyError:
value = None # 没有找到键,可以设置为默认值
length = 0
if length > max_len:
max_len = length
list_max_length.append(max_len)
list_types.append(list(list_type_key))
return list_max_length, list_types
list_keys_all = get_full_dict_list(df, max_levels=5)
results = get_max_length(df,list_keys_all)
df_result = pd.DataFrame(columns=['key','length','type'])
df_result['key'] = list_keys_all
df_result['length'] = results[0]
df_result['type'] = results[1]
print(df_result)
9.4号16:37更新
import pandas as pd
import pymysql
import json
import openpyxl as xl
conn = pymysql.connect(
)
# 读取ht_contract_party表
sql = 'select * from ht_contract'
df = pd.read_sql(sql, conn)
# 展开key
def get_full_dict_list(df, max_levels=5):
# 空列表,用来储存key
list_keys_all = []
list_keys=[]
list_max_length=[]
list_type=set()
for i in range(len(df)):
max_length=0
# 读取行记录中的json
try:
data_json = json.loads(df.iloc[i]['json']) # 从df的第i行的json列中提取JSON格式的字符串,将其解析为 Python 字典
if isinstance(data_json, dict):
list_keys = list(data_json.keys()) # 将python字典里的key存入list_keys
# print(f"字典类型:{data_json}")
elif isinstance(data_json, str):
continue
# print(f"字符串类型:{data_json}")
length = len(str(value))
value_type=str(type(data_json))[8:-2]
list_type.add(value_type)
except json.JSONDecodeError as e:
print(f"JSON 解析错误: {e}")
continue
if length>max_length:
max_length=length
list_max_length.append(max_length)
# 循环遍历key
for k in list_keys:
# 判断,如果key在列表中,跳过,没有的话加入
if k not in list_keys_all:
list_keys_all.append(k)
# 嵌套展开,如果嵌套为json(字典),展开,用一级加下划线加二级命名
if type(data_json[k]) == dict and max_levels > 0:
list_sub_keys = list(data_json[k].keys()) # 将字典 data_json[k] 的所有键提取出来,并将它们转换为一个列表
for sk in list_sub_keys:
sub_key_full_name = k + '_' + sk
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
length = len(str(data_json[k]))
value_type = str(type(data_json[k]))[8:-2]
list_type.add(value_type)
if length > max_length:
max_length = length
list_max_length.append(max_length)
# 递归处理更深层级
if max_levels > 1:
sub_data = data_json[k][sk]
sub_keys = get_full_dict_list(pd.DataFrame([{'json': json.dumps(sub_data)}]), max_levels - 1)
for sub_key in sub_keys:
full_sub_key = k + '_' + sk + '_' + sub_key
if full_sub_key not in list_keys_all:
list_keys_all.append(full_sub_key)
elif type(data_json[k]) == list:
for contact in data_json[k]:
for ck, cv in contact.items():
sub_key_full_name = k + '_' + ck
if sub_key_full_name not in list_keys_all:
list_keys_all.append(sub_key_full_name)
return list_keys_all
df_result['key'] = list_keys_all
df_result['length'] = results[0]
df_result['type'] = results[1]
print(df_result)
9.5 00:10
import pandas as pd
import pymysql
import json
import openpyxl as xl
# 连接到数据库
conn = pymysql.connect(
host="192.168.65.86",
user="root",
password="jRz&nj%RmJ9fRj5SL",
database="an_hetong"
)
cursor = conn.cursor()
cursor.execute("SHOW TABLES")
tables = [table[0] for table in cursor.fetchall()]
# 展开key
def get_full_dict_list(data_json, table, parent_key='', max_levels=5):
max_levels = int(max_levels) # 确保 max_levels 是整数
for k, v in data_json.items():
# 排除数字键
if isinstance(k, int):
continue
current_key = f"{parent_key}_{k}" if parent_key else k
if current_key in processed_keys:
continue
key_info = {
'table': table,
'key': current_key,
'length': len(str(v)),
'type': str(type(v))[8:-2]
}
key_info_list.append(key_info)
processed_keys.add(current_key) # 将处理过的键添加到集合中
if isinstance(v, dict) and max_levels > 0:
get_full_dict_list(v, table, current_key, max_levels - 1)
elif isinstance(v, list) and max_levels > 0:
for item in v:
if isinstance(item, dict):
get_full_dict_list(item, table, current_key, max_levels - 1)
# 将键的信息存储到 DataFrame
df_info = pd.DataFrame(key_info_list)
df_info.to_excel("key_info.xlsx", index=False)
for table in tables:
# 检查表格是否包含'json'列
cursor.execute(f"DESCRIBE {table}")
columns = [column[0] for column in cursor.fetchall()]
if 'json' not in columns:
print(f"表格 {table} 不包含 'json' 列,跳过处理。")
continue
sql = f'select * from {table}'
df = pd.read_sql(sql, conn)
# 尝试获取表名,如果DataFrame为空则使用table变量
table_name = df.columns[0].split("_")[0] if not df.empty else table
# 用于存储每个键的信息的字典
key_info_list = []
# 已经处理过的 key 集合
processed_keys = set()
# 循环遍历DataFrame中的每一行
for i in range(len(df)):
try:
data_json = json.loads(df.iloc[i]['json'])
get_full_dict_list(data_json, table, max_levels=5) # 传递 max_levels=5
except Exception as e:
print(f"处理表格 {table} 时发生错误: {e}")