import pandas as pd
import csv
import copy
# 处理tb
def save_tb_dict_to_csv_with_degree(filename, dictionary):
with open(filename, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['tb_code', 'out degre', 'in degre', 'fd nums'])
for key, value in dictionary.items():
data = []
data.append(key)
data.extend(value)
writer.writerow(data)
def read_data(path):
# 读取CSV文件
df = pd.read_csv(path)
# 将每一列数据保存在列表中
column_data = []
for column in df.columns:
column_data.append(df[column].tolist())
# print(len(column_data))
# print(len(column_data[0]))
# print(column_data[0][0])
# 显示列表中的数据(可选)
# for col_data in column_data:
# print(len(col_data))
print(len(column_data))
print(len(column_data[0]))
return column_data
path = './re_code_all.csv'
# 取数据
datas = read_data(path)
# 进一步处理
db_f = datas[1]
tb_f = datas[2]
db_t = datas[4]
tb_t = datas[5]
fd_f = datas[3]
fd_t = datas[6]
# 待会保存联合起来的库表,是元组
tb_f_true = []
tf_t_true = []
fd_f_true = [(str(d) + "_"+str(t), f) for d, t, f in zip(db_f, tb_f, fd_f)]
fd_t_true = [(str(d) + "_"+str(t), f) for d, t, f in zip(db_t, tb_t, fd_t)]
fd_f_true_cp = copy.deepcopy(fd_f_true)
fd_t_true_cp = copy.deepcopy(fd_t_true)
# 连接在一起处理所有字段,并且去重 字段
fd_f_true.extend(fd_t_true)
fd_f_true = set(fd_f_true)
fd_f_true = list(fd_f_true)
# 准备所有的元组放入列表
for item in zip(db_f, tb_f):
tb_f_true.append(item)
for item in zip(db_t, tb_t):
tf_t_true.append(item)
print(f"原始数据长度:{len(tf_t_true)}")
# print(tb_f_true)
# print(tf_t_true)
tb_f_true_del = []
tf_t_true_del = []
# 去重,是指自己指向自己的边
for i in range(len(tb_f_true)):
if tb_f_true[i] != tf_t_true[i]:
# del tb_f_true[i]
tb_f_true_del.append(tb_f_true[i])
tf_t_true_del.append(tf_t_true[i])
# del tf_t_true[i]
print(f"第一次去重后的长度,删除自身的环:{len(tb_f_true_del)}")
relations = [(f, t) for f, t in zip(tb_f_true_del, tf_t_true_del)]
print(f"未删除多余的情况,边的数量:{len(relations)}")
# 去掉重复的边,两个结点之间多余的
relations_del = set(relations)
relations_del = list(relations_del)
print(f"删除多余的情况,边的数量:{len(relations_del)}")
degree_tb = {}
key_tb = []
# 准备所有的键
for item in relations_del:
f, t = item
db_f_, tb_f_ = f
db_t_, tb_t_ = t
f_str = str(db_f_) + "_" + str(tb_f_)
t_str = str(db_t_) + "_" + str(tb_t_)
key_tb.append(f_str)
key_tb.append(t_str)
key_tb = set(key_tb)
key_tb = list(key_tb)
# 建立字典,保存 每个表的字段
fd_value = {}
for key in key_tb:
if key not in fd_value:
fd_value[key] = []
# 统计 每个表的字段
for key, fd in fd_f_true:
if key in fd_value:
fd_value[key].append(fd)
# print(fd_value)
# 建立字典,保存 每个表的入度
tb_ind = {}
for key in key_tb:
if key not in tb_ind:
tb_ind[key] = 0
for _, t in relations_del:
db_t_, tb_t_ = t
t_str = str(db_t_) + "_" + str(tb_t_)
if t_str in tb_ind:
tb_ind[t_str] += 1
# 建立字典,保存 每个表的出度
tb_outd = {}
for key in key_tb:
if key not in tb_outd:
tb_outd[key] = 0
for f, _ in relations_del:
db_f_, tb_f_ = f
f_str = str(db_f_) + "_" + str(tb_f_)
if f_str in tb_outd:
tb_outd[f_str] += 1
# 建立字典,统筹所有的数据
tb_all_info = {key: [tb_outd[key], tb_ind[key], len(fd_value[key])] for key in key_tb}
# save_tb_dict_to_csv_with_degree("./tb_info.csv", tb_all_info)
# 字典,保存每种类型的表的名称
tb_classify = {"0_0": [], "0_1": [], "0_n": [], "1_0": [], "1_1": [], "1_n": [], "n_0": [], "n_1": [], "n_m": []}
# 字典,保存每种类型表的数量
tb_classify_num = {"0_0": 0, "0_1": 0, "0_n": 0, "1_0": 0, "1_1": 0, "1_n": 0, "n_0": 0, "n_1": 0, "n_m": 0}
for key, v in tb_all_info.items():
outd = v[0]
ind = v[1]
if outd == 0 and ind == 0:
tb_classify["0_0"].append(key)
elif outd == 0 and ind == 1:
tb_classify["0_1"].append(key)
elif outd == 0 and ind > 1:
tb_classify["0_n"].append(key)
elif outd == 1 and ind == 0:
tb_classify["1_0"].append(key)
elif outd == 1 and ind == 1:
tb_classify["1_1"].append(key)
elif outd == 1 and ind > 1:
tb_classify["1_n"].append(key)
elif outd > 1 and ind == 0:
tb_classify["n_0"].append(key)
elif outd > 1 and ind == 1:
tb_classify["n_1"].append(key)
elif outd > 1 and ind > 1:
tb_classify["n_m"].append(key)
for key, v in tb_classify.items():
tb_classify_num[key] = len(tb_classify[key])
print(f"分类情况:{tb_classify_num}")
# 搜索1_1类型的表,所有字段是否可能是全部复制于上一个表,也就是两个表的字段数量完全相等可能为全部复制,不完全相等为部分复制
key_search = "1_1"
tb_1_1 = tb_classify[key_search]
for tb in tb_1_1:
search = tb
from_tb = []
from_tb_fd_num = []
to_tb = []
to_tb_fd_num = []
print(f"正在查找表: {search},类型为: {key_search}")
for item in relations_del:
f, t = item
db_f_, tb_f_ = f
db_t_, tb_t_ = t
f_str = str(db_f_) + "_" + str(tb_f_)
t_str = str(db_t_) + "_" + str(tb_t_)
# 寻找从search指向的表,也就是to
if f_str == search:
fd_num_t = tb_all_info[t_str][2]
fd_num_search = tb_all_info[search][2]
if fd_num_t == fd_num_search:
to_tb.append(t_str)
to_tb_fd_num.append(fd_num_t)
# 寻找指向search的表,也就是from
if t_str == search:
fd_num_f = tb_all_info[f_str][2]
fd_num_search = tb_all_info[search][2]
if fd_num_f == fd_num_search:
from_tb.append(f_str)
from_tb_fd_num.append(fd_num_f)
for index in range(len(from_tb)):
print(f"表关系:from table: {from_tb[index]} ->> current table: {search} ->> to table: {to_tb[index]}")
print(f"字段数:from table:: {from_tb_fd_num[index]} ->> current table: {fd_num_search} ->> to table: {to_tb_fd_num[index]}")
print("--------------------------------------------------------------")
# if t_str == search:
# fd_num_f = tb_all_info[f_str][2]
# fd_num_search = tb_all_info[search][2]
# if fd_num_f == fd_num_search:
# print("----------------------------------------------------")
# print(f"可能是全复制 from table: {f_str}, to table: {search}, fd_num from: {fd_num_f}, fd_num to: {fd_num_search}")
# print("----------------------------------------------------")
# else:
# print("****************************************************")
# print(f"可能是部分复制 from table: {f_str}, to table: {search}, fd_num from: {fd_num_f}, fd_num to: {fd_num_search}")
# print("****************************************************")
# for f, t in relations_del:
# tb_t, fd_t = t
# tb_f, fd_f = f
# if tb_t == search:
# fd_num_f = tb_all_info[tb_f][2]
# fd_num_search = tb_all_info[search][2]
# print(f"from: {tb_f}, to: {fd_num_search}, fd_num from: {fd_num_f}, fd_num search: {fd_num_search}")
# break
# 建立字典,用来保存 每个表的入度,出度,字段数
# for key in key_tb:
# if key not in degree_tb:
# # 值分别代表:出度,入度,字段数
# degree_tb[key] = [0, 0, 0]
# print(len(key_tb))
# 使用建立的字典,计算 每个表的入度,出度,
# for item in relations_del:
# f, t = item
# db_f_, tb_f_ = f
# db_t_, tb_t_ = t
# f_str = str(db_f_) + "_" + str(tb_f_)
# t_str = str(db_t_) + "_" + str(tb_t_)
# if f_str in degree_tb:
# degree_tb[f_str][0] += 1
# if t_str in degree_tb:
# degree_tb[f_str][1] += 1
# print(degree_tb)
# for key, v in degree_tb.items():
# uni_fd = set(fd_value[key])
# degree_tb[key][2] = len(uni_fd)
# print(f)
# print(t)
# print(degree_tb)
# sta_tb_num_in_db(path)
# sta_fd_num_in_tb(path)
python学习数据处理
最新推荐文章于 2024-09-17 23:15:58 发布