python学习数据处理

最新推荐文章于 2024-09-17 23:15:58 发布
Charles Chou
最新推荐文章于 2024-09-17 23:15:58 发布
阅读量120
点赞数 1
文章标签： python 学习前端
本文链接：https://blog.csdn.net/weixin_43249548/article/details/140038017
版权
import pandas as pd
import csv
import copy
 
# 处理tb
def save_tb_dict_to_csv_with_degree(filename, dictionary):
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['tb_code', 'out degre', 'in degre', 'fd nums'])
        for key, value in dictionary.items():
            data = []
            data.append(key)
            data.extend(value)
            writer.writerow(data)
 
 
def read_data(path):
    # 读取CSV文件
    df = pd.read_csv(path)
    # 将每一列数据保存在列表中
    column_data = []
    for column in df.columns:
        column_data.append(df[column].tolist())
    # print(len(column_data))
    # print(len(column_data[0]))
    # print(column_data[0][0])
    # 显示列表中的数据（可选）
    # for col_data in column_data:
    #     print(len(col_data))
    print(len(column_data))
    print(len(column_data[0]))
    return column_data
 
 
path = './re_code_all.csv'
# 取数据
datas = read_data(path)
 
# 进一步处理
db_f = datas[1]
tb_f = datas[2]
 
db_t = datas[4]
tb_t = datas[5]
 
fd_f = datas[3]
fd_t = datas[6]
 
# 待会保存联合起来的库表，是元组
tb_f_true = []
tf_t_true = []
 
fd_f_true = [(str(d) + "_"+str(t), f) for d, t, f in zip(db_f, tb_f, fd_f)]
fd_t_true = [(str(d) + "_"+str(t), f) for d, t, f in zip(db_t, tb_t, fd_t)]
 
fd_f_true_cp = copy.deepcopy(fd_f_true)
fd_t_true_cp = copy.deepcopy(fd_t_true)
 
 
# 连接在一起处理所有字段，并且去重 字段
fd_f_true.extend(fd_t_true)
fd_f_true = set(fd_f_true)
fd_f_true = list(fd_f_true)
 
 
# 准备所有的元组放入列表
for item in zip(db_f, tb_f):
    tb_f_true.append(item)
for item in zip(db_t, tb_t):
    tf_t_true.append(item)
 
print(f"原始数据长度：{len(tf_t_true)}")
# print(tb_f_true)
# print(tf_t_true)
tb_f_true_del = []
tf_t_true_del = []
 
# 去重，是指自己指向自己的边
for i in range(len(tb_f_true)):
    if tb_f_true[i] != tf_t_true[i]:
        # del tb_f_true[i]
        tb_f_true_del.append(tb_f_true[i])
        tf_t_true_del.append(tf_t_true[i])
        # del tf_t_true[i]
print(f"第一次去重后的长度，删除自身的环：{len(tb_f_true_del)}")
 
 
relations = [(f, t) for f, t in zip(tb_f_true_del, tf_t_true_del)]
print(f"未删除多余的情况，边的数量:{len(relations)}")
# 去掉重复的边，两个结点之间多余的
relations_del = set(relations)
relations_del = list(relations_del)
print(f"删除多余的情况，边的数量:{len(relations_del)}")
degree_tb = {}
key_tb = []
 
# 准备所有的键
for item in relations_del:
    f, t = item
    db_f_, tb_f_ = f
    db_t_, tb_t_ = t
    f_str = str(db_f_) + "_" + str(tb_f_)
    t_str = str(db_t_) + "_" + str(tb_t_)
    key_tb.append(f_str)
    key_tb.append(t_str)
key_tb = set(key_tb)
key_tb = list(key_tb)
 
# 建立字典，保存 每个表的字段
fd_value = {}
for key in key_tb:
    if key not in fd_value:
        fd_value[key] = []
# 统计 每个表的字段
for key, fd in fd_f_true:
    if key in fd_value:
        fd_value[key].append(fd)
# print(fd_value)
 
# 建立字典，保存 每个表的入度
tb_ind = {}
for key in key_tb:
    if key not in tb_ind:
        tb_ind[key] = 0
for _, t in relations_del:
    db_t_, tb_t_ = t
    t_str = str(db_t_) + "_" + str(tb_t_)
    if t_str in tb_ind:
        tb_ind[t_str] += 1
 
# 建立字典，保存 每个表的出度
tb_outd = {}
for key in key_tb:
    if key not in tb_outd:
        tb_outd[key] = 0
for f, _ in relations_del:
    db_f_, tb_f_ = f
    f_str = str(db_f_) + "_" + str(tb_f_)
    if f_str in tb_outd:
        tb_outd[f_str] += 1
 
 
# 建立字典，统筹所有的数据
tb_all_info = {key: [tb_outd[key], tb_ind[key], len(fd_value[key])] for key in key_tb}
# save_tb_dict_to_csv_with_degree("./tb_info.csv", tb_all_info)
 
# 字典，保存每种类型的表的名称
tb_classify = {"0_0": [], "0_1": [], "0_n": [], "1_0": [], "1_1": [], "1_n": [], "n_0": [], "n_1": [], "n_m": []}
# 字典，保存每种类型表的数量
tb_classify_num = {"0_0": 0, "0_1": 0, "0_n": 0, "1_0": 0, "1_1": 0, "1_n": 0, "n_0": 0, "n_1": 0, "n_m": 0}
for key, v in tb_all_info.items():
    outd = v[0]
    ind = v[1]
    if outd == 0 and ind == 0:
        tb_classify["0_0"].append(key)
    elif outd == 0 and ind == 1:
        tb_classify["0_1"].append(key)
    elif outd == 0 and ind > 1:
        tb_classify["0_n"].append(key)
    elif outd == 1 and ind == 0:
        tb_classify["1_0"].append(key)
    elif outd == 1 and ind == 1:
        tb_classify["1_1"].append(key)
    elif outd == 1 and ind > 1:
        tb_classify["1_n"].append(key)
    elif outd > 1 and ind == 0:
        tb_classify["n_0"].append(key)
    elif outd > 1 and ind == 1:
        tb_classify["n_1"].append(key)
    elif outd > 1 and ind > 1:
        tb_classify["n_m"].append(key)
 
for key, v in tb_classify.items():
    tb_classify_num[key] = len(tb_classify[key])
print(f"分类情况：{tb_classify_num}")
 
# 搜索1_1类型的表，所有字段是否可能是全部复制于上一个表，也就是两个表的字段数量完全相等可能为全部复制，不完全相等为部分复制
key_search = "1_1"
tb_1_1 = tb_classify[key_search]
for tb in tb_1_1:
    search = tb
    from_tb = []
    from_tb_fd_num = []
    to_tb = []
    to_tb_fd_num = []
    print(f"正在查找表: {search},类型为: {key_search}")
    for item in relations_del:
        f, t = item
        db_f_, tb_f_ = f
        db_t_, tb_t_ = t
        f_str = str(db_f_) + "_" + str(tb_f_)
        t_str = str(db_t_) + "_" + str(tb_t_)
        # 寻找从search指向的表，也就是to
        if f_str == search:
            fd_num_t = tb_all_info[t_str][2]
            fd_num_search = tb_all_info[search][2]
            if fd_num_t == fd_num_search:
                to_tb.append(t_str)
                to_tb_fd_num.append(fd_num_t)
        # 寻找指向search的表，也就是from
        if t_str == search:
            fd_num_f = tb_all_info[f_str][2]
            fd_num_search = tb_all_info[search][2]
            if fd_num_f == fd_num_search:    
                from_tb.append(f_str)
                from_tb_fd_num.append(fd_num_f)
                
    for index in range(len(from_tb)):
        print(f"表关系：from table: {from_tb[index]} ->> current table: {search} ->> to table: {to_tb[index]}")
        print(f"字段数：from table:: {from_tb_fd_num[index]} ->> current table: {fd_num_search} ->> to table: {to_tb_fd_num[index]}")
        print("--------------------------------------------------------------")
        # if t_str == search:
        #     fd_num_f = tb_all_info[f_str][2]
        #     fd_num_search = tb_all_info[search][2]
        #     if fd_num_f == fd_num_search:
        #         print("----------------------------------------------------")
        #         print(f"可能是全复制 from table: {f_str}, to table: {search}, fd_num from: {fd_num_f}, fd_num to: {fd_num_search}")
        #         print("----------------------------------------------------")
        #     else:
        #         print("****************************************************")
        #         print(f"可能是部分复制 from table: {f_str}, to table: {search}, fd_num from: {fd_num_f}, fd_num to: {fd_num_search}")
        #         print("****************************************************")
 
 
 
 
 
# for f, t in relations_del:
#     tb_t, fd_t = t
#     tb_f, fd_f = f
#     if tb_t == search:
#         fd_num_f = tb_all_info[tb_f][2]
#         fd_num_search = tb_all_info[search][2]
#         print(f"from: {tb_f}, to: {fd_num_search}, fd_num from: {fd_num_f}, fd_num search: {fd_num_search}")
#         break
 
 
# 建立字典,用来保存 每个表的入度，出度，字段数
# for key in key_tb:
#     if key not in degree_tb:
#         # 值分别代表：出度，入度，字段数
#         degree_tb[key] = [0, 0, 0]
# print(len(key_tb))
 
# 使用建立的字典,计算 每个表的入度，出度，
# for item in relations_del:
#     f, t = item
#     db_f_, tb_f_ = f
#     db_t_, tb_t_ = t
#     f_str = str(db_f_) + "_" + str(tb_f_)
#     t_str = str(db_t_) + "_" + str(tb_t_)
#     if f_str in degree_tb:
#         degree_tb[f_str][0] += 1
#     if t_str in degree_tb:
#         degree_tb[f_str][1] += 1
# print(degree_tb)
 
# for key, v in degree_tb.items():
#     uni_fd = set(fd_value[key])
#     degree_tb[key][2] = len(uni_fd)
    # print(f)
    # print(t)
 
# print(degree_tb)
# sta_tb_num_in_db(path)
# sta_fd_num_in_tb(path)