data process

import csv
import networkx as nx
import matplotlib.pyplot as plt
from openpyxl import Workbook
import numpy as np
def read_csv_columns(file_path, column1_index, column2_index):
    # 创建两个空列表,用于存储两列数据
    column1_data = []
    column2_data = []

    # 打开 CSV 文件进行读取
    with open(file_path, 'r', newline='') as csvfile:
        # 创建 CSV 读取器
        csv_reader = csv.reader(csvfile)
        next(csv_reader)
        # 读取 CSV 文件中的每一行
        for row in csv_reader:
            # 读取指定索引的两列数据并存储到对应的列表中
            column1_data.append(row[column1_index])
            column2_data.append(row[column2_index])

    return column1_data, column2_data


def split_strings_to_tuples(string_list):
    # 使用列表推导式,将每个字符串按点分割成元组
    tuple_list = [tuple(s.split('.')) for s in string_list]
    return tuple_list


def get_from_to(csv_file_path):
    # 指定要读取的两列索引(假设这两列分别是第一列和第三列,索引从0开始)
    column1_index = 3
    column2_index = 4

    # 读取 CSV 文件的两列数据
    from_table, to_table = read_csv_columns(csv_file_path, column1_index, column2_index)
    # print(len(from_table))
    # print(len(to_table))

    # 数据分裂为tuple数据
    from_tuple_list = split_strings_to_tuples(from_table)
    to_tuple_list = split_strings_to_tuples(to_table)

    db_from = []
    tb_from = []
    from_tuple_list_err = []
    for item in from_tuple_list:
        if len(item) >= 2:
            db_from.append(item[0])
            tb_from.append(item[1])
        else:
            print(" from Tuple length is not enough:", item)
            from_tuple_list_err.append(item)
            # 元组数量错误,用相同的填充
            db_from.append(item[0])
            tb_from.append(item[0])
    db_to = []
    tb_to = []
    to_tuple_list_err = []
    for item in to_tuple_list:
        if len(item) >= 2:
            db_to.append(item[0])
            tb_to.append(item[1])
        else:
            print("to Tuple length is not enough:", item)
            to_tuple_list_err.append(item)
            db_to.append(item[0])
            tb_to.append(item[0])
    return from_tuple_list, to_tuple_list, db_from, tb_from, db_to, tb_to


def make_2d_dir_graph(start_nodes, end_nodes):
    # 创建一个有向图对象
    G = nx.DiGraph()
    datassss = zip(start_nodes, end_nodes)
    print(type(datassss))
    # 添加边到图中
    for start, end in zip(start_nodes, end_nodes):
        G.add_edge(start, end)
    # 绘制图形
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_color='k', linewidths=1, arrows=True)
    # 显示图形
    plt.show()


def make_3d_dir_graph(start_nodes, end_nodes):
    # 创建一个3D图形
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # 创建一个有向图对象
    G = nx.DiGraph()

    # 添加边到图中
    for start, end in zip(start_nodes, end_nodes):
        G.add_edge(start, end)

    # 使用spring_layout函数确定节点的布局
    pos = nx.spring_layout(G, dim=3)

    # 绘制节点
    for node, (x, y, z) in pos.items():
        ax.scatter(x, y, z, color='skyblue')
        ax.text(x, y, z, str(node), fontsize=12)

    # 绘制边
    for start, end in G.edges():
        x = [pos[start][0], pos[end][0]]
        y = [pos[start][1], pos[end][1]]
        z = [pos[start][2], pos[end][2]]
        ax.plot(x, y, z, color='k')

    # 显示图形
    plt.show()


def encode_str_list_2_num(strings):
    # 创建一个空字典,用于存储字符串到数字的映射关系
    string_to_number = {}
    # 计数器,用于连续编号
    counter = 0
    # 遍历字符串列表,确保连续编号
    for string in strings:
        if string not in string_to_number:
            string_to_number[string] = counter
            counter += 1
    # 输出映射关系
    # print("字符串到数字的映射关系:", string_to_number)
    # 如果需要将原始列表替换为数字列表,可以使用列表推导式
    numeric_list = [string_to_number[string] for string in strings]
    return numeric_list


def make_dir_use_str(strings):
    # 创建一个空字典,用于存储字符串到数字的映射关系
    string_to_number = {}
    # 计数器,用于连续编号
    counter = 0
    # 遍历字符串列表,确保连续编号
    for string in strings:
        if string not in string_to_number:
            string_to_number[string] = counter
            counter += 1
    return string_to_number


def encode_str_use_dir(string_list, string_to_number):
    # 将字符串列表中的字符串替换为对应的数字
    number_list = [string_to_number[string] for string in string_list]
    return number_list


def adjacency_matrix(start_points, end_points):
    # Create a dictionary to store adjacency matrix
    adjacency_dict = {}
    # Iterate through the edges and populate the adjacency dictionary
    for start, end in zip(start_points, end_points):
        if start in adjacency_dict:
            adjacency_dict[start].append(end)
        else:
            adjacency_dict[start] = [end]
    # Find the maximum node index
    max_node = max(max(start_points), max(end_points))
    # Initialize the adjacency matrix with zeros
    matrix = [[0] * (max_node + 1) for _ in range(max_node + 1)]
    # Fill in the adjacency matrix using the dictionary
    for start, ends in adjacency_dict.items():
        for end in ends:
            matrix[start][end] = 1
    return matrix


def build_adjacency_matrix(start_points, end_points, direc):
    num_nodes = len(set(direc))
    edges = list(zip(start_points, end_points))
    adjacency_matrix = [[0] * num_nodes for _ in range(num_nodes)]
    for start, end in edges:
        adjacency_matrix[start][end] += 1
        # print(f"start:{start},end:{end}")
    return adjacency_matrix


def save_adjacency_matrix_to_excel(adjacency_matrix, filename):
    wb = Workbook()
    ws = wb.active
    for row_index, row in enumerate(adjacency_matrix):
        for col_index, value in enumerate(row):
            ws.cell(row=row_index+1, column=col_index+1, value=value)
    wb.save(filename)
    print(f"邻接矩阵已保存到 {filename}")


def plot_adjacency_matrix(adjacency_matrix, filename):
    plt.imshow(adjacency_matrix, cmap='viridis', interpolation='nearest')
    # 添加横纵坐标
    plt.xticks(np.arange(len(adjacency_matrix)), np.arange(len(adjacency_matrix)))
    plt.yticks(np.arange(len(adjacency_matrix)), np.arange(len(adjacency_matrix)))
    plt.xlabel('X Label')
    plt.ylabel('Y Label')
    plt.colorbar()
    plt.savefig(filename)
    print(f"图形已保存到 {filename}")


def save_dict_to_excel(data_dict, filename):
    wb = Workbook()
    ws = wb.active
    # 写入字典的键和值到Excel表格中
    for row_index, (key, value) in enumerate(data_dict.items(), start=1):
        ws.cell(row=row_index, column=1, value=key)
        ws.cell(row=row_index, column=2, value=value)
    wb.save(filename)
    print(f"字典已保存到 {filename}")


def find_nodes_with_indegree_but_no_outdegree(matrix):
    nodes = []
    for j in range(len(matrix[0])):
        out_degree = sum(row[j] for row in matrix)
        if out_degree == 0:
            in_degree = sum(matrix[i][j] for i in range(len(matrix)))
            if in_degree > 0:
                nodes.append(j)
    return nodes


def find_nodes_with_outdegree_but_no_indegree(matrix):
    nodes = []
    for i in range(len(matrix)):
        in_degree = sum(matrix[i])
        if in_degree == 0:
            out_degree = sum(matrix[i][j] for j in range(len(matrix[0])))
            if out_degree > 0:
                nodes.append(i)
    return nodes


# 获取所有数据
csv_file_path = './data/表血缘-带任务.csv'
from_tuple_list, to_tuple_list, db_from, tb_from, db_to, tb_to = get_from_to(csv_file_path)

# 制作数据库字典
concat_db_list_dir = db_from + db_to
dir_all_db = make_dir_use_str(concat_db_list_dir)
# 制作表字典
concat_tb_list_dir = tb_from + tb_to
dir_all_tb = make_dir_use_str(concat_tb_list_dir)

# 数据库编号
db_from_ = encode_str_use_dir(db_from, dir_all_db)
db_to_ = encode_str_use_dir(db_to, dir_all_db)
# 表编号
tb_from_ = encode_str_use_dir(tb_from, dir_all_tb)
tb_to_ = encode_str_use_dir(tb_to, dir_all_tb)
print(len(dir_all_db))
print(len(set(db_from_)))
print(len(set(db_to_)))

print(len(dir_all_tb))
print(len(set(tb_from_)))
print(len(set(tb_to_)))
# print(f"len dir tb from {len(set(tb_from_))} ,tb to {len(set(tb_to_))}")
# 数据库关系邻接矩阵
db_adj_matrix = build_adjacency_matrix(db_from_, db_to_, dir_all_db)


# 表关系邻接矩阵
tb_adj_matrix = build_adjacency_matrix(tb_from_, tb_to_, dir_all_tb)

find_final = find_nodes_with_indegree_but_no_outdegree(db_adj_matrix)
find_start = find_nodes_with_outdegree_but_no_indegree(db_adj_matrix)
print("db有入度但没有出度的点:", find_final)
print("db没有入度但有出度的点:", find_start)
# # 数据库字典文件保存
# db_filename_dict = "db_data_dict.xlsx"
# save_dict_to_excel(dir_all_db, db_filename_dict)
# # 表字典文件保存
# tb_filename_dict = "tb_data_dict.xlsx"
# save_dict_to_excel(dir_all_tb, tb_filename_dict)
#
# # 数据库邻接矩阵文件保存
# db_filename_adj = "db_adjacency_matrix.xlsx"  # 保存的文件名
# db_filename_adj_plt = "db_adjacency_plt.png"  # 保存的文件名
# save_adjacency_matrix_to_excel(db_adj_matrix, db_filename_adj)
# # plot_adjacency_matrix(db_adj_matrix, db_filename_adj_plt)
#
# # 表邻接矩阵文件保存
# tb_filename_adj = "tb_adjacency_matrix.xlsx"  # 保存的文件名
# tb_filename_adj_plt = "tb_adjacency_plt.png"  # 保存的文件名
# save_adjacency_matrix_to_excel(tb_adj_matrix, tb_filename_adj)
# # plot_adjacency_matrix(tb_adj_matrix, tb_filename_adj_plt)

  • 3
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值