import csv
import networkx as nx
import matplotlib.pyplot as plt
from openpyxl import Workbook
import numpy as np
def read_csv_columns(file_path, column1_index, column2_index):
# 创建两个空列表,用于存储两列数据
column1_data = []
column2_data = []
# 打开 CSV 文件进行读取
with open(file_path, 'r', newline='') as csvfile:
# 创建 CSV 读取器
csv_reader = csv.reader(csvfile)
next(csv_reader)
# 读取 CSV 文件中的每一行
for row in csv_reader:
# 读取指定索引的两列数据并存储到对应的列表中
column1_data.append(row[column1_index])
column2_data.append(row[column2_index])
return column1_data, column2_data
def split_strings_to_tuples(string_list):
# 使用列表推导式,将每个字符串按点分割成元组
tuple_list = [tuple(s.split('.')) for s in string_list]
return tuple_list
def get_from_to(csv_file_path):
# 指定要读取的两列索引(假设这两列分别是第一列和第三列,索引从0开始)
column1_index = 3
column2_index = 4
# 读取 CSV 文件的两列数据
from_table, to_table = read_csv_columns(csv_file_path, column1_index, column2_index)
# print(len(from_table))
# print(len(to_table))
# 数据分裂为tuple数据
from_tuple_list = split_strings_to_tuples(from_table)
to_tuple_list = split_strings_to_tuples(to_table)
db_from = []
tb_from = []
from_tuple_list_err = []
for item in from_tuple_list:
if len(item) >= 2:
db_from.append(item[0])
tb_from.append(item[1])
else:
print(" from Tuple length is not enough:", item)
from_tuple_list_err.append(item)
# 元组数量错误,用相同的填充
db_from.append(item[0])
tb_from.append(item[0])
db_to = []
tb_to = []
to_tuple_list_err = []
for item in to_tuple_list:
if len(item) >= 2:
db_to.append(item[0])
tb_to.append(item[1])
else:
print("to Tuple length is not enough:", item)
to_tuple_list_err.append(item)
db_to.append(item[0])
tb_to.append(item[0])
return from_tuple_list, to_tuple_list, db_from, tb_from, db_to, tb_to
def make_2d_dir_graph(start_nodes, end_nodes):
# 创建一个有向图对象
G = nx.DiGraph()
datassss = zip(start_nodes, end_nodes)
print(type(datassss))
# 添加边到图中
for start, end in zip(start_nodes, end_nodes):
G.add_edge(start, end)
# 绘制图形
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_color='k', linewidths=1, arrows=True)
# 显示图形
plt.show()
def make_3d_dir_graph(start_nodes, end_nodes):
# 创建一个3D图形
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# 创建一个有向图对象
G = nx.DiGraph()
# 添加边到图中
for start, end in zip(start_nodes, end_nodes):
G.add_edge(start, end)
# 使用spring_layout函数确定节点的布局
pos = nx.spring_layout(G, dim=3)
# 绘制节点
for node, (x, y, z) in pos.items():
ax.scatter(x, y, z, color='skyblue')
ax.text(x, y, z, str(node), fontsize=12)
# 绘制边
for start, end in G.edges():
x = [pos[start][0], pos[end][0]]
y = [pos[start][1], pos[end][1]]
z = [pos[start][2], pos[end][2]]
ax.plot(x, y, z, color='k')
# 显示图形
plt.show()
def encode_str_list_2_num(strings):
# 创建一个空字典,用于存储字符串到数字的映射关系
string_to_number = {}
# 计数器,用于连续编号
counter = 0
# 遍历字符串列表,确保连续编号
for string in strings:
if string not in string_to_number:
string_to_number[string] = counter
counter += 1
# 输出映射关系
# print("字符串到数字的映射关系:", string_to_number)
# 如果需要将原始列表替换为数字列表,可以使用列表推导式
numeric_list = [string_to_number[string] for string in strings]
return numeric_list
def make_dir_use_str(strings):
# 创建一个空字典,用于存储字符串到数字的映射关系
string_to_number = {}
# 计数器,用于连续编号
counter = 0
# 遍历字符串列表,确保连续编号
for string in strings:
if string not in string_to_number:
string_to_number[string] = counter
counter += 1
return string_to_number
def encode_str_use_dir(string_list, string_to_number):
# 将字符串列表中的字符串替换为对应的数字
number_list = [string_to_number[string] for string in string_list]
return number_list
def adjacency_matrix(start_points, end_points):
# Create a dictionary to store adjacency matrix
adjacency_dict = {}
# Iterate through the edges and populate the adjacency dictionary
for start, end in zip(start_points, end_points):
if start in adjacency_dict:
adjacency_dict[start].append(end)
else:
adjacency_dict[start] = [end]
# Find the maximum node index
max_node = max(max(start_points), max(end_points))
# Initialize the adjacency matrix with zeros
matrix = [[0] * (max_node + 1) for _ in range(max_node + 1)]
# Fill in the adjacency matrix using the dictionary
for start, ends in adjacency_dict.items():
for end in ends:
matrix[start][end] = 1
return matrix
def build_adjacency_matrix(start_points, end_points, direc):
num_nodes = len(set(direc))
edges = list(zip(start_points, end_points))
adjacency_matrix = [[0] * num_nodes for _ in range(num_nodes)]
for start, end in edges:
adjacency_matrix[start][end] += 1
# print(f"start:{start},end:{end}")
return adjacency_matrix
def save_adjacency_matrix_to_excel(adjacency_matrix, filename):
wb = Workbook()
ws = wb.active
for row_index, row in enumerate(adjacency_matrix):
for col_index, value in enumerate(row):
ws.cell(row=row_index+1, column=col_index+1, value=value)
wb.save(filename)
print(f"邻接矩阵已保存到 {filename}")
def plot_adjacency_matrix(adjacency_matrix, filename):
plt.imshow(adjacency_matrix, cmap='viridis', interpolation='nearest')
# 添加横纵坐标
plt.xticks(np.arange(len(adjacency_matrix)), np.arange(len(adjacency_matrix)))
plt.yticks(np.arange(len(adjacency_matrix)), np.arange(len(adjacency_matrix)))
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.colorbar()
plt.savefig(filename)
print(f"图形已保存到 {filename}")
def save_dict_to_excel(data_dict, filename):
wb = Workbook()
ws = wb.active
# 写入字典的键和值到Excel表格中
for row_index, (key, value) in enumerate(data_dict.items(), start=1):
ws.cell(row=row_index, column=1, value=key)
ws.cell(row=row_index, column=2, value=value)
wb.save(filename)
print(f"字典已保存到 {filename}")
def find_nodes_with_indegree_but_no_outdegree(matrix):
nodes = []
for j in range(len(matrix[0])):
out_degree = sum(row[j] for row in matrix)
if out_degree == 0:
in_degree = sum(matrix[i][j] for i in range(len(matrix)))
if in_degree > 0:
nodes.append(j)
return nodes
def find_nodes_with_outdegree_but_no_indegree(matrix):
nodes = []
for i in range(len(matrix)):
in_degree = sum(matrix[i])
if in_degree == 0:
out_degree = sum(matrix[i][j] for j in range(len(matrix[0])))
if out_degree > 0:
nodes.append(i)
return nodes
# 获取所有数据
csv_file_path = './data/表血缘-带任务.csv'
from_tuple_list, to_tuple_list, db_from, tb_from, db_to, tb_to = get_from_to(csv_file_path)
# 制作数据库字典
concat_db_list_dir = db_from + db_to
dir_all_db = make_dir_use_str(concat_db_list_dir)
# 制作表字典
concat_tb_list_dir = tb_from + tb_to
dir_all_tb = make_dir_use_str(concat_tb_list_dir)
# 数据库编号
db_from_ = encode_str_use_dir(db_from, dir_all_db)
db_to_ = encode_str_use_dir(db_to, dir_all_db)
# 表编号
tb_from_ = encode_str_use_dir(tb_from, dir_all_tb)
tb_to_ = encode_str_use_dir(tb_to, dir_all_tb)
print(len(dir_all_db))
print(len(set(db_from_)))
print(len(set(db_to_)))
print(len(dir_all_tb))
print(len(set(tb_from_)))
print(len(set(tb_to_)))
# print(f"len dir tb from {len(set(tb_from_))} ,tb to {len(set(tb_to_))}")
# 数据库关系邻接矩阵
db_adj_matrix = build_adjacency_matrix(db_from_, db_to_, dir_all_db)
# 表关系邻接矩阵
tb_adj_matrix = build_adjacency_matrix(tb_from_, tb_to_, dir_all_tb)
find_final = find_nodes_with_indegree_but_no_outdegree(db_adj_matrix)
find_start = find_nodes_with_outdegree_but_no_indegree(db_adj_matrix)
print("db有入度但没有出度的点:", find_final)
print("db没有入度但有出度的点:", find_start)
# # 数据库字典文件保存
# db_filename_dict = "db_data_dict.xlsx"
# save_dict_to_excel(dir_all_db, db_filename_dict)
# # 表字典文件保存
# tb_filename_dict = "tb_data_dict.xlsx"
# save_dict_to_excel(dir_all_tb, tb_filename_dict)
#
# # 数据库邻接矩阵文件保存
# db_filename_adj = "db_adjacency_matrix.xlsx" # 保存的文件名
# db_filename_adj_plt = "db_adjacency_plt.png" # 保存的文件名
# save_adjacency_matrix_to_excel(db_adj_matrix, db_filename_adj)
# # plot_adjacency_matrix(db_adj_matrix, db_filename_adj_plt)
#
# # 表邻接矩阵文件保存
# tb_filename_adj = "tb_adjacency_matrix.xlsx" # 保存的文件名
# tb_filename_adj_plt = "tb_adjacency_plt.png" # 保存的文件名
# save_adjacency_matrix_to_excel(tb_adj_matrix, tb_filename_adj)
# # plot_adjacency_matrix(tb_adj_matrix, tb_filename_adj_plt)
data process
于 2024-03-21 17:23:57 首次发布