code4job

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import deque


def read_data(path):
    # Read the CSV file
    df = pd.read_csv(path)
    # Store each column as a list
    column_data = []
    for column in df.columns:
        column_data.append(df[column].tolist())
    return column_data


def get_fd(path):
    data = read_data(path)
    fd_from = list(zip(data[1], data[2], data[3]))
    fd_to = list(zip(data[4], data[5], data[6]))
    fd_from_uni = set(fd_from)
    fd_to_uni = set(fd_to)
    return fd_from, fd_to, fd_from_uni, fd_to_uni


def get_tb(path):
    data = read_data(path)
    td_from = list(zip(data[1], data[2]))
    td_to = list(zip(data[4], data[5]))
    td_from_uni = set(td_from)
    td_to_uni = set(td_to)
    tasks = data[0]
    return td_from, td_to, td_from_uni, td_to_uni, tasks


def get_db(path):
    data_all = read_data(path)
    db_from, db_to = data_all[1], data_all[4]
    tasks = data_all[0]
    return db_from, db_to, tasks


def cons_db_dict_graph(path):
    graph = {}
    db_from, db_to, tasks = get_db(path)
    db_all = db_from + db_to
    for db in db_all:
        if db not in graph:
            graph[db] = []
    for db_f, db_t in zip(db_from, db_to):
        have_nodes = graph[db_f]
        # Skip duplicate edges and self-loops
        if db_t not in have_nodes and db_t != db_f:
            graph[db_f].append(db_t)
    return graph


def save_dict_graph(graph, filename):
    # One line per node: "node:neighbor.neighbor...", parsed back by load_graph below
    with open(filename, 'w') as f:
        for node, neighbors in graph.items():
            neighbors_str = ".".join(map(str, neighbors))
            f.write(f"{node}:{neighbors_str}\n")
        print(f"Graph data saved to {filename}")


def load_graph(filename):
    # Parse the "node:neighbor.neighbor..." format written by save_dict_graph
    graph = {}
    with open(filename, 'r') as f:
        for line in f:
            parts = line.strip().split(':')
            node = int(parts[0])
            neighbors = []
            if len(parts) > 1 and parts[1]:
                neighbors = list(map(int, parts[1].split('.')))
            graph[node] = neighbors
    return graph
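

# A minimal round-trip sketch on a hand-made toy graph (hypothetical node IDs and output
# path, for illustration only): saving and then reloading should reproduce the adjacency
# dict, since save_dict_graph and load_graph agree on the "node:neighbor.neighbor" format.
def _demo_graph_roundtrip():
    toy = {1: [2, 3], 2: [3], 3: []}
    save_dict_graph(toy, './process_ret/toy_graph.txt')  # hypothetical output path
    assert load_graph('./process_ret/toy_graph.txt') == toy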


def dfs(graph, start, path=None, paths=None):
    # Avoid mutable default arguments: they would be shared across separate calls
    if path is None:
        path = []
    if paths is None:
        paths = []
    path = path + [start]
    if start not in graph:
        return paths
    for node in graph[start]:
        if node not in path:
            paths = dfs(graph, node, path, paths)
    paths.append(path)  # record the path from the start node to the current node
    return paths


def bfs(graph, start, end):
    queue = deque([(start, [start])])  # initialize the queue with the start node and its path
    while queue:
        current, path = queue.popleft()  # pop the current node and the path leading to it
        if current == end:  # reached the end node: return the path
            return path
        for neighbor in graph.get(current, []):  # visit the current node's neighbours
            if neighbor not in path:  # skip nodes already on the path to avoid cycles
                queue.append((neighbor, path + [neighbor]))  # enqueue the neighbour with the extended path
    return None  # no path found
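

# A minimal usage sketch of the two search helpers above on a hand-made adjacency dict
# (hypothetical node IDs, illustrative only): dfs enumerates every path reachable from
# the start node, bfs returns one shortest path between two nodes.
def _demo_dfs_bfs():
    toy = {1: [2, 3], 2: [4], 3: [4], 4: []}
    print(dfs(toy, 1))     # e.g. [1, 2, 4], [1, 2], [1, 3, 4], [1, 3], [1]
    print(bfs(toy, 1, 4))  # e.g. [1, 2, 4]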


def cons_graph_deduplicate_edges(start, end, tasks):
    deduplicate_edges = {}
    start_uni = []
    end_uni = []
    for edge in zip(start, end):
        if edge not in deduplicate_edges:
            deduplicate_edges[edge] = 1
            start_uni.append(edge[0])
            end_uni.append(edge[1])
        else:
            deduplicate_edges[edge] += 1
    print(f"edges before dedup : {len(start)}")
    print(f"edges after dedup  : {len(deduplicate_edges)}")
    edges = pd.DataFrame()
    edges["start"] = start_uni
    edges["end"] = end_uni
    # from_pandas_edgelist builds an undirected nx.Graph by default
    G = nx.from_pandas_edgelist(edges, source="start", target="end")
    return G
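

# The db-to-db relations are directional (from -> to), but from_pandas_edgelist builds an
# undirected graph by default. If direction matters for later analysis, a directed graph
# can be built the same way; a minimal sketch (cons_directed_graph is a hypothetical helper,
# not part of the original pipeline):
def cons_directed_graph(start_uni, end_uni):
    edges = pd.DataFrame({"start": start_uni, "end": end_uni})
    # create_using=nx.DiGraph keeps the from -> to orientation of every edge
    return nx.from_pandas_edgelist(edges, source="start", target="end", create_using=nx.DiGraph)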


def print_all_paths(G, start):
    # Enumerate all simple paths from start to every other node (nx.all_simple_paths)
    for end in G.nodes():
        if start == end:
            continue
        paths = list(nx.all_simple_paths(G, start, end))
        if paths:
            print(f"Paths from node {start} to node {end}:")
            for path in paths:
                print(path)


def db_graph_info():
    path = './process_ret/re_code_all.csv'
    db_from, db_to, tasks = get_db(path)
    G = cons_graph_deduplicate_edges(db_from, db_to, tasks)
    start_node = 1
    print_all_paths(G, start_node)

    # print(nx.degree(G))
    # print(len(list(nx.connected_components(G))))
    # print(list(nx.connected_components(G))[0])
    # print(list(nx.connected_components(G))[1])
    # print(list(nx.connected_components(G))[2])
    # nx.draw(G, with_labels=True, edge_color='b', node_color='g', node_size=1000)
    # plt.show()


def db_layer(fr, to, path='./process_ret/tb_nums_in_db_degree.csv'):
    data = read_data(path)
    db_code, out_d, in_d = data[0], data[2], data[3]

    db_degree = {}
    for info in zip(db_code, out_d, in_d):
        db, od, id = info
        db_degree[db] = [od, id]

    # print(db_degree)
    all_db = fr + to
    all_db = list(set(all_db))
    # print(len(all_db))
    layer = []
    layer_cur = []
    index = 0

    # Handle the first layer separately
    for db in list(all_db):  # iterate over a copy so that removing from all_db is safe
        # nodes with in-degree 0 form the first layer
        if db_degree[db][1] == 0:
            layer_cur.append(db)
            all_db.remove(db)
    layer.append(list(layer_cur))  # append a copy, since layer_cur is cleared below
    layer_cur.clear()

    layer_last = []
    while len(all_db) > 0:
        print(f"before process len of all_db : {len(all_db)}")
        # Handle the middle layers: a node is assigned to the current layer when its
        # in-degree and out-degree are both non-zero and at least one of its incoming
        # edges comes from the previous layer
        for db in list(all_db):  # iterate over a copy so that removing from all_db is safe
            if db_degree[db][0] != 0 and db_degree[db][1] != 0:  # out-degree and in-degree both non-zero
                for edge in zip(fr, to):
                    f, t = edge
                    if t == db and f in layer[index]:  # the incoming edge comes from the previous layer
                        layer_cur.append(db)
                        all_db.remove(db)
                        break
            if db_degree[db][0] == 0 and db_degree[db][1] != 0:  # out-degree 0: belongs to the last layer
                layer_last.append(db)
                all_db.remove(db)
        print(f"after process len of all_db : {len(all_db)}")
        if not layer_cur:  # no node was assigned this round (e.g. a cycle): stop to avoid looping forever
            break
        layer.append(list(layer_cur))
        layer_cur.clear()
        index += 1

    layer.append(layer_last)
    print(len(layer))
    return layer
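

# A cross-check sketch for the acyclic case using networkx itself (layer_via_networkx is a
# hypothetical helper, assumes networkx >= 2.8 for topological_generations and that the
# fr/to edges contain no cycles); each generation holds the nodes whose incoming edges all
# come from earlier generations. Illustrative only, not the original layering.
def layer_via_networkx(fr, to):
    DG = nx.DiGraph()
    DG.add_edges_from(zip(fr, to))
    return [list(gen) for gen in nx.topological_generations(DG)]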


def cons_nx_graph(start_fd, end_fd, weights):
    edges = pd.DataFrame()
    edges["start"] = start_fd
    edges["end"] = end_fd
    edges["weights"] = weights
    G = nx.from_pandas_edgelist(edges, source="start", target="end", edge_attr="weights")
    return G


# The earlier in-degree/out-degree file cannot be used here, because it was built without
# removing duplicate edges or self-loop degrees
def db_layer_handle_circle(fr, to, path='./process_ret/tb_nums_in_db_degree.csv'):
    layers = []
    layer_first = []
    layer_last = []
    all_db = list(set(fr + to))
    print(f"all uni fr len:{len(fr)}, to len: {len(to)}")
    print(f"all uni db len:{len(all_db)}")
    # data = read_data(path)
    # db_code, out_d, in_d = data[0], data[2], data[3]
    # print(f"all uni db_code len:{len(db_code)}")
    # Rebuild the out-degree/in-degree data, excluding duplicate edges and self-loops
    # Reduce the data volume by dropping duplicate from - to pairs
    relations = [(f, t) for f, t in zip(fr, to)]
    print(f"len of relation : {len(relations)}")
    print(f"ele relation : {relations[0]}")
    uni_relation = list(set(relations))  # drop edges that appear more than once
    uni_relation = [(f, t) for f, t in uni_relation if f != t]  # drop self-loops
    print(f"uni relations len : {len(uni_relation)}")
    deduplicate_f = [f for f, _ in uni_relation]
    deduplicate_t = [t for _, t in uni_relation]
    dict_of_degree = {db: [deduplicate_f.count(db), deduplicate_t.count(db)] for db in all_db}
    print(f"len of deduplicate_f and t : {len(deduplicate_f)} , {len(deduplicate_t)}")
    print(f"deduplicate_f {deduplicate_f}")
    print(f"deduplicate_t {deduplicate_t}")
    print(f"dict degree : {dict_of_degree}")
    print(f"len of dict degree : {len(dict_of_degree)}")
    db_code, out_d, in_d = [], [], []
    for k, v in dict_of_degree.items():
        db_code.append(k)
        out_d.append(v[0])
        in_d.append(v[1])
    print(f"db code : {db_code}")
    print(f"db out_d : {out_d}")
    print(f"db in_d : {in_d}")



    # # Check connectivity
    # G = cons_nx_graph(deduplicate_f, deduplicate_t, deduplicate_f)
    # print(f"lian tong",len(list(nx.connected_components(G))))

    print(f"len of rest db all : {len(all_db)}")
    # Nodes with in-degree 0 (and isolated nodes) form the first layer
    for info in zip(db_code, out_d, in_d):
        db, od, id = info
        if id == 0 and od != 0:
            layer_first.append(db)
            all_db.remove(db)
        if od == 0 and id == 0:
            layer_first.append(db)
            all_db.remove(db)
    print(f"len of first layer : {len(layer_first)}")
    # Nodes with out-degree 0 form the last layer
    for info in zip(db_code, out_d, in_d):
        db, od, id = info
        if od == 0 and id != 0 :
            layer_last.append(db)
            all_db.remove(db)
    # print(f"len of first layer: {len(layer_first)}")
    # print(f"ele of first layer: {layer_first}")
    layers.append(layer_first)
    print(f"len of last layer : {len(layer_last)}\n")
    # Sanity check
    # uni_re = {}
    # for re in zip(fr,to):
    #     if re not in uni_re:
    #         uni_re[re] = 1
    # print(f"uni dict relation len : {len(uni_re)}")

    # Find the middle layers
    index = 0
    layer_cur = []
    print(f"len of rest db all : {len(all_db)}")
    while len(all_db) > 0:
        # step 1: collect the nodes that are reached directly from the previous layer,
        # and only move on once all of them have been found
        # for db in all_db:
        #     for re in uni_relation:
        #         f, t = re
        #         if db == t and f in layers[index] and t not in layer_cur:  # an f -> t edge whose source f is in the previous layer and whose target t is the current node
        #             # print(f"from : {f}, to : {t}")
        #             # print(f"process cur db : {db}")
        #             tp = db
        #             layer_cur.append(tp)
        #             all_db.remove(db)

        # for re in uni_relation:
        #     f, t = re
        #     if f in layers[index] and t not in layer_cur and t in all_db:
        #         # print(f"from : {f}, to : {t}")
        #         layer_cur.append(t)
        #         all_db.remove(t)

        for db in layers[index]:
            for re in uni_relation:
                f, t = re
                if db == f and t in all_db:
                    layer_cur.append(t)
                    all_db.remove(t)

        # print(f"cur layer len : {len(layer_cur)} , db : {layer_cur}")
        # step 2: absorb the nodes that point into the current layer; we do not know in
        # advance how many there are or in what order they appear, so repeat until none are left
        flag = 1
        # for re in uni_relation:
        #     f, t = re
        #     if t in layer_cur and f in all_db:  # an edge points into the current layer and its source is still unused, so start searching
        #         flag = 1
        while flag != 0:
            flag = 0
            for re in uni_relation:
                f, t = re
                if t in layer_cur and f in all_db:  # an edge points into the current layer and its source is still unused, keep absorbing
                    flag = 1
                    layer_cur.append(f)
                    all_db.remove(f)
        if not layer_cur:  # no progress this round: the remaining nodes cannot be reached from the layers found so far
            print(f"unreachable db left unassigned : {len(all_db)}")
            break
        layers.append(list(layer_cur))  # append a copy, because layer_cur is cleared below
        print(f"len of cur layer   : {len(layer_cur)}")
        print(f"len of rest db all : {len(all_db)}")
        print("\n")
        index += 1
        layer_cur.clear()
        # print(f"index : {index}")
        # print(f"layer all len: {len(layers)}")

    layers.append(layer_last)
    return layers




path = './process_ret/re_code_all.csv'

data = read_data(path)
fr, to = data[1], data[4]
db_layer_handle_circle(fr, to)

# db_graph_info()

# G = cons_db_dict_graph(path)
# print(G)
# save_dict_graph(G, './process_ret/db_graph.txt')
# start = 1
# end = 238
# print(bfs(G,start,end))
# PATH = dfs(G, start)
# print(PATH)

# g = load_graph('./process_ret/db_graph.txt')
# print(g)
