data_process11

最新推荐文章于 2024-05-23 14:11:31 发布

Charles Chou

最新推荐文章于 2024-05-23 14:11:31 发布

阅读量169

点赞数 2

文章标签： windows python 开发语言

本文链接：https://blog.csdn.net/weixin_43249548/article/details/137650951

版权

encode

import csv

import pandas as pd


def read_taskid(path, index):
    """Return a single column of a CSV file, selected by position.

    Args:
        path: Location of the CSV file. It is parsed by ``pandas.read_csv``
            with default options, so the first line is used as the header.
        index: Zero-based position of the column to extract.

    Returns:
        A ``pandas.Series`` with every data row of the requested column.
    """
    frame = pd.read_csv(path)
    return frame.iloc[:, index]


def write_lists_to_csv_columns(lists, header, filename):
    """Write several lists side by side as the columns of a CSV file.

    Args:
        lists: Iterable of column sequences; element ``i`` of every sequence
            ends up on data row ``i``.
        header: Sequence of column titles written as the first row.
        filename: Destination path; an existing file is overwritten.
    """
    # zip(*lists) pivots the column-major input into rows. It stops at the
    # shortest column, so longer columns are silently truncated.
    rows = [list(row) for row in zip(*lists)]

    with open(filename, 'w', newline='') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerows([header] + rows)


def encode_str_use_dir(string_list, string_to_number):
    """Translate each string into its numeric code via a lookup table.

    Args:
        string_list: Iterable of strings to encode.
        string_to_number: Mapping from string to code.

    Returns:
        A list of codes in the same order as ``string_list``.

    Raises:
        KeyError: If a string has no entry in ``string_to_number``.
    """
    codes = []
    for text in string_list:
        codes.append(string_to_number[text])
    return codes


def make_dir_use_str(strings):
    """Build a string -> integer dictionary with consecutive codes.

    Codes start at 0 and are handed out in order of first appearance, so
    repeated strings keep the code they received the first time.

    Args:
        strings: Iterable of hashable values (strings in practice).

    Returns:
        A dict mapping every distinct value to its code.
    """
    mapping = {}
    for text in strings:
        # The next free code always equals the number of entries so far.
        mapping.setdefault(text, len(mapping))
    return mapping


# def save_dict_to_csv(dictionary, filename):
#     with open(filename, 'w', newline='') as csvfile:
#         fieldnames = list(dictionary.keys())
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#
#         writer.writeheader()
#         writer.writerow(dictionary)

def save_dict_to_csv(dictionary, filename):
    """Write a dictionary to a CSV file as one ``key,value`` row per entry.

    The file is written as UTF-8 so that non-ASCII keys are stored the same
    way on every platform instead of depending on the locale's default
    encoding. The input the keys are read from is itself opened as UTF-8.

    Args:
        dictionary: Mapping whose items are written in iteration order.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(dictionary.items())


def encode_list(lst):
    """Encode a list as consecutive integers and return the codebook too.

    Codes start at 0 and are assigned in order of first appearance.

    Args:
        lst: Iterable of hashable items.

    Returns:
        A tuple ``(encoded_list, encoding_dict)``: the code of every item in
        input order, and the item -> code dictionary that was built.
    """
    codebook = {}
    codes = []
    for element in lst:
        # setdefault registers unseen items under the next free code and
        # returns the existing code for items seen before.
        codes.append(codebook.setdefault(element, len(codebook)))
    return codes, codebook


def read_field_csv(path, index_of_data):
    """Read one dotted column of a CSV file and split it into its parts.

    The header row is skipped. Each cell of the selected column is split on
    ``'.'``; the first three parts are returned as separate lists.

    Args:
        path: CSV file to read, decoded as UTF-8.
        index_of_data: Zero-based index of the column to split.

    Returns:
        A 5-tuple ``(parts, db, tb, fd, tb_db)`` where ``parts`` holds the
        full split list of every row, ``db``/``tb``/``fd`` hold parts 0, 1
        and 2 of every row, and ``tb_db`` holds the first two parts of every
        row as a list.

    Raises:
        IndexError: If a row lacks the requested column or a cell splits
            into fewer than three parts.
    """
    with open(path, 'r', newline='', encoding='utf-8') as handle:
        records = csv.reader(handle)
        next(records)  # drop the header row
        parts_per_row = [record[index_of_data].split('.') for record in records]

    # Diagnostic marker for cells that only have two dotted parts.
    for parts in parts_per_row:
        if len(parts) == 2:
            print("xxx")

    db, tb, fd, tb_db = [], [], [], []
    for parts in parts_per_row:
        db.append(parts[0])
        tb.append(parts[1])
        fd.append(parts[2])
        tb_db.append(parts[:2])

    return parts_per_row, db, tb, fd, tb_db




# Input CSV. Columns 3 and 4 hold dotted names that read_field_csv splits into
# db / tb / fd parts (presumably the lineage source and target -- confirm).
f_path = '../data/字段血缘-带任务.csv'
column_data_from, db_from, tb_from, fd_from, tb_db_from = read_field_csv(f_path, 3)
column_data_to, db_to, tb_to, fd_to, tb_db_to = read_field_csv(f_path, 4)

# Every name that appears on either side, so both sides share one code space.
db_for_make_dir = db_from + db_to
tb_for_make_dir = tb_from + tb_to
fd_for_make_dir = fd_from + fd_to

# One dictionary per level. The same objects are saved to disk and used to
# encode both sides, so the persisted codebook always matches the codes.
db_dir = make_dir_use_str(db_for_make_dir)
tb_dir = make_dir_use_str(tb_for_make_dir)
fd_dir = make_dir_use_str(fd_for_make_dir)

# Aliases for the names previously bound by a second, identical pass through
# encode_list; kept so the module still exposes them.
db_dict, tb_dict, fd_dict = db_dir, tb_dir, fd_dir

save_dict_to_csv(db_dir, './process_ret/db_dict.csv')
save_dict_to_csv(tb_dir, './process_ret/tb_dict.csv')
save_dict_to_csv(fd_dir, './process_ret/fd_dict.csv')

# Encode the "from" side.
db_from_ecd = encode_str_use_dir(db_from, db_dir)
tb_from_ecd = encode_str_use_dir(tb_from, tb_dir)
fd_from_ecd = encode_str_use_dir(fd_from, fd_dir)

# Encode the "to" side.
db_to_ecd = encode_str_use_dir(db_to, db_dir)
tb_to_ecd = encode_str_use_dir(tb_to, tb_dir)
fd_to_ecd = encode_str_use_dir(fd_to, fd_dir)

# Write the first column of the input next to the six encoded columns.
rewrite_path = './process_ret/re_code_all.csv'
taskid = read_taskid(f_path, 0)
rewrite_all_data = [
    taskid,
    db_from_ecd,
    tb_from_ecd,
    fd_from_ecd,
    db_to_ecd,
    tb_to_ecd,
    fd_to_ecd,
]
header = ['task id', "db_from_ecd", "tb_from_ecd", "fd_from_ecd", "db_to_ecd", "tb_to_ecd", "fd_to_ecd"]
write_lists_to_csv_columns(rewrite_all_data, header, rewrite_path)

# Summary: dictionary sizes, then row counts and distinct-code counts per side.
print(f"db dict: {len(db_dir)}, tb dict: {len(tb_dir)}, fd dict: {len(fd_dir)}")
print(f"FROM : db : {len(db_from_ecd)}, tb : {len(tb_from_ecd)}, fd : {len(fd_from_ecd)}")
print(f"FROM : db uni : {len(set(db_from_ecd))}, tb uni: {len(set(tb_from_ecd))}, fd uni: {len(set(fd_from_ecd))}")
print(f"TO : db : {len(db_to_ecd)}, tb : {len(tb_to_ecd)}, fd : {len(fd_to_ecd)}")
print(f"TO : db uni : {len(set(db_to_ecd))}, tb uni: {len(set(tb_to_ecd))}, fd uni: {len(set(fd_to_ecd))}")


# db_ecd, db_dict = encode_list(db_from + db_to)
# tb_ecd, tb_dict = encode_list(tb_from + tb_to)
# fd_ecd, fd_dict = encode_list(fd_from + fd_to)

# save_dict_to_csv(db_dict, './process_ret/db_dict.csv')
# save_dict_to_csv(tb_dict, './process_ret/tb_dict.csv')
# save_dict_to_csv(fd_dict, './process_ret/fd_dict.csv')

# print(f"all : {len(column_data_from)}, db : {len(db_ecd)}, tb : {len(tb_ecd)}, fd : {len(fd_ecd)}")
# print(f"all : {len(column_data_from)}, db uni : {len(set(db_ecd))}, tb uni: {len(set(tb_ecd))}, fd uni: {len(set(fd_ecd))}")
# print(f"all : {len(column_data_from)}, db dict: {len(db_dict)}, tb dict: {len(tb_dict)}, fd dict: {len(fd_dict)}")

process

import pandas as pd
import csv


# 处理tb
def save_tb_dict_to_csv_with_degree(filename, dictionary):
    """Write per-table statistics to a CSV file with a fixed header.

    Each entry becomes one row: the key followed by the items of its value.

    Args:
        filename: Destination path; an existing file is overwritten.
        dictionary: Mapping from table key to an iterable of statistics.
    """
    # NOTE(review): the second header reads 'tb_num', yet the only visible
    # caller (sta_fd_num_in_tb) passes per-table field counts there --
    # confirm the intended label before relying on it.
    with open(filename, 'w', newline='') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(['tb_code','tb_num', 'out', 'in'])
        csv_writer.writerows([code, *stats] for code, stats in dictionary.items())

# 处理db
def save_dict_to_csv_with_degree(filename, dictionary):
    """Write per-database statistics to a CSV file with a fixed header.

    Each entry becomes one row: the key followed by the items of its value.
    Every row is also printed to stdout as it is written.

    Args:
        filename: Destination path; an existing file is overwritten.
        dictionary: Mapping from database code to an iterable of statistics.
    """
    with open(filename, 'w', newline='') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(['db_code','tb_num', 'out', 'in'])
        for code, stats in dictionary.items():
            row = [code, *stats]
            print(row)
            csv_writer.writerow(row)


def save_dict_to_csv(filename, dictionary):
    """Write a dictionary to a CSV file as one ``key,value`` row per entry.

    Args:
        filename: Destination path; an existing file is overwritten.
        dictionary: Mapping whose items are written in iteration order.
    """
    with open(filename, 'w', newline='') as out:
        # Each (key, value) item is already a two-cell row.
        csv.writer(out).writerows(dictionary.items())


def save_dict(save_name, dict_save):
    """Save a dictionary as a two-row CSV: keys on row one, values on row two.

    A confirmation message with the file path is printed afterwards.

    Args:
        save_name: Destination path; an existing file is overwritten.
        dict_save: Mapping to store.
    """
    with open(save_name, 'w', newline='') as out:
        # Header row (keys) followed by the single data row (values).
        csv.writer(out).writerows([dict_save.keys(), dict_save.values()])

    print("CSV文件已保存为:", save_name)


def read_data(path):
    """Load a CSV file and return its contents column by column.

    Args:
        path: CSV file parsed by ``pandas.read_csv`` with default options.

    Returns:
        A list with one entry per column, in file order; each entry is the
        column's values as a plain Python list.
    """
    frame = pd.read_csv(path)
    return [frame[name].tolist() for name in frame.columns]


def sta_tb_num_in_db(path):
    """Compute per-database statistics and save them to a CSV file.

    For every database code the result holds: the number of distinct
    (db, tb) pairs seen on either side, how often the code occurs in the
    "from" column, and how often it occurs in the "to" column. The table is
    written to ``./process_ret/tb_nums_in_db_degree.csv``; intermediate
    sizes are printed along the way.

    Args:
        path: Encoded CSV read through ``read_data``; columns 1/2 are used
            as the "from" db/tb codes and columns 4/5 as the "to" db/tb
            codes.
    """
    columns = read_data(path)
    db_from, tb_from = columns[1], columns[2]
    db_to, tb_to = columns[4], columns[5]
    print(f"uni db 0 :{db_from.count(0)}")

    db_all = db_from + db_to
    tb_all = tb_from + tb_to

    # Occurrences of each database on the "from" side and on the "to" side.
    db_out = {}
    db_in = {}
    for src_db, dst_db in zip(db_from, db_to):
        db_out[src_db] = db_out.get(src_db, 0) + 1
        db_in[dst_db] = db_in.get(dst_db, 0) + 1
    print(f"len of out : {len(db_out)}")
    print(f"len of in : {len(db_in)}")

    # Distinct (db, tb) pairs, each remembering the database it belongs to.
    uni_tb_db = {}
    for pair in zip(db_all, tb_all):
        uni_tb_db.setdefault(pair, pair[0])
    print(len(uni_tb_db))

    # Number of distinct tables per database.
    tb_in_db = {}
    for owner_db in uni_tb_db.values():
        tb_in_db[owner_db] = tb_in_db.get(owner_db, 0) + 1
    print(len(tb_in_db))
    print(tb_in_db)

    # Databases that never occur on one side get an explicit zero there.
    for db_code in tb_in_db:
        db_out.setdefault(db_code, 0)
        db_in.setdefault(db_code, 0)
    print(f"len of out : {len(db_out)}")
    print(f"len of in : {len(db_in)}")

    merge_dict = {
        db_code: [tb_count, db_out[db_code], db_in[db_code]]
        for db_code, tb_count in tb_in_db.items()
    }
    save_dict_to_csv_with_degree('./process_ret/tb_nums_in_db_degree.csv', merge_dict)


def sta_fd_num_in_tb(path):
    """Compute per-table statistics and save them to a CSV file.

    Tables are identified by the key ``"<db>__<tb>"``. For every table the
    result holds: the number of distinct (db, tb, fd) triples seen on either
    side, how often the table occurs on the "from" side, and how often it
    occurs on the "to" side. The table is written to
    ``./process_ret/fb_nums_in_tb_degree.csv``; intermediate sizes are
    printed along the way.

    Args:
        path: Encoded CSV read through ``read_data``; columns 1-3 are used
            as the "from" db/tb/fd codes and columns 4-6 as the "to" codes.
    """
    columns = read_data(path)
    db_from, tb_from, fd_from = columns[1], columns[2], columns[3]
    db_to, tb_to, fd_to = columns[4], columns[5], columns[6]

    def table_key(db_code, tb_code):
        # A table code alone is not treated as unique; qualify it with the db.
        return "__".join((str(db_code), str(tb_code)))

    # Occurrences of each table on the "from" side.
    tb_out = {}
    for db_code, tb_code in zip(db_from, tb_from):
        name = table_key(db_code, tb_code)
        tb_out[name] = tb_out.get(name, 0) + 1

    # Occurrences of each table on the "to" side.
    tb_in = {}
    for db_code, tb_code in zip(db_to, tb_to):
        name = table_key(db_code, tb_code)
        tb_in[name] = tb_in.get(name, 0) + 1

    db_all = db_from + db_to
    tb_all = tb_from + tb_to
    fd_all = fd_from + fd_to

    print(f"len of out : {len(tb_out)}")
    print(f"len of in : {len(tb_in)}")

    # Distinct (db, tb, fd) triples, each remembering its table key.
    uni_tb_db = {}
    for triple in zip(db_all, tb_all, fd_all):
        if triple not in uni_tb_db:
            uni_tb_db[triple] = table_key(triple[0], triple[1])
    print(len(uni_tb_db))

    # Number of distinct fields per table.
    fb_in_tb = {}
    for owner_tb in uni_tb_db.values():
        fb_in_tb[owner_tb] = fb_in_tb.get(owner_tb, 0) + 1

    # Tables that never occur on one side get an explicit zero there.
    for name in fb_in_tb:
        tb_out.setdefault(name, 0)
        tb_in.setdefault(name, 0)
    print(len(fb_in_tb))
    print(f"len of out : {len(tb_out)}")
    print(f"len of in : {len(tb_in)}")

    merge_dict = {
        name: [fd_count, tb_out[name], tb_in[name]]
        for name, fd_count in fb_in_tb.items()
    }
    save_tb_dict_to_csv_with_degree('./process_ret/fb_nums_in_tb_degree.csv', merge_dict)
    # save_dict_to_csv('./process_ret/fb_nums_in_tb.csv', fb_in_tb)


# Encoded CSV to analyse (presumably the file produced by the encode step -- confirm).
path = './process_ret/re_code_all.csv'
# Run the per-database statistics; this writes a CSV and prints progress.
sta_tb_num_in_db(path)
# The per-table statistics run is currently disabled.
# sta_fd_num_in_tb(path)

search graph

import pandas as pd
import csv

def read_data(path):
    """Read a CSV file and return one Python list per column.

    Args:
        path: CSV file parsed by ``pandas.read_csv`` with default options.

    Returns:
        A list of columns in file order, each converted with ``tolist()``.
    """
    table = pd.read_csv(path)
    return [table[label].tolist() for label in table.columns]


def get_fd(path):
    """Return the "from" and "to" field triples of an encoded CSV.

    A field is the tuple ``(db, tb, fd)`` taken from columns 1-3 for the
    "from" side and columns 4-6 for the "to" side of the file loaded by
    ``read_data``.

    Args:
        path: Encoded CSV file to read.

    Returns:
        A 4-tuple ``(fd_from, fd_to, fd_from_uni, fd_to_uni)``: the two
        row-aligned lists of triples, followed by the set of distinct
        triples on each side.
    """
    data = read_data(path)
    fd_from = list(zip(data[1], data[2], data[3]))
    fd_to = list(zip(data[4], data[5], data[6]))
    return fd_from, fd_to, set(fd_from), set(fd_to)


def cons_graph(path):
    """Build an adjacency-list graph of fields from an encoded CSV.

    Nodes are strings of the form ``"<db>_<tb>_<fd>"``. Every field seen on
    either side becomes a node; every row adds its "to" node to the
    neighbour list of its "from" node (repeated rows add repeated entries).
    The total and distinct field counts are printed.

    Args:
        path: Encoded CSV file, read through ``get_fd``.

    Returns:
        A dict mapping each node to the list of nodes it points to.
    """
    def node_id(triple):
        return "_".join(str(part) for part in triple)

    sources, targets, _, _ = get_fd(path)
    every_fd = sources + targets
    print(f"len of all fd : {len(every_fd)}")

    # First pass: register every node so nodes without outgoing edges still
    # appear, with an empty neighbour list.
    adjacency = {}
    for triple in every_fd:
        adjacency.setdefault(node_id(triple), [])
    print(f"len of uni fd : {len(adjacency)}")

    # Second pass: one directed edge per row.
    for src, dst in zip(sources, targets):
        adjacency[node_id(src)].append(node_id(dst))

    return adjacency




# def dfs(graph, start, end, path=[], paths=[]):
#     path = path + [start]
#     if start == end:
#         paths.append(path)
#     if start not in graph:
#         return paths
#     for node in graph[start]:
#         if node not in path:
#             paths = dfs(graph, node, end, path, paths)
#     return paths


def dfs(graph, start, path=None, paths=None):
    """Collect every simple path that begins at ``start``.

    The graph is walked depth-first; a neighbour already on the current path
    is skipped, so no path visits a node twice. A path is recorded after all
    of its extensions have been explored.

    Args:
        graph: Mapping from node to an iterable of neighbour nodes.
        start: Node to start from. If it is not a key of ``graph`` nothing
            is recorded for it.
        path: Nodes already visited on the way to ``start``; used by the
            recursion. Defaults to an empty prefix.
        paths: List that found paths are appended to; used by the recursion.
            A fresh list is created when omitted, so separate top-level
            calls no longer share results through a mutable default.

    Returns:
        The list of paths, each a list of nodes starting at the original
        start node.
    """
    path = ([] if path is None else path) + [start]
    if paths is None:
        paths = []
    if start not in graph:
        return paths
    for node in graph[start]:
        if node not in path:
            paths = dfs(graph, node, path, paths)
    paths.append(path)  # record the path from the origin to the current node
    return paths


def save_graph(graph, filename):
    """Dump an adjacency-list graph to a text file, one node per line.

    Each line has the form ``node: n1, n2, ...``. A confirmation message is
    printed once all lines are written.

    Args:
        graph: Mapping from node (string) to a list of neighbour strings.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as out:
        out.writelines(
            f"{node}: {', '.join(neighbors)}\n"
            for node, neighbors in graph.items()
        )
        print(f"Graph data saved to {filename}")


def load_graph(filename):
    """Load an adjacency-list graph from the text format of ``save_graph``.

    Each non-blank line must look like ``node: n1, n2, ...``. Whitespace
    around the node and around each neighbour is removed and empty neighbour
    entries are dropped, so a node saved with no neighbours loads as an
    empty list and neighbours compare equal to the node keys.

    Args:
        filename: Text file to read.

    Returns:
        A dict mapping each node to its list of neighbour names.

    Raises:
        ValueError: If a non-blank line contains no ``':'`` separator.
    """
    graph = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            node, separator, neighbor_text = line.partition(':')
            if not separator:
                raise ValueError(f"malformed graph line (missing ':'): {line!r}")
            graph[node.strip()] = [
                name.strip() for name in neighbor_text.split(',') if name.strip()
            ]
    return graph

# Encoded CSV to read and the text file the adjacency list is written to.
path = './process_ret/re_code_all.csv'
g_save = './process_ret/graph.txt'
# Build the field graph and persist it.
G = cons_graph(path)
save_graph(G,g_save)
# Sample node keys in "<db>_<tb>_<fd>" form. `start` is only referenced by the
# commented-out dfs call that follows; `end` is not used by any visible code.
start = '8_51_67'
end = '10_3616_10974'
# PATH = dfs(G,start)
# print(PATH)
# print(g)
# for data in g.items():
#     key,v = data
#     if(len(v) != 0):
#         print(key)