目录
前言
目录知识图谱是一种数据结构,它将文件系统的目录结构和文件属性映射为图形结构。在这个图谱中,节点可以代表文件或目录,边可以代表目录结构(例如,一个目录包含另一个目录或文件)。节点的属性可以包含文件或目录的元数据,如创建时间、修改时间、访问时间等。以下是我的代码,它构建了目录知识图谱的数据源。
一、配置文件
配置文件设置了一些参数,如要处理的文件夹,抽取实体和关系的存储路径等。这样通过修改配置文件可以达到提取任何文件夹的目录结构。
config.yaml:
cwd: ???
root_dir: 'centos7/etc' #要处理的文件夹
excel_path: 'excel' #抽取实体和关系的excel存储路径
triple_number_path: 'triples_number' #抽取实体和关系的三元组存储路径(数字化)
二、实体属性存储文件
节点的属性可以存储到一个excel文件里,然后给节点编号(后面D3.js会用到)。
run_excel.py:
import os
import pandas as pd
from datetime import datetime
import time
from omegaconf import DictConfig
import hydra
from hydra import utils
# Collect filesystem metadata for a single file or directory node.
def get_attributes(path, root_dir):
    """Return a metadata dict for *path*.

    Keys:
        'Path'          : basename of the entry.
        'Parent'        : path relative to the directory CONTAINING
                          root_dir, '' for root_dir itself.
        'Creation Time' : ctime string from os.stat.
        'Type'          : 'Directory' or 'File'.

    On an OS error the same key set is returned with None values
    (except 'Path') so the DataFrame built from these dicts keeps a
    uniform column layout.
    """
    try:
        stat = os.stat(path)
        # Relative to root_dir's parent so the root keeps its own name
        # as the first component of every descendant's 'Parent'.
        relative_path = os.path.relpath(path, os.path.dirname(root_dir))
        parent_dir = '' if path == root_dir else os.path.dirname(relative_path)
        return {
            'Path': os.path.basename(path),
            'Parent': parent_dir,
            'Creation Time': time.ctime(stat.st_ctime),
            'Type': 'Directory' if os.path.isdir(path) else 'File',
        }
    except OSError as e:
        # Best-effort: report and return a placeholder row. 'Parent' is
        # included (the original omitted it) so columns stay consistent.
        print(f"Error processing {path}: {e}")
        return {'Path': os.path.basename(path), 'Parent': None,
                'Creation Time': None, 'Type': None}
# Entry point: scan cfg.root_dir and dump every node's attributes to Excel.
@hydra.main(config_path='conf', config_name='config', version_base='1.1')
def main(cfg: DictConfig):
    """Collect attributes of cfg.root_dir and everything below it and
    write them, with a numeric 'Index' column, to <excel_path>/ner_all.xlsx.
    """
    original_cwd = utils.get_original_cwd()
    cfg.cwd = original_cwd
    # Resolve the directory to scan and the spreadsheet output directory.
    root_dir = os.path.join(original_cwd, cfg.root_dir)
    output_dir = os.path.join(original_cwd, cfg.excel_path)
    os.makedirs(output_dir, exist_ok=True)
    # The root node is recorded first, so it always receives index 0.
    records = [get_attributes(root_dir, root_dir)]
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # In-place sorts keep the traversal deterministic (dirnames.sort()
        # also steers the order in which os.walk descends).
        dirnames.sort()
        filenames.sort()
        # Subdirectories first, then files, mirroring the original order.
        records.extend(
            get_attributes(os.path.join(dirpath, entry), root_dir)
            for entry in dirnames
        )
        records.extend(
            get_attributes(os.path.join(dirpath, entry), root_dir)
            for entry in filenames
        )
    # Persist everything in one sheet; the row index is the node numbering
    # that run_triples.py reads back later.
    frame = pd.DataFrame(records)
    frame.index.name = 'Index'
    frame.to_excel(os.path.join(output_dir, 'ner_all.xlsx'), index=True)
    print("所有属性已成功提取并保存到excel文件中。")


if __name__ == '__main__':
    main()
三、三元组文件
一共两个,分别是ner.csv(存储节点属性,编号和excel保持一致)和relation.csv(存储节点之间的关系)。
run_triples.py:
import os
import pandas as pd
import time
from omegaconf import DictConfig
import hydra
from hydra import utils
# Read the node numbering back out of ner_all.xlsx.
def load_number_mapping(ner_all_path):
    """Return {relative path: index string} from the spreadsheet at
    *ner_all_path*.

    'Index', 'Path' and 'Parent' are read as str with
    keep_default_na=False, so an empty Parent cell stays '' instead of
    becoming NaN.  Each key is Parent<os.sep>Path, or just Path for the
    root row whose Parent is ''.
    """
    table = pd.read_excel(
        ner_all_path,
        dtype={'Index': str, 'Path': str, 'Parent': str},
        keep_default_na=False,
    )
    mapping = {}
    for _, row in table.iterrows():
        parent = str(row['Parent'])
        name = str(row['Path'])
        key = name if parent == '' else parent + os.sep + name
        mapping[key] = row['Index']
    return mapping
def get_attributes(path, number_mapping, root_dir):
    """Return a dict describing *path* for the triple CSV files.

    Keys:
        'number'        : index looked up in number_mapping via the path
                          relative to root_dir's parent (None if absent).
        'Path'          : basename of the entry.
        'Parent'        : relative parent path, '' for root_dir itself.
        'Creation Time' : ctime string from os.stat.
        'Type'          : 'Directory' or 'File'.
        'Depth'         : 1 for root_dir, +1 per level below it.

    On an OS error every key except 'number' and 'Path' is None. The key
    set is kept identical to the success case because the rows are later
    written positionally to a header-less CSV.
    """
    # Relative to the directory containing root_dir, matching the keys
    # produced by load_number_mapping.
    relpath_path = os.path.relpath(path, os.path.dirname(root_dir))
    number = number_mapping.get(relpath_path)
    try:
        stat = os.stat(path)
        parent_dir = '' if path == root_dir else os.path.dirname(relpath_path)
        # Depth = number of separators in the relative path, plus one.
        depth = relpath_path.count(os.sep) + 1
        return {
            'number': number,
            'Path': os.path.basename(path),
            'Parent': parent_dir,
            'Creation Time': time.ctime(stat.st_ctime),
            'Type': 'Directory' if os.path.isdir(path) else 'File',
            'Depth': depth,
        }
    except OSError as e:
        print(f"Error processing {path}: {e}")
        # 'Parent' is included here (the original omitted it) so the
        # fallback row has the same columns as a successful row.
        return {'number': number, 'Path': os.path.basename(path),
                'Parent': None, 'Creation Time': None,
                'Type': None, 'Depth': None}
# Entry point: emit ner.csv (node rows) and relation.csv (include-triples).
@hydra.main(config_path='conf', config_name='config', version_base='1.1')
def main(cfg: DictConfig):
    """Walk cfg.root_dir and write two header-less CSVs under
    cfg.triple_number_path: ner.csv (one row per node, numbered to match
    excel/ner_all.xlsx) and relation.csv ((head, tail, 'include') triples).
    """
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    root_dir = os.path.join(cwd, cfg.root_dir)
    result_dir = os.path.join(cwd, cfg.triple_number_path)
    os.makedirs(result_dir, exist_ok=True)
    # Node numbers must match the spreadsheet written by run_excel.py.
    # NOTE(review): the 'excel' directory is hard-coded here rather than
    # taken from cfg.excel_path — kept as-is, but the two should agree.
    ner_all_path = os.path.join(cwd, 'excel', 'ner_all.xlsx')
    number_mapping = load_number_mapping(ner_all_path)
    csv_filename_ner = os.path.join(result_dir, 'ner.csv')
    csv_filename_relation = os.path.join(result_dir, 'relation.csv')
    # Remove stale outputs up front: every write below appends (mode='a').
    for stale in (csv_filename_ner, csv_filename_relation):
        if os.path.exists(stale):
            os.remove(stale)
    # The root node is written first, matching its index in the mapping.
    df_root = pd.DataFrame([get_attributes(root_dir, number_mapping, root_dir)])
    df_root.to_csv(csv_filename_ner, mode='a', index=False, header=False)
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # An empty directory contributes no children and no triples.
        if not dirnames and not filenames:
            continue
        # In-place sorts keep the traversal deterministic (dirnames.sort()
        # also steers the order in which os.walk descends).
        dirnames.sort()
        filenames.sort()
        head = get_attributes(dirpath, number_mapping, root_dir)['number']
        tail_rows = []
        relationships = []
        # Directories first, then files — each child yields one node row
        # and one (head, tail, 'include') triple. get_attributes is
        # called once per child (the original stat-ed each file twice).
        for name in dirnames + filenames:
            child = get_attributes(os.path.join(dirpath, name),
                                   number_mapping, root_dir)
            tail_rows.append(child)
            relationships.append((head, child['number'], 'include'))
        pd.DataFrame(tail_rows).to_csv(
            csv_filename_ner, mode='a', index=False, header=False)
        pd.DataFrame(relationships, columns=['head', 'tail', 'Relation']).to_csv(
            csv_filename_relation, mode='a', index=False, header=False)
    print("所有属性已成功提取并保存到CSV文件中。")


if __name__ == '__main__':
    main()
四、csv转化成json
之所以转化,是因为D3.js中的力导图要求。
csv_json.py:
"""Convert the triple CSVs (relation.csv / ner.csv) into the JSON layout
required by the D3.js force-directed graph.

Outputs:
    relation.json : {"links": [...], "nodes": [...]} — the combined graph.
    ner.json      : {index: {name, parent_path, time, type}} detail map.
"""
import csv
import json

# Input CSV files produced by run_triples.py (header-less).
csv_re = 'relation.csv'
csv_ner = 'ner.csv'
# Output JSON files consumed by the D3.js front end.
json_re = 'relation.json'
json_ner = 'ner.json'

json_links = {"links": []}
json_nodes = {"nodes": []}
json_item = {}

# relation.csv rows: head number, tail number, relation label.
# encoding fixed to utf-8 so behavior does not depend on the locale.
with open(csv_re, mode='r', newline='', encoding='utf-8') as csv_file:
    for source, target, relation in csv.reader(csv_file):
        json_links["links"].append({
            "relation": relation,
            "source": int(source),
            "target": int(target)
        })

# ner.csv rows: index, name, parent path, creation time, type, depth.
# Local names chosen so the builtins `type` (and stdlib `time`) are not
# shadowed as they were originally.
with open(csv_ner, mode='r', newline='', encoding='utf-8') as csv_file:
    for index, name, parent_path, ctime, node_type, depth in csv.reader(csv_file):
        # Full per-node detail, keyed by the node's index string.
        json_item[index] = {
            "name": name,
            "parent_path": parent_path,
            "time": ctime,
            "type": node_type
        }
        # Slimmer record for the force-graph node list.
        json_nodes["nodes"].append({
            "index": int(index),
            "name": name,
            "type": node_type,
            "depth": int(depth)
        })

# The D3 force layout wants links and nodes in a single document.
json_combined = {**json_links, **json_nodes}
with open(json_re, mode='w', encoding='utf-8') as json_file:
    json.dump(json_combined, json_file, indent=4)
with open(json_ner, mode='w', encoding='utf-8') as json_file:
    json.dump(json_item, json_file, indent=4)
print("JSON文件已生成!")
总结
先提取出csv文件,然后再转换成json文件。可能有人会问为啥不直接转换成json文件,因为csv文件还要导入图数据库。后期还要用图数据库打造一些东西。