如何自己构建目录知识图谱的数据源(二)

小酋仍在学习

已于 2024-07-02 22:12:40 修改

阅读量368

点赞数 12

文章标签：前端 python java

于 2024-07-02 22:09:14 首次发布

本文链接：https://blog.csdn.net/qq_47753695/article/details/140135476

版权

引言

与第一期相比，增加了对文件的处理。如果是非空的 docx 文件，会把文件内部的标题实体和关系也提取出来；对 .txt 和 .py 文件，则增加一个 content 属性（保存文件的全部文本内容）；其余类型的文件只记录基本属性。

最后的可视化效果：

箭头方向表示包含关系：01.docx 是文档，1.2、1.3 等是该文档中的章节。

代码

ner_excel.py:

import os
import pandas as pd
from datetime import datetime
import time
from omegaconf import DictConfig
import hydra
from hydra import utils
import re
from docx import Document
import sys


# 获取文件或文件夹时间属性的函数
def get_attributes(path,root_dir):
    try:
        stat = os.stat(path)
        relative_path = os.path.relpath(path, os.path.dirname(root_dir))
        # parent_dir = '' if path == root_dir else os.path.dirname(relative_path)
        attributes = {
                'Name': os.path.basename(path),
                'Path': path,
                'Creation Time':  time.ctime(stat.st_ctime),
                # 'Modification Time': datetime.fromtimestamp(stat.st_mtime),
                # 'Access Time': datetime.fromtimestamp(stat.st_atime),
                'Type': 'Directory' if os.path.isdir(path) else 'File',
                'Title': None,
                'Content': None,
                # 'parent_title': None,   
            }
        if os.path.isfile(path):
            if path.endswith('.txt') or path.endswith('.py'):
                with open(path, 'r', encoding='utf-8') as file:
                    attributes['Content'] = file.read()
        return attributes
        
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return {'Path': os.path.basename(path), 'Creation Time': None, 'Type': None}

def dir_content(doc_path, root_dir):
    all_attributes = []
 
    # 提取标题层级
    def get_heading_level(paragraph):
        if paragraph.style.name.startswith('Heading'):
            # 提取标题级别（Heading 1, Heading 2, etc.）
            level = int(paragraph.style.name.split(' ')[-1])
            return level
        return None

    document = Document(doc_path)     
    doc_name = os.path.basename(doc_path)
    
    # 当前段落内容
    current_content = []
    number = 0
    # 栈，存储标题和标题级别
    stack = [(number, doc_name, 0, [])]
    
    for paragraph in document.paragraphs:
        level = get_heading_level(paragraph)
        if level is not None:
            current_heading = paragraph.text.strip()
            stack[-1][3].extend(current_content)
            current_content = []
            number += 1
            stack.append((number, current_heading, level, [current_heading]))
        else:
            current_content.append(paragraph.text.strip())
    
    stack[-1][3].extend(current_content)

    # 更新每个节点的内容
    for i in range(0, len(stack) - 1):
        j = i + 1
        while j < len(stack) and stack[i][2] < stack[j][2]:
            stack[i][3].extend(stack[j][3])
            j += 1


    # 遍历stack
    for i in range(len(stack)):
        current_number, current_title, current_level, current_content = stack[i]
        attribute=get_attributes(doc_path,root_dir)
        current_content = ','.join(current_content).strip('"')
        
        # 查找父标题
        parent_title = None
        for j in range(i - 1, -1, -1):
            if stack[j][2] < current_level:
                parent_title = stack[j][1]
                break
        attribute['Content']=current_content
        attribute['Title']=current_title
        # attribute['parent_title']=parent_title
        all_attributes.append(attribute) 
        
    return  all_attributes



@hydra.main(config_path='conf', config_name='config', version_base='1.1')
def main(cfg: DictConfig):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd  

    # 定义要处理的根目录和结果存储目录
    root_dir = os.path.join(cwd,cfg.root_dir)  
    result_dir = os.path.join(cwd,cfg.excel_path)  
    os.makedirs(result_dir, exist_ok=True)

    # 获取根目录本身的属性
    all_attributes = [get_attributes(root_dir,root_dir)]
    # 遍历根目录下的所有文件夹和文件
    for dirpath, dirnames, filenames in os.walk(root_dir):
        dirnames.sort()
        filenames.sort() 
        # 获取当前文件夹下所有子文件夹的属性
        for dirname in dirnames:
            all_attributes.append(get_attributes(os.path.join(dirpath, dirname),root_dir))
        
        # 获取当前文件夹下所有文件的属性
        for filename in filenames:
            # all_attributes.append(get_attributes(os.path.join(dirpath, filename),root_dir))
            if filename.endswith('.docx') and os.path.getsize(os.path.join(dirpath,filename)) > 0:
               all_attributes.extend(dir_content(os.path.join(dirpath, filename),root_dir))
            else:
               all_attributes.append(get_attributes(os.path.join(dirpath, filename),root_dir))

    # 将所有属性保存到一个CSV文件
    df = pd.DataFrame(all_attributes)
    
    csv_filename = os.path.join(result_dir, 'ner_all.xlsx')
    df.index.name = 'ID'
    df.to_excel(csv_filename, index=True)

    print("所有属性已成功提取并保存到excel文件中。")

if __name__ == '__main__':
    main()

triples_run.py

import os
import pandas as pd
import time
from omegaconf import DictConfig
from docx import Document
import hydra
from hydra import utils
import re



# 从ner_all.csv中读取路径和编号映射关系
def load_number_mapping(ner_all_path):

    df = pd.read_excel(ner_all_path,dtype={'Path':str,'Title':str})
   
    df['path_title'] = df.apply(lambda row: os.path.join(str(row['Path']), str(row['Title'])), axis=1)
    number_mapping_title = dict(zip(df['path_title'], df['ID']))
    number_mapping=dict(zip(df['Path'], df['ID']))
    return number_mapping_title,number_mapping


  
def get_attributes(path, number_mapping,root_dir):
    # 获取从根目录到当前路径的相对路径
    relpath_path = os.path.relpath(path, os.path.dirname(root_dir))
    # 获取当前路径的编号
    number = number_mapping.get(path, None)

    try:
        stat = os.stat(path)
        parent_dir = '' if path == root_dir else os.path.dirname(relpath_path)
        # 计算当前路径相对于root_dir的深度
        depth = relpath_path.count(os.sep) + 1
        
        attributes = {
            'Number': number,
            'Name': os.path.basename(path),
            'Parent': parent_dir,
            'Path': relpath_path,
            'Creation Time': time.ctime(stat.st_ctime),
            'Type': 'Directory' if os.path.isdir(path) else 'File' ,
            'Depth': depth ,
            'Title': None,
            'Content': None,
            'Parent_title': None, 
        }
        
        if os.path.isfile(path):
            if path.endswith('.txt') or path.endswith('.py'):
                with open(path, 'r', encoding='utf-8') as file:
                    attributes['Content'] = file.read()
        return attributes
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return {'Number': number, 'Path': os.path.basename(path), 'Creation Time': None,
                'Type': None, 'Depth': None, 'Content': f"Error processing file: {e}"}
    
def dir_content(doc_path, root_dir,number_mapping_title,number_mapping):
    all_attributes = []
    relationships = []
    
    # 提取标题层级
    def get_heading_level(paragraph):
        if paragraph.style.name.startswith('Heading'):
            # 提取标题级别（Heading 1, Heading 2, etc.）
            level = int(paragraph.style.name.split(' ')[-1])
            return level
        return None

    document = Document(doc_path)     
    doc_name = os.path.basename(doc_path)
    
    # 当前段落内容
    current_content = []
    path_title=os.path.join(doc_path,doc_name)
    number = number_mapping_title.get(path_title, None)
    # 栈，存储标题和标题级别
    stack = [(number, doc_name, 0, [])]
    
    for paragraph in document.paragraphs:
        level = get_heading_level(paragraph)
        if level is not None:
            current_heading = paragraph.text.strip()
            stack[-1][3].extend(current_content)

            current_content = []
            doc_path_title= os.path.join(doc_path,current_heading)
            number = number_mapping_title.get(doc_path_title, None)
            stack.append((number, current_heading, level, [current_heading]))
        else:
            current_content.append(paragraph.text.strip())
    
    stack[-1][3].extend(current_content)

    # 更新每个节点的内容
    for i in range(0, len(stack) - 1):
        j = i + 1
        while j < len(stack) and stack[i][2] < stack[j][2]:
            stack[i][3].extend(stack[j][3])
            j += 1

        
    # 遍历stack
    for i in range(len(stack)):
        current_number, current_title, current_level, current_content = stack[i]
        attribute=get_attributes(doc_path,number_mapping,root_dir)
        current_content = ','.join(current_content)
        # 查找父标题
        parent_title = ''
        for j in range(i - 1, -1, -1):
            if stack[j][2] < current_level:
                parent_title = stack[j][1]
                break
        if parent_title:
            doc_path_fa_title=os.path.join(doc_path,parent_title) 
            parent_number=number_mapping_title.get(doc_path_fa_title, None)
            relationships.append((parent_number,current_number,'include'))
        else:
            attr=get_attributes(os.path.dirname(doc_path),number_mapping,root_dir)
            parent_number=attr['Number']
            relationships.append((parent_number,current_number,'include'))
        attribute['Number']=current_number
        attribute['Content']=current_content
        attribute['Title']=current_title
        attribute['Parent_title']=parent_title

        all_attributes.append(attribute) 
                  
    return  all_attributes,relationships
   
    


@hydra.main(config_path='conf', config_name='config', version_base='1.1')
def main(cfg: DictConfig):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    # 定义要处理的根目录和结果存储目录
    root_dir = os.path.join(cwd, cfg.root_dir)
    result_dir = os.path.join(cwd, cfg.triple_number_path)
    os.makedirs(result_dir, exist_ok=True)
    # 读取ner_all.xlsx中的路径和编号映射关系
    ner_all_path = os.path.join(cwd, 'excel','ner_all.xlsx')
    number_mapping_title,number_mapping = load_number_mapping(ner_all_path)
    # 定义结果文件路径
    csv_filename_ner = os.path.join(result_dir, f'ner.csv')
    csv_filename_relation = os.path.join(result_dir, f'relation.csv')
    # 在循环开始之前删除文件
    if os.path.exists(csv_filename_ner):
        os.remove(csv_filename_ner)

    if os.path.exists(csv_filename_relation):
        os.remove(csv_filename_relation)
    # 开始处理根目录
    root_attributes = [get_attributes(root_dir, number_mapping,root_dir)]
    df_ner = pd.DataFrame(root_attributes)
    df_ner.to_csv(csv_filename_ner, mode='a',index=False, header=False)
    # 获取当前目录下所有子文件夹和文件的属性
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if not dirnames and not filenames:
           continue
        dirnames.sort()
        filenames.sort()
        all_attributes_tail = []
        relationships = []
        # 获取子文件夹的属性
        all_attributes_head = [get_attributes(dirpath, number_mapping,root_dir)]
        head = all_attributes_head[0]['Number']
        for dirname in dirnames:   
            sub_dirpath = os.path.join(dirpath, dirname)
            tail_attributes = get_attributes(sub_dirpath, number_mapping,root_dir)
            all_attributes_tail.append(tail_attributes)
            tail = tail_attributes['Number']
            relationships.append((head, tail, 'include'))

        # 获取文件的属性
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
           
            if  filepath.endswith('.docx') and os.path.getsize(filepath) > 0:
                some_attributes,some_relationships =dir_content(filepath,root_dir,number_mapping_title,number_mapping)
                all_attributes_tail.extend(some_attributes)
                relationships.extend(some_relationships)
                
            else:
                tail_attributes = get_attributes(filepath, number_mapping,root_dir)
                tail = tail_attributes['Number']
                relationships.append((head, tail, 'include'))
                all_attributes_tail.append(tail_attributes)


        df_tail = pd.DataFrame(all_attributes_tail)
        df_tail.to_csv(csv_filename_ner, mode='a',index=False, header=False)

        df_relation = pd.DataFrame(relationships, columns=['head', 'tail', 'Relation'])
        df_relation.to_csv(csv_filename_relation, mode='a',index=False, header=False)
    print("所有属性已成功提取并保存到CSV文件中。")

if __name__ == '__main__':
    main()

csv_json_ner.py

import csv
import json

# 输入CSV文件名
csv_re = 'triples_number/relation.csv'
csv_ner = 'triples_number/ner.csv'
# 输出JSON文件名
json_re = 'triples_number/relation.json'
json_ner = 'triples_number/ner.json'

# 初始化一个空的字典，用于存储最终的JSON数据
json_links = {"links": []}
json_nodes = {"nodes": []}
json_item = {}

# 读取关系CSV文件
with open(csv_re, mode='r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        source, target, relation = row
        json_links["links"].append({
            "relation": relation,
            "source": int(source),
            "target": int(target)
        })

# 读取NER CSV文件
with open(csv_ner, mode='r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    
    for row in csv_reader:
        number, name, parent_path, path, time, type, depth, title, content, parent_title = row
        
        json_item[number] = {
            "name": name,
            "parent_path": parent_path,
            "path": path,
            "time": time,
            "type": type,
            "content": content.strip(),
        }

        json_nodes["nodes"].append({
            "index": int(number),
            "name": name,
            "type": type,
            "depth": int(depth),
        })

# 合并链接和节点数据
json_combined = {**json_links, **json_nodes}

# 将链接和节点数据写入JSON文件
with open(json_re, mode='w', encoding='utf-8') as json_file:
    json.dump(json_combined, json_file, ensure_ascii=False, indent=4)

# 将NER数据写入JSON文件
with open(json_ner, mode='w', encoding='utf-8') as json_file:
    json.dump(json_item, json_file, ensure_ascii=False, indent=4)

print("JSON文件已生成！")

小结

代码虽然可以完成预期任务，但仍有很多需要改进的地方。先存一期，后面再修改。

目录知识图谱的代码，我会上传到我的资源，上次有东西少传了，该部分的代码就是增加了指向的箭头和一个按钮。

小酋仍在学习

关注

12
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
如何自己构建目录知识图谱的数据源(二)

与第一期相比，增加了对文件的处理。如果是docx文件，会把文件内部的标题实体和关系也提取出来；其他文件就是增加了一个content属性（包含文件的所有内容）。最后的可视化效果：箭头指向就是表示包含关系，01.docx是文档，1.2，1.3等是章节。
复制链接

扫一扫