python提取docx大纲标题并转换成树形json

最新推荐文章于 2024-08-05 11:02:24 发布

落雨飞辰

最新推荐文章于 2024-08-05 11:02:24 发布

阅读量595

点赞数

分类专栏： python 文章标签： python json 开发语言

如有错误，欢迎批评指正；转载请注明出处，谢谢。

本文链接：https://blog.csdn.net/bigsungod/article/details/134048404

版权

python 专栏收录该内容

2 篇文章

订阅专栏

该代码示例展示了如何利用Python处理Word文档，提取并组织标题信息形成层次结构。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import json
from collections import defaultdict

from docx import Document

# Collect the heading paragraphs of the document, in document order.
#   pos  -- numeric outline level parsed from each heading style name
#           (e.g. 1 for a style named "Heading 1")
#   cont -- text of each heading paragraph, index-aligned with pos
cont = []
pos = []
from docx import Document  # repeated import, kept so this section runs on its own
obj = Document('t.docx')
for p in obj.paragraphs:
    style_name = p.style.name
    if not style_name.startswith('Heading'):
        continue
    # Parse everything after the "Heading" prefix instead of only the last
    # character, so a style name without a numeric suffix is skipped rather
    # than raising ValueError, and a multi-digit suffix is read in full.
    level_text = style_name[len('Heading'):].strip()
    if not level_text.isdecimal():
        continue
    level = int(level_text)
    if level < 1:
        # Levels below 1 cannot be placed under the tree root (which uses 0).
        continue
    pos.append(level)
    cont.append(p.text)

# Map each heading index to a small record holding that heading's title.
relations = []
infos = {idx: {'title': cont[idx]} for idx in range(len(pos))}

# Outline keys, one slot per heading; '0' marks a slot that has no key yet.
# Intended numbering once the keys have been filled in and normalised:
#   levels: [1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 3, 3]
#   keys:   ['1', '1-1', '1-2', '2', '2-1', '2-2', '2-3', '3', '3-1', '3-2', '3-2-1', '3-2-2']
keys = ['0' for _ in pos]
def dfs(p, f):
    """Assign raw outline keys to heading ``p`` and the headings that follow it.

    Reads the module-level list ``pos`` (heading levels) and fills the
    module-level list ``keys`` in place. Only slots still holding the
    placeholder ``'0'`` are written; slots that already have a key are kept
    and still count as a numbered sibling.

    Raw keys have the form ``f + '-' + str(n)``, and the prefix handed down
    to a heading's children is that heading's key plus ``'-'``. With
    ``f == ''`` this yields e.g. ``'-1'``, ``'-1--1'``, ``'-1--2'``; the
    caller is expected to strip the outer dashes and collapse the doubled
    ones afterwards.

    Args:
        p: Index in ``pos`` of the first heading to number.
        f: Raw key prefix of the (virtual) parent of heading ``p``.

    Returns:
        None. Does nothing when ``p`` is not a valid index into ``pos``
        (for example when there are no headings at all).
    """
    if not 0 <= p < len(pos):
        return

    base_level = pos[p]
    # Each stack entry is [level, raw prefix for its children, next child number].
    # The bottom entry is a virtual parent one level above heading p; since
    # every processed level is >= base_level it is never popped.
    stack = [[base_level - 1, f, 1]]

    for i in range(p, len(pos)):
        level = pos[i]
        if level < base_level:
            # Any shallower heading (not just one exactly one level up) ends
            # the run that started at p; later headings belong elsewhere.
            break

        # Close every open heading at the same or a deeper level, so the top
        # of the stack is the nearest shallower heading, i.e. the parent.
        while level <= stack[-1][0]:
            stack.pop()

        parent = stack[-1]
        if keys[i] == '0':
            keys[i] = parent[1] + '-' + str(parent[2])
        parent[2] += 1
        stack.append([level, keys[i] + '-', 1])
# Number the headings starting from the first one. There is nothing to
# number when the document contains no headings, so skip the call then.
if pos:
    dfs(0, '')

# dfs builds raw keys such as '-1--2'; drop the outer dashes and collapse the
# doubled ones to obtain the final form '1-2'.
keys = [e.strip('-').replace('--', '-') for e in keys]
def generate_tree(arr):
    """Nest the headings into a tree according to their levels.

    Each heading becomes a dict with the entries ``id`` (its level),
    ``key`` (taken from the module-level ``keys``), ``title`` (taken from
    the module-level ``cont``) and ``children``. A heading is attached to
    the closest preceding heading whose level is strictly smaller, or to
    the root when there is none.

    Args:
        arr: Heading levels in document order, index-aligned with the
            module-level ``keys`` and ``cont`` lists.

    Returns:
        The list of top-level heading dicts (the children of a root node
        whose level is 0).
    """
    root = {'id': 0, 'children': []}
    open_nodes = [root]  # chain of headings from the root to the latest one
    for index, level in enumerate(arr):
        # Discard open headings that are not shallower than this one.
        while open_nodes[-1]['id'] >= level:
            open_nodes.pop()
        entry = dict(id=level, key=keys[index], title=cont[index], children=[])
        open_nodes[-1]['children'].append(entry)
        open_nodes.append(entry)
    return root['children']


# Build the nested outline, then dump the intermediate lists and the tree.
arr = generate_tree(pos)
for dumped in (keys, pos, cont, arr):
    print(dumped)