Python 提取 docx 大纲标题并转换成树形 JSON

import json
from collections import defaultdict

from docx import Document

# get info
# Walk the document once and record every heading paragraph, in document
# order, as two parallel lists:
#   cont[i] - text of the i-th heading
#   pos[i]  - numeric level parsed from its style name ("Heading 2" -> 2)
cont = []
pos = []
# Duplicate of the top-of-file import; harmless because imports are cached.
from docx import Document
obj = Document('t.docx')
for p in obj.paragraphs:
    style_name = p.style.name
    if not style_name.startswith('Heading'):
        continue
    # Parse everything after "Heading" instead of only the last character,
    # so a heading-like style without a purely numeric suffix (e.g. a custom
    # "Heading" or "Heading 1 Custom" style) is skipped rather than aborting
    # the whole run with ValueError.
    try:
        level = int(style_name[len('Heading'):])
    except ValueError:
        continue
    pos.append(level)
    cont.append(p.text)

# infos maps the heading index to {'title': heading text}; relations starts
# empty.
infos = {i: {'title': title} for i, title in enumerate(cont)}
relations = []

# construct keys
# input=[1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 3, 3]
# output = ['1', '1-1', '1-2', '2', '2-1', '2-2', '2-3', '3', '3-1', '3-2', '3-2-1', '3-2-2']
# '0' marks a heading that has not been numbered yet.
keys = ['0'] * len(pos)


def dfs(p, f):
    """Number heading ``p``, its following siblings and their descendants.

    Works on the module-level ``pos`` (heading levels) and writes the
    results into the module-level ``keys`` list in place.  Keys are built
    as raw strings with extra '-' separators; they are normalized after the
    traversal.

    Args:
        p: Index (into ``pos``/``keys``) of the first heading of a sibling
            group.
        f: Key prefix inherited from the parent ('' at the top level).

    Returns:
        None.
    """
    x = 1
    if keys[p] == '0':
        keys[p] = f + '-' + str(x)
    x += 1
    # ``th`` is the key of the most recently numbered heading on this
    # level; it becomes the prefix for any children that follow it.
    th = keys[p]
    for i in range(p + 1, len(pos)):
        if pos[i] - 1 == pos[p]:
            # One level deeper: a child of the latest sibling.  A child that
            # is already numbered was handled (together with its subtree)
            # by the call made for its first sibling, so descending again
            # would change nothing.
            if keys[i] == '0':
                dfs(i, th + '-')
        elif pos[i] == pos[p]:
            # Same level: the next sibling in this group.
            if keys[i] == '0':
                keys[i] = f + '-' + str(x)
                x += 1
                th = keys[i]
        elif pos[i] + 1 == pos[p]:
            # Back at the parent's level: this sibling group is finished.
            return


# A document without headings has nothing to number; calling dfs(0, '')
# on an empty list would raise IndexError.
if pos:
    dfs(0, '')

# Normalize the raw keys: drop the leading '-' and collapse the doubled
# separators, e.g. '-1--2' -> '1-2'.
keys = [e.strip('-').replace('--', '-') for e in keys]
def generate_tree(arr):
    """Nest a flat list of heading levels into a tree of dict nodes.

    Reads the module-level ``keys`` and ``cont`` lists, which must be
    index-aligned with ``arr``.

    Args:
        arr: Heading levels in document order.

    Returns:
        The list of top-level nodes.  Each node is a dict with 'id' (the
        heading level), 'key', 'title' and 'children' (a list of nested
        nodes of the same shape).
    """
    # Sentinel root at level 0 so every real heading has a parent.
    root = {'id': 0, 'children': []}
    # Chain of currently open ancestors; the last entry is the innermost.
    ancestors = [root]
    for index, level in enumerate(arr):
        entry = {
            'id': level,
            'key': keys[index],
            'title': cont[index],
            'children': [],
        }
        # Close every open node that is at the same level or deeper; what
        # remains on top is the nearest shallower heading, i.e. the parent.
        while ancestors[-1]['id'] >= level:
            ancestors.pop()
        ancestors[-1]['children'].append(entry)
        ancestors.append(entry)
    return root['children']


# Build the nested heading tree, then print the intermediate lists followed
# by the tree itself.
arr = generate_tree(pos)
print(keys)
print(pos)
print(cont)
# Emit the tree as real JSON (double-quoted, parseable by any JSON reader)
# rather than the Python repr.  ensure_ascii=False keeps non-ASCII heading
# titles readable instead of escaping them as \uXXXX.
print(json.dumps(arr, ensure_ascii=False, indent=2))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值