import json
from collections import defaultdict
from docx import Document
# Collect every heading paragraph of the document, in document order.
#   cont[i] -> text of the i-th heading
#   pos[i]  -> outline level of the i-th heading (1 for 'Heading 1', ...)
cont = []
pos = []
obj = Document('t.docx')
for p in obj.paragraphs:
    style_name = p.style.name
    if not style_name.startswith('Heading'):
        continue
    # Parse the whole numeric suffix instead of only the last character, and
    # skip styles whose name merely starts with 'Heading' but carries no
    # level number; int() on such a name would raise ValueError.
    level_text = style_name[len('Heading'):].strip()
    if not level_text.isdecimal():
        continue
    pos.append(int(level_text))
    cont.append(p.text)
# Per-heading metadata, indexed by the heading's position in document order.
relations = []
infos = {index: {'title': cont[index]} for index in range(len(pos))}

# Outline keys, one slot per heading. '0' marks a slot that dfs() has not
# filled in yet.
# Example levels:          [1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 3, 3]
# Keys after the clean-up: ['1', '1-1', '1-2', '2', '2-1', '2-2', '2-3',
#                           '3', '3-1', '3-2', '3-2-1', '3-2-2']
keys = ['0' for _ in pos]
def dfs(p, f):
    """Assign raw outline keys to heading ``p``, its later siblings and their descendants.

    Reads the module-level ``pos`` (heading levels in document order) and
    fills the module-level ``keys`` in place. Slots still holding ``'0'`` are
    treated as unassigned. Keys are written in raw form ``f + '-' + n``, so
    they carry leading and doubled dashes (e.g. ``'-3--2--1'``) that the
    caller normalises afterwards.

    Only headings exactly one level deeper than ``pos[p]`` are treated as
    children; headings that skip a level are left as ``'0'``.

    Args:
        p: Index (into ``pos`` / ``keys``) of the heading to start from.
        f: Raw key prefix of the parent heading (``''`` at the top level).
    """
    # Nothing to number, e.g. the document has no headings at all.
    if p >= len(pos):
        return

    level = pos[p]
    x = 1  # next sibling number to hand out at this level
    if keys[p] == '0':
        keys[p] = f + '-' + str(x)
        x += 1
    th = keys[p]  # raw key of the most recent sibling; prefix for its children

    for i in range(p + 1, len(pos)):
        if pos[i] < level:
            # Left the parent's subtree: whatever follows belongs to another
            # parent, however many levels the outline dropped.
            return
        if pos[i] == level:
            if keys[i] == '0':
                keys[i] = f + '-' + str(x)
                x += 1
            th = keys[i]
        elif pos[i] == level + 1 and keys[i] == '0':
            # The first unnumbered child numbers all of its siblings and their
            # descendants, so already-numbered children need no second visit.
            dfs(i, th + '-')
# Number the outline starting from the first heading. A document without any
# heading leaves pos (and keys) empty, so there is nothing to start from.
if pos:
    dfs(0, '')
# dfs() glues key pieces together with '-', which leaves a leading '-' and a
# doubled '--' between levels (e.g. '-3--2--1'); normalise that to '3-2-1'.
keys = [e.strip('-').replace('--', '-') for e in keys]
def generate_tree(arr):
    """Nest a flat list of heading levels into a tree of dict nodes.

    Each node is ``{'id': level, 'key': ..., 'title': ..., 'children': [...]}``
    where ``key`` and ``title`` come from the module-level ``keys`` and
    ``cont`` at the same index as the level in ``arr``.

    Args:
        arr: Heading levels in document order.

    Returns:
        The list of top-level nodes.
    """
    root = {'id': 0, 'children': []}
    # Chain of currently open ancestors; the last entry is the candidate parent.
    ancestors = [root]
    for index, level in enumerate(arr):
        entry = {
            'id': level,
            'key': keys[index],
            'title': cont[index],
            'children': [],
        }
        # Close every open node that is not strictly shallower than the new one.
        while ancestors[-1]['id'] >= level:
            ancestors.pop()
        ancestors[-1]['children'].append(entry)
        ancestors.append(entry)
    return root['children']
# Build the nested outline and dump every intermediate structure for inspection.
# NOTE: the values are printed as Python reprs, not serialised as JSON.
arr = generate_tree(pos)
for value in (keys, pos, cont, arr):
    print(value)
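# Python: extract the outline headings of a docx file and convert them into a tree-shaped JSON structure.
# (Latest recommended article published on 2024-08-05 11:02:24.)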