问题
从网上下载了一本书的 PDF,发现书签没有分级,对强迫症患者来说很难受,于是用 Python 的 PyPDF2 库处理了一下,踩了点小坑,在这里放一下代码吧。
处理前的书签

处理后的书签

代码
import PyPDF2
# Print the outline as an indented tree.
def print_outLine(outline: list, sub: int = 0):
    """Print every bookmark title, indented by one space per nesting level.

    Args:
        outline: Sequence whose items are either nested lists (children) or
            entries exposing a ``title`` attribute.
        sub: Nesting depth of ``outline``; used as the indent width.
    """
    for entry in outline:
        if not isinstance(entry, list):
            print(" " * sub + entry.title)
            continue
        # A nested list holds the children, printed one level deeper.
        print_outLine(entry, sub + 1)
# Turn a flat bookmark list into a nested one; the number of "." in the
# title's leading token determines the level.
def to_new_outline(outline, end: int, curlevel: int = 0, index: int = 0):
    """Rebuild the flat slice ``outline[index:end]`` as a nested list.

    The depth of an entry is the number of "." characters in the first
    space-separated token of its ``title`` (e.g. "1.2.3 Foo" has depth 2).

    Args:
        outline: Flat sequence of entries exposing a ``title`` string.
        end: Exclusive upper bound of the slice to process (clamped to
            ``len(outline)``).
        curlevel: Depth of the entries that are appended directly to the
            returned list.
        index: Position in ``outline`` at which processing starts.

    Returns:
        A list in which entries at ``curlevel`` appear as-is and every run of
        deeper entries is replaced by one nested list built recursively.
    """
    def depth_of(entry) -> int:
        return entry.title.split(" ")[0].count(".")

    # Never read past the real end of the sequence.
    end = min(end, len(outline))
    new_outline = []
    while index < end:
        line = outline[index]
        # Entries shallower than curlevel are kept at this level as well;
        # recursing on them would never terminate.
        if depth_of(line) <= curlevel:
            new_outline.append(line)
            index += 1
            continue
        # Find the end of the run of deeper entries. The scan is bounded by
        # `end` so an outline that finishes inside a sub-level does not
        # raise IndexError.
        stop = index + 1
        while stop < end and depth_of(outline[stop]) > curlevel:
            stop += 1
        new_outline.append(to_new_outline(outline, stop, curlevel + 1, index))
        index = stop
    return new_outline
# Write the nested bookmark structure into the output PDF.
def set_outline(outline: list, root, writer: PyPDF2.PdfWriter, reader=None):
    """Add every entry of a nested outline to ``writer`` under ``root``.

    Args:
        outline: List whose items are either bookmark entries (exposing
            ``title`` and ``page``) or nested lists holding the children of
            the closest preceding entry.
        root: Parent outline item that the entries of this level attach to.
        writer: Object whose ``add_outline_item(title, page_number, parent)``
            creates a bookmark and returns the new item.
        reader: Object whose ``get_page_number(page)`` resolves an entry's
            page to its index. When omitted, the module-level ``pdf_in``
            set by the script entry point is used, as before.
    """
    if reader is None:
        reader = pdf_in
    # Children attach to the most recent entry of this level. A nested list
    # that has no preceding entry attaches to `root` instead of being
    # silently dropped.
    parent = root
    for entry in outline:
        if isinstance(entry, list):
            set_outline(entry, parent, writer, reader)
        else:
            parent = writer.add_outline_item(
                entry.title, reader.get_page_number(entry.page), root
            )
if __name__ == "__main__":
    # pdf_in is deliberately a module-level name: set_outline looks it up as
    # a global to resolve page numbers.
    pdf_in = PyPDF2.PdfReader("./test.pdf")
    outline = pdf_in.outline
    new_outline = to_new_outline(outline, end=len(outline))
    print_outLine(new_outline, 0)

    # Build the output PDF.
    writer = PyPDF2.PdfWriter()
    # Copy the pages of the source document.
    for page in pdf_in.pages:
        writer.add_page(page)
    # Root outline item that the top-level bookmarks attach to.
    root = writer.get_outline_root()
    set_outline(new_outline, root, writer)
    # Write the file; the with-statement closes it, so no explicit close().
    with open("./new.pdf", "wb") as f:
        writer.write(f)

被折叠的 条评论
为什么被折叠?



