环境
python == 3.10
PyPDF2 ==3.0.1
安装
pip install PyPDF2
流程
- 将空白页和内容页读取出来,看看内部结构有什么不同
- 以此为依据,遍历整个 PDF 文件,标记出有内容的页面,写入到新的 PDF 中(注意:下面的代码会把结果直接写回原文件,覆盖原 PDF,请提前备份)。
python 代码
# 每一页都是一个字典对象,可以通过比较空白页与内容页的 keys 来区分二者。
# 参考文章中:两种页面的第一层 keys 相同,但 /Resources 下结构不同,空白页没有 "/XObject" 键。
# 我的文件中:两种页面的第一层 keys 不同,而 /Resources 下结构相同,因此改用第一层 keys 来判断。
# 另外,PyPDF2 不同版本的模块和接口有变动,如遇报错请根据报错提示或阅读源码自行调整。
from PyPDF2 import PdfReader, PdfWriter
def remove_pdf_blank_pages(path):
    """Remove blank pages from the PDF at *path*, rewriting the file in place.

    A page is kept when its page dictionary contains at least one of the
    keys "/StructParents", "/Tabs" or "/Annots"; every other page is treated
    as blank and dropped.  This heuristic comes from inspecting one sample
    document, where a blank page had the keys
    ['/Type', '/Parent', '/Resources', '/MediaBox', '/Contents'] and a page
    with content additionally had '/Annots', '/Tabs' and '/StructParents'.
    In that document the '/Resources' entries of both kinds of page were
    identical (both had '/Font', '/XObject' and '/ProcSet'), so a check on
    '/XObject' or '/Font' under '/Resources' could not tell them apart.
    Other PDFs may need a different rule.

    Args:
        path: Path of the PDF file to clean.  The file is overwritten with
            the filtered result.

    Returns:
        None.
    """
    import io

    # Page-level keys that only showed up on pages with content.
    content_markers = ("/StructParents", "/Tabs", "/Annots")

    writer = PdfWriter()
    buffer = io.BytesIO()

    with open(path, "rb") as source:
        reader = PdfReader(source)
        for page in reader.pages:
            page_keys = page.keys()
            if any(marker in page_keys for marker in content_markers):
                writer.add_page(page)
        # Serialize while the source is still open and intact: the output
        # goes to memory first so the input is never truncated while the
        # reader may still need to load objects from it.
        writer.write(buffer)

    # The source handle is closed now, so overwriting the file is safe, and
    # the context manager guarantees the result is flushed and closed.
    with open(path, "wb") as target:
        target.write(buffer.getvalue())