1. 安装第三方库
pip install pypdf2
2. 配置要删除的链接(链接的 URI 中包含列表内任一字符串时,该链接会被删除)
IPO_DEL_LINK_LIST = ['']
3. 配置要删除的内容(IPO_DEL_CONTENT_LIST:包含其中任一字符串的内容流会被移除;IPO_DEL_PAGE_CONTENT:内容流中包含该字符串的页面会被整页删除)
IPO_DEL_CONTENT_LIST = ['']
IPO_DEL_PAGE_CONTENT = ' '
4. 调用方法
pdf = PdfFile(input_pdf, output_pdf, link_list=IPO_DEL_LINK_LIST, content_list=IPO_DEL_CONTENT_LIST,
delete_page_content=IPO_DEL_PAGE_CONTENT, delete_old=True)
pdf.run_task()
PdfFile 类的实现如下:
"""
pdf 整理类
"""
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
from PyPDF2.generic import ByteStringObject, NameObject, ArrayObject, IndirectObject, EncodedStreamObject, \
TextStringObject, NullObject
from PyPDF2.pdf import ContentStream
from PyPDF2.utils import b_
class PdfFile(object):
    """
    Rewrite a PDF file while dropping unwanted pages, links, content and watermarks.

    The input file is opened by the constructor, handed to ``PdfFileReader``
    and closed again by :meth:`run_task`.
    """

    def __init__(self, input_pdf, out_pdf, delete_page_index_list=None, link_list=None, content_list=None,
                 delete_page_content=None, delete_old=False, watermarks=None):
        """
        :param input_pdf: path of the PDF to read
        :param out_pdf: path the rewritten PDF is written to
        :param delete_page_index_list: indexes (as produced by ``range(page count)``)
            of pages to leave out, or None to keep all pages
        :param link_list: strings passed to ``PdfPage.delete_link``, or None to skip link removal
        :param content_list: strings passed to ``PdfPage.delete_content``; only used
            when ``delete_page_content`` is given as well
        :param delete_page_content: marker string passed to ``PdfPage.delete_content``;
            a page for which that call returns True is left out of the output
        :param delete_old: remove ``input_pdf`` after the output has been written
        :param watermarks: strings passed to ``PdfPage.delete_watermark``; skipped when empty or None
        """
        self.input_pdf = input_pdf
        self.input_file = open(self.input_pdf, 'rb')
        try:
            self.reader = PdfFileReader(self.input_file)
            self.pages_num = self.reader.getNumPages()
        except Exception:
            # Do not leak the handle when the file cannot be parsed.
            self.input_file.close()
            raise
        self.writer = PdfFileWriter()
        self.out_pdf = out_pdf
        self.delete_page_index_list = delete_page_index_list
        self.link_list = link_list
        self.content_list = content_list
        self.delete_page_content = delete_page_content
        self.delete_old = delete_old
        self.watermarks = watermarks

    def run_task(self):
        """
        Process every page, write the result to ``out_pdf`` and close the input file.

        The input file is closed even when processing or writing fails. When
        ``delete_old`` is set the input file is removed afterwards, unless it is
        the very file that was just written.
        """
        skipped_pages = set()
        if self.delete_page_index_list is not None:
            skipped_pages = set(self.delete_page_index_list)
        check_content = self.content_list is not None and self.delete_page_content is not None

        try:
            for index in range(self.pages_num):
                if index in skipped_pages:
                    continue
                page = PdfPage(self.reader, index)
                if self.link_list is not None:
                    page.delete_link(self.link_list)
                if check_content and page.delete_content(self.content_list, self.delete_page_content):
                    # The page is dropped, so cleaning its watermark would be wasted work.
                    continue
                if self.watermarks:
                    page.delete_watermark(self.watermarks)
                self.writer.addPage(page.page)

            with open(self.out_pdf, "wb") as out_file:
                self.writer.write(out_file)
        finally:
            self.input_file.close()

        if self.delete_old:
            # Never delete the output we have just produced.
            if os.path.realpath(self.input_pdf) != os.path.realpath(self.out_pdf):
                os.remove(self.input_pdf)
class PdfPage(object):
    """
    A single page loaded from a ``PdfFileReader``.

    The methods edit the page's objects in place; the page itself is
    available as ``self.page``.
    """

    def __init__(self, pdf_reader, page_index, delete_content_page=None):
        """
        :param pdf_reader: reader the page is loaded from
        :param page_index: index handed to ``pdf_reader.getPage``
        :param delete_content_page: stored on the instance; not read by any method of this class
        """
        self.reader = pdf_reader
        self.page = self.reader.getPage(page_index)
        self.page_obj = self.page.getObject()
        self.delete_content_page = delete_content_page

    def delete_link(self, link_list):
        """
        Remove link annotations whose URI contains one of ``link_list``.

        :param link_list: strings to look for inside each annotation's ``/URI``
        :return: None
        """
        annots = self.page_obj.get('/Annots')
        if isinstance(annots, IndirectObject):
            annots = self.reader.getObject(annots)
        if isinstance(annots, ArrayObject):
            self.remove_annots(annots, link_list)

    def remove_annots(self, annots: ArrayObject, link_list: list):
        """
        Remove from ``annots`` every annotation whose action URI contains a listed string.

        Each annotation is removed at most once, even when several entries of
        ``link_list`` match its URI.

        :param annots: annotation array of the page; modified in place
        :param link_list: strings to look for inside each annotation's ``/URI``
        :return: None
        """
        # Iterate over a copy because matching entries are removed from ``annots``.
        for link_info in list(annots):
            if isinstance(link_info, IndirectObject):
                obj_info = self.reader.getObject(link_info)
            else:
                obj_info = link_info
            if isinstance(obj_info, NullObject) or "/A" not in obj_info:
                continue

            remove_url = obj_info['/A'].get('/URI', '')
            if isinstance(remove_url, IndirectObject):
                remove_url = remove_url.getObject()
            if not isinstance(remove_url, TextStringObject):
                # Missing URI or a non-text value: a substring test is not possible.
                continue

            if any(link in remove_url for link in link_list):
                annots.remove(link_info)

    def delete_content(self, content_list, delete_page_content):
        """
        Remove unwanted content streams and report whether the page should be dropped.

        When ``/Contents`` is an array, streams containing any string of
        ``content_list`` are removed from it. When ``/Contents`` is a single
        stream nothing is removed; only the marker is looked for.

        :param content_list: strings (encoded as UTF-8) that mark a content stream for removal
        :param delete_page_content: string (encoded as UTF-8) that marks the whole page for removal
        :return: True when a content stream contains ``delete_page_content``, otherwise False
        """
        contents = self.page_obj.get('/Contents')
        if isinstance(contents, IndirectObject):
            contents = self.reader.getObject(contents)
        if isinstance(contents, ArrayObject):
            return self.remove_content(contents, content_list, delete_page_content, False)
        # Same duck-typed stream test as remove_content, so that every stream
        # object exposing getData is handled, not only EncodedStreamObject.
        if hasattr(contents, 'getData'):
            return bytes(delete_page_content, encoding="utf-8") in contents.getData()
        return False

    def remove_content(self, content, content_list, delete_page_content, flag):
        """
        Remove matching streams from a ``/Contents`` array.

        :param content: array of content stream references; modified in place
        :param content_list: strings (encoded as UTF-8) that mark a stream for removal
        :param delete_page_content: string (encoded as UTF-8) that marks the whole page for removal
        :param flag: incoming value of the "drop this page" flag
        :return: True when any stream contains ``delete_page_content``, otherwise ``flag``
        """
        page_marker = bytes(delete_page_content, encoding="utf-8")
        unwanted = [bytes(item, encoding="utf-8") for item in content_list]

        # Iterate over a copy because matching entries are removed from ``content``.
        for content_info in list(content):
            if not isinstance(content_info, IndirectObject):
                # Only indirect references can be resolved through the reader.
                continue
            content_item = self.reader.getObject(content_info)
            if not hasattr(content_item, 'getData'):
                continue
            data = content_item.getData()
            if any(item in data for item in unwanted):
                content.remove(content_info)
            if page_marker in data:
                flag = True
        return flag

    def delete_watermark(self, watermarks: list):
        """
        Blank out ``Tj`` text operands that are exactly equal to a watermark string.

        The page's ``/Contents`` entry is replaced by the parsed
        ``ContentStream`` whether or not anything matched.

        :param watermarks: watermark texts; each is converted with ``b_`` before comparison
        :return: None
        """
        if not self.page.get("/Contents"):
            return

        targets = [b_(watermark) for watermark in watermarks]
        show_text = b_("Tj")
        content_object = self.page["/Contents"].getObject()
        content = ContentStream(content_object, self.reader)
        for operands, operator in content.operations:
            if operator != show_text:
                continue
            text = operands[0]
            # NOTE(review): only ByteStringObject operands are compared; operands of
            # any other string type are left untouched - confirm this is intended.
            if isinstance(text, ByteStringObject) and any(text == target for target in targets):
                operands[0] = ByteStringObject()
        self.page[NameObject('/Contents')] = content