pdf文件去除无用内容和链接_链接取消无内容-CSDN博客

本文链接：https://blog.csdn.net/m0_43451420/article/details/114405305

1. 安装第三方库（下文代码使用 PyPDF2 1.x 的接口，如 PyPDF2.pdf、PyPDF2.utils，较新的版本中这些模块已不存在，因此需要限定版本）

pip install "pypdf2<2.0"

2,配置删除的链接

# URL fragments: a link annotation is removed when any entry is a substring
# of its /URI value.
# NOTE: the empty string is a substring of every URI, so the placeholder ['']
# removes every URI link; replace it with the real fragments to delete.
IPO_DEL_LINK_LIST = ['']

3,删除的内容

# Text fragments (encoded as UTF-8 before matching): a content stream inside a
# page's /Contents array is removed when its data contains any of them.
# NOTE: the placeholder [''] matches every stream; replace it with real text.
IPO_DEL_CONTENT_LIST = ['']
# Marker text: a page whose content stream data contains it is left out of the
# output file entirely.
# NOTE: the placeholder ' ' matches any stream that contains a space byte.
IPO_DEL_PAGE_CONTENT = ' '

4,调用方法

# input_pdf is opened for reading and output_pdf is written by run_task();
# delete_old=True removes the input file once the output has been written.
pdf = PdfFile(input_pdf, output_pdf, link_list=IPO_DEL_LINK_LIST, content_list=IPO_DEL_CONTENT_LIST,
                          delete_page_content=IPO_DEL_PAGE_CONTENT, delete_old=True)
pdf.run_task()

PdfFile 类与 PdfPage 类的完整代码：

"""
pdf 整理类
"""
from PyPDF2 import PdfFileReader, PdfFileWriter
import os

from PyPDF2.generic import ByteStringObject, NameObject, ArrayObject, IndirectObject, EncodedStreamObject, \
    TextStringObject, NullObject
from PyPDF2.pdf import ContentStream
from PyPDF2.utils import b_


class PdfFile(object):
    """
    Clean-up job for a single PDF file.

    Reads ``input_pdf``, optionally drops whole pages, link annotations,
    content streams and watermark text, and writes the result to ``out_pdf``.

    :param input_pdf: path of the PDF to read (opened immediately).
    :param out_pdf: path the cleaned PDF is written to by :meth:`run_task`.
    :param delete_page_index_list: zero-based page indexes to leave out.
    :param link_list: URL fragments; matching link annotations are removed.
    :param content_list: text fragments; matching content streams are removed.
        Only applied when ``delete_page_content`` is also given.
    :param delete_page_content: marker text; pages containing it are left out.
    :param delete_old: remove ``input_pdf`` after the output has been written.
    :param watermarks: watermark strings whose ``Tj`` text is blanked.
    """

    def __init__(self, input_pdf, out_pdf, delete_page_index_list=None, link_list=None, content_list=None,
                 delete_page_content=None, delete_old=False, watermarks=None):
        self.input_pdf = input_pdf
        self.out_pdf = out_pdf
        self.delete_page_index_list = delete_page_index_list
        self.link_list = link_list
        self.content_list = content_list
        self.delete_page_content = delete_page_content
        self.delete_old = delete_old
        self.watermarks = watermarks
        self.writer = PdfFileWriter()

        self.input_file = open(self.input_pdf, 'rb')
        try:
            self.reader = PdfFileReader(self.input_file)
            self.pages_num = self.reader.getNumPages()
        except Exception:
            # Do not leak the handle when the file is not a readable PDF.
            self.input_file.close()
            raise

    def run_task(self):
        """
        Process every page, write the output file and close the input file.

        The input file is closed even when processing or writing fails. The
        input is only removed (``delete_old``) after a successful write.
        """
        skipped_pages = set(self.delete_page_index_list or ())

        try:
            for index in range(self.pages_num):
                if index in skipped_pages:
                    continue
                page = PdfPage(self.reader, index)
                drop_page = False
                if self.link_list is not None:
                    page.delete_link(self.link_list)
                if self.content_list is not None and self.delete_page_content is not None:
                    drop_page = page.delete_content(self.content_list, self.delete_page_content)
                if self.watermarks:
                    page.delete_watermark(self.watermarks)
                if not drop_page:
                    self.writer.addPage(page.page)

            # The reader must stay open while writing: the writer pulls the
            # page objects from it at this point.
            with open(self.out_pdf, "wb") as out_file:
                self.writer.write(out_file)
        finally:
            self.input_file.close()

        if self.delete_old:
            same_file = os.path.abspath(self.input_pdf) == os.path.abspath(self.out_pdf)
            # Never delete the file that was just written.
            if not same_file:
                os.remove(self.input_pdf)


class PdfPage(object):
    """
    One page of a PDF opened with a PyPDF2 reader.

    :param pdf_reader: reader the page is taken from.
    :param page_index: zero-based index of the page.
    :param delete_content_page: stored on the instance; not used by the
        methods of this class.
    """

    def __init__(self, pdf_reader, page_index, delete_content_page=None):
        self.reader = pdf_reader
        self.page = self.reader.getPage(page_index)
        self.page_obj = self.page.getObject()
        self.delete_content_page = delete_content_page

    def delete_link(self, link_list):
        """
        Remove link annotations whose URI contains any entry of ``link_list``.

        :param link_list: URL fragments to look for.
        :return: None
        """
        annots = self.page_obj.get('/Annots')
        # /Annots may be stored inline or behind an indirect reference.
        if isinstance(annots, IndirectObject):
            annots = self.reader.getObject(annots)
        if isinstance(annots, ArrayObject):
            self.remove_annots(annots, link_list)

    def remove_annots(self, annots: ArrayObject, link_list: list):
        """
        Remove matching link annotations from ``annots`` in place.

        :param annots: the page's annotation array.
        :param link_list: URL fragments; an annotation is removed when its
            ``/A`` action has a text ``/URI`` containing any of them.
        :return: None
        """
        # Iterate over a copy because entries are removed from ``annots``.
        for link_info in list(annots):
            # getObject() resolves indirect references and returns direct
            # objects unchanged, so both kinds of array entries are handled.
            obj_info = link_info.getObject()
            if isinstance(obj_info, NullObject) or "/A" not in obj_info:
                continue
            remove_url = obj_info['/A'].get('/URI')
            if isinstance(remove_url, IndirectObject):
                remove_url = remove_url.getObject()
            if not isinstance(remove_url, TextStringObject):
                continue
            # Remove at most once, even when several fragments match; a
            # second remove() of the same entry would raise ValueError.
            if any(link in remove_url for link in link_list):
                annots.remove(link_info)

    def delete_content(self, content_list, delete_page_content):
        """
        Remove matching content streams and report whether to drop the page.

        When ``/Contents`` is an array, streams containing an entry of
        ``content_list`` are removed from it. When it is a single stream,
        nothing is removed and only the page marker is checked.

        :param content_list: text fragments identifying streams to remove.
        :param delete_page_content: marker text identifying pages to drop.
        :return: True when the page contains ``delete_page_content`` and
            should be dropped, False otherwise.
        """
        contents = self.page_obj.get('/Contents')
        if isinstance(contents, IndirectObject):
            contents = self.reader.getObject(contents)
        if isinstance(contents, ArrayObject):
            return self.remove_content(contents, content_list, delete_page_content, False)
        # Any stream exposing getData() is checked, matching the test used by
        # remove_content(); this includes uncompressed streams.
        if hasattr(contents, 'getData'):
            return bytes(delete_page_content, encoding="utf-8") in contents.getData()
        return False

    def remove_content(self, content, content_list, delete_page_content, flag):
        """
        Remove matching streams from a ``/Contents`` array in place.

        :param content: array of content streams (or references to them).
        :param content_list: text fragments; a stream containing any of them
            (UTF-8 encoded) is removed from ``content``.
        :param delete_page_content: marker text searched in every stream.
        :param flag: value returned when no stream contains the marker.
        :return: True when some stream contains ``delete_page_content``,
            otherwise ``flag``.
        """
        page_marker = bytes(delete_page_content, encoding="utf-8")
        needles = [bytes(item, encoding="utf-8") for item in content_list]

        # Iterate over a copy because entries are removed from ``content``.
        for content_info in list(content):
            content_item = content_info.getObject()
            if not hasattr(content_item, 'getData'):
                continue
            data = content_item.getData()
            # The marker is checked independently of content_list, so it is
            # still honoured when content_list is empty.
            if page_marker in data:
                flag = True
            if any(needle in data for needle in needles) and content_info in content:
                content.remove(content_info)
        return flag

    def delete_watermark(self, watermarks: list):
        """
        Blank the text of ``Tj`` operators that equals one of ``watermarks``.

        The page's ``/Contents`` entry is replaced by the parsed content
        stream, whether or not any text was blanked.

        :param watermarks: watermark strings to look for.
        :return: None
        """
        if not self.page.get("/Contents"):
            return
        content_object = self.page["/Contents"].getObject()
        content = ContentStream(content_object, self.reader)
        for operands, operator in content.operations:
            if operator != b_("Tj"):
                continue
            text = operands[0]
            if not isinstance(text, ByteStringObject):
                continue
            for watermark in watermarks:
                if text == b_(watermark):
                    operands[0] = ByteStringObject()
                    break
        self.page[NameObject('/Contents')] = content