pdf去水印和切割

weixin_42487686

已于 2022-11-02 12:01:16 修改

阅读量183

点赞数

文章标签： pdf python 算法

于 2022-11-02 11:58:55 首次发布

本文链接：https://blog.csdn.net/weixin_42487686/article/details/127649219

版权

from PIL import Image
import os
import io
import fitz
import time


def single_pdf_clearwater(pdf_path: str, definition: int = 6, threshold: int = 590) -> None:
    """Remove a light-coloured watermark from every page of a PDF.

    Each page is rendered to an RGB bitmap, every pixel whose channel sum
    exceeds ``threshold`` is painted pure white, and the cleaned bitmaps are
    collected into a new image-only PDF that is saved as
    ``output/去水印pdf_<original file name>`` (the ``output`` directory is
    created when missing).

    Args:
        pdf_path: Path of the PDF to process.
        definition: Zoom factor used on both axes when rendering a page.
            Larger values give a sharper result and a larger output file.
        threshold: A pixel is whitened when ``R + G + B`` is greater than
            this value. Adjust it to the colour of the watermark.
    """
    white = (255, 255, 255)
    matrix = fitz.Matrix(definition, definition)  # identical for every page
    pdf = fitz.open(pdf_path)  # source document
    pdf_img = fitz.open()      # empty document that collects the cleaned pages
    try:
        for page_inf in pdf:
            png_bytes = page_inf.get_pixmap(matrix=matrix).tobytes()
            # Normalise to RGB so the raw buffer below always has 3 bytes per pixel.
            img = Image.open(io.BytesIO(png_bytes)).convert("RGB")
            width, height = img.size
            pixels = img.load()  # direct pixel access instead of getpixel/putpixel
            for i in range(width):
                for j in range(height):
                    if sum(pixels[i, j]) > threshold:
                        pixels[i, j] = white
            # Rebuild a pixmap from the raw RGB samples (no alpha channel).
            pix = fitz.Pixmap(fitz.csRGB, width, height, img.tobytes(), False)
            page_doc = fitz.open("png", pix.tobytes())
            try:
                pdf_bytes = page_doc.convert_to_pdf()
            finally:
                page_doc.close()
            single_page = fitz.open("pdf", pdf_bytes)
            try:
                pdf_img.insert_pdf(single_page)
            finally:
                single_page.close()
        os.makedirs("output", exist_ok=True)  # processed PDFs are stored here
        pdf_img.save("output/去水印pdf_" + os.path.basename(pdf_path))
    finally:
        # Release both documents even when rendering or saving fails.
        pdf_img.close()
        pdf.close()


def group_pdf_clearwater(path_array):
    """Strip the watermark from every PDF path in ``path_array``, in order.

    A banner is printed first, then one progress line per file before it is
    handed to ``single_pdf_clearwater``, and a completion message at the end.
    """
    print("************去水印时间比较久***********")
    for current_pdf in path_array:
        # Announce the file before the (slow) per-page processing starts.
        print(current_pdf, "去水印中...")
        single_pdf_clearwater(current_pdf)
    print("完成")


def folder_pdf_files(file_path: str):
    """Return the list of PDF paths designated by ``file_path``.

    When ``file_path`` is a directory, the result holds the paths of the
    regular files directly inside it whose name ends in ``.pdf`` (case
    insensitive), sorted by file name. For any other value the path itself
    is returned as the only element, so a single PDF file name keeps working
    as before.

    Args:
        file_path: A directory containing PDFs, or the path of one PDF.

    Returns:
        A list of path strings; the number of entries is also printed.
    """
    if os.path.isdir(file_path):
        file_list = [
            os.path.join(file_path, name)
            for name in sorted(os.listdir(file_path))
            if name.lower().endswith(".pdf")
            and os.path.isfile(os.path.join(file_path, name))
        ]
    else:
        file_list = [file_path]
    print(file_path, ": 有", len(file_list), "个pdf文件")
    return file_list


if __name__ == '__main__':
    # Script entry point: clean the configured PDF(s) and report the
    # wall-clock time the whole run took.
    time_start = time.time()

    path_list = folder_pdf_files("pdf文件名")
    print(path_list)
    group_pdf_clearwater(path_list)

    time_end = time.time()
    elapsed = round(time_end - time_start, 2)
    print("程序运行时间：", elapsed, "秒")

from PyPDF2 import PdfFileReader, PdfFileWriter


# PDF文件分割
def split_pdf(start_page, end_page):
    """Copy a range of pages from a PDF into a new file.

    The source file name is read interactively from standard input. The
    pages are written to ``<name>_<start_page>-<end_page><extension>`` next
    to the source file. Any error (missing file, page index out of range,
    unreadable PDF, ...) is printed rather than raised.

    Args:
        start_page: Index of the first page to copy. It is passed to
            ``getPage`` unchanged, which PyPDF2 treats as zero-based.
        end_page: Index at which copying stops; this page itself is NOT
            copied because the indices come from
            ``range(start_page, end_page)``.
    """
    try:
        read_file = input("请输入要拆分的PDF名字(例如test.pdf):")
        # The reader pulls page data lazily, so the source must stay open
        # until the output has been written; ``with`` then closes it.
        with open(read_file, 'rb') as fp_read_file:
            pdf_input = PdfFileReader(fp_read_file)  # parse the source PDF
            page_count = pdf_input.getNumPages()  # total number of pages
            print("该文件共有{}页".format(page_count))
            # splitext copes with dots in directories or the base name and
            # with names that have no extension at all.
            name, extension = os.path.splitext(read_file)
            pdf_file = name + "_" + str(start_page) + '-' + str(end_page) + extension
            print(f'开始分割{start_page}页-{end_page}页，保存为{pdf_file}......')
            pdf_output = PdfFileWriter()  # collects the selected pages
            for i in range(start_page, end_page):
                pdf_output.addPage(pdf_input.getPage(i))
            # Only create the output once every requested page was found.
            with open(pdf_file, 'wb') as sub_fp:
                pdf_output.write(sub_fp)
    except Exception as e:
        # Best-effort command-line helper: report the problem and return.
        print(e)



if __name__ == '__main__':
    # Script entry point: extract the hard-coded page range 12-17 from the
    # PDF whose name is asked for on the console.
    split_pdf(start_page=12, end_page=17)