Python 将PDF导出为图片

目录

前言

主要功能

核心代码分析

完整代码

总结

前言

因日常需要将PDF转化为图片并插入到文档中,于是我借助网上的AI工具实现了将PDF转化为图片的功能。

主要功能

1.提取嵌入PDF中的图片。

2.将PDF的每一页或指定页转化为图片。

可以同时拖入多个PDF文件到文件路径输入框中。

保存规则:默认先在PDF文件所在的文件夹中创建一个与PDF文件名相同的文件夹,输出图片保存到该文件夹中。

核心代码分析

一个多线程函数,防止在处理PDF时窗口卡死

def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread.

    Used for button callbacks so that long PDF processing does not block the
    Tk event loop.

    :param func: callable to execute in the background
    :param args: positional arguments forwarded to ``func``
    :return: the started ``threading.Thread``
    """
    # The daemon flag must be set through the constructor (or the ``daemon``
    # attribute); assigning to ``setDaemon`` only replaces that method with
    # ``True`` and leaves the thread non-daemon.
    worker = threading.Thread(target=func, args=args, daemon=True)
    worker.start()
    return worker
# Excerpt from the panel's __init__: each button passes its handler to
# thread_it, so the handler runs on a background thread rather than inside
# the Tk button callback itself.
bt4 = tk.Button(frame4, text='提取pdf中的图片', command=lambda: thread_it(self.start))
bt4.grid(row=0, column=2, sticky='nsew')
bt5 = tk.Button(frame4, text='将pdf转换为图片', command=lambda: thread_it(self.pdf_to_image))
bt5.grid(row=0, column=3, sticky='nsew')

程序界面,需要传入一个主窗口

    def __init__(self, master):
        """Build the tool's widgets inside ``master``.

        :param master: Tk container (the main window) that receives the four
            stacked frames: paths, actions, export settings and log output.
        """
        self.master = master
        self.files = []

        # --- Frame 1: source file entry (drop target) and save directory ---
        path_frame = tk.Frame(master, relief='raised', borderwidth=1)
        tk.Label(path_frame, text='文件路径').grid(row=0, column=0, sticky='nsew')
        self.show_path = tk.Entry(path_frame)
        self.show_path.grid(row=0, column=1, sticky='nsew')
        # Files dropped on the entry are delivered to get_paths.
        windnd.hook_dropfiles(self.show_path, self.get_paths, force_unicode=True)
        tk.Button(path_frame, text='选择文件',
                  command=self.choose_files).grid(row=0, column=2, sticky='nsew')
        tk.Label(path_frame, text='保存路径').grid(row=1, column=0, sticky='nsew')
        self.save_path = tk.Entry(path_frame)
        self.save_path.grid(row=1, column=1, sticky='nsew')
        tk.Button(path_frame, text='选择另存路径',
                  command=self.choose_savepath).grid(row=1, column=2, sticky='nsew')

        # --- Frame 4: page range entry and the two action buttons ---
        action_frame = tk.Frame(master, relief='raised', borderwidth=1)
        for column, weight in enumerate((0, 0, 1, 1)):
            action_frame.columnconfigure(column, weight=weight)
        action_frame.rowconfigure(0, weight=0)

        tk.Label(action_frame, text='提取范围').grid(row=0, column=0, sticky='nsew')

        self.page_input = tk.Entry(action_frame, relief='raised', borderwidth=2)
        self.page_input.grid(row=0, column=1, sticky='nsew')

        # Both actions go through thread_it so they run off the Tk callback.
        extract_button = tk.Button(action_frame, text='提取pdf中的图片',
                                   command=lambda: thread_it(self.start))
        extract_button.grid(row=0, column=2, sticky='nsew')
        convert_button = tk.Button(action_frame, text='将pdf转换为图片',
                                   command=lambda: thread_it(self.pdf_to_image))
        convert_button.grid(row=0, column=3, sticky='nsew')

        # --- Frame 2: export settings (resolution and zoom sliders) ---
        settings_frame = tk.Frame(master)
        for column, weight in enumerate((0, 1, 1)):
            settings_frame.columnconfigure(column, weight=weight)
        settings_frame.rowconfigure(0, weight=0)

        tk.Label(settings_frame, text='导出图片配置:').grid(row=0, column=0, sticky='nsew')
        self.resolution_value = tk.IntVar()
        self.zoom_factor_value = tk.IntVar()

        resolution_scale = tk.Scale(settings_frame, from_=200, to=800, resolution=200,
                                    orient='horizontal', label='分辨率',
                                    variable=self.resolution_value)
        resolution_scale.set(400)  # default resolution
        resolution_scale.grid(row=0, column=1, sticky='nsew')

        zoom_scale = tk.Scale(settings_frame, from_=1, to=4, resolution=1,
                              orient='horizontal', label='缩放因子',
                              variable=self.zoom_factor_value)
        zoom_scale.set(2)  # default zoom factor
        zoom_scale.grid(row=0, column=2, sticky='nsew')

        # --- Frame 3: scrolling log output ---
        log_frame = tk.Frame(master, relief='raised', borderwidth=1)
        self.out = tk.scrolledtext.ScrolledText(log_frame)
        self.out.pack(fill=tk.BOTH, expand=True)

        # Grid weights: only the entry column and the log row stretch.
        for row in (0, 1):
            path_frame.rowconfigure(row, weight=0)
        path_frame.columnconfigure(1, weight=1)
        log_frame.rowconfigure(0, weight=1)
        log_frame.columnconfigure(0, weight=1)
        stacked_frames = (path_frame, action_frame, settings_frame, log_frame)
        for row, frame in enumerate(stacked_frames):
            frame.grid(row=row, column=0, sticky='nsew')

        master.columnconfigure(0, weight=1)
        for row, weight in enumerate((0, 0, 0, 1)):
            master.rowconfigure(row, weight=weight)

        # Gray placeholder texts plus the handlers that manage them.
        self.save_path.insert(0, "默认保存在原文件夹")
        self.save_path.config(fg='gray')
        self.save_path.bind('<FocusIn>', self.on_entry_click)
        self.page_input.insert(0, "默认提取全部")
        self.page_input.config(fg='gray')
        self.page_input.bind('<FocusIn>', self.page_input_on_entry_click)
        self.page_input.bind('<FocusOut>', self.page_input_on_focus_out)

这是将页面数据转化为图片的核心函数

    def to_be_img(self, pdf_page_number, input_pdf_page, img_path, resolution, zoom_factor):
        """Render one PDF page and save it as a PNG file.

        :param pdf_page_number: page number shown in the log message
        :param input_pdf_page: loaded page object to render
        :param img_path: destination path of the PNG file
        :param resolution: DPI value; scales the image by ``resolution / 72`` and
            is stored as the PNG's dpi metadata
        :param zoom_factor: scale applied when rasterising the page
        :return: None
        """
        zoom_matrix = fitz.Matrix(zoom_factor, zoom_factor)  # rasterise at the zoomed size
        pix = input_pdf_page.get_pixmap(matrix=zoom_matrix)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        target_size = (int(pix.width * resolution / 72), int(pix.height * resolution / 72))
        img = img.resize(target_size, resample=Image.LANCZOS)  # scale up to the requested resolution
        img.save(img_path, "PNG", dpi=(resolution, resolution))
        # Append at the end of the log (not at the user's cursor) and scroll
        # afterwards so the line just written is the one brought into view.
        self.out.insert(tk.END, f'已处理第{pdf_page_number}页。\n')
        self.out.see(tk.END)

 有三种选择:提取全部;提取某一页;提取某一范围

    def extract_images_from_pdf(self, pdf_path, output_folder, pages):
        """Save the images embedded in the selected pages of one PDF as PNG files.

        :param pdf_path: path of the PDF to read
        :param output_folder: existing folder that receives ``page<N>_image<M>.png``
        :param pages: dict with 1-based ``'start'`` and ``'end'``; an ``end`` of
            ``None`` selects every page, ``0`` selects only page ``start`` and any
            other value selects the inclusive range ``start``..``end``
        :return: None
        """
        resolution = self.resolution_value.get()
        zoom_factor = self.zoom_factor_value.get()

        def log(text):
            # Append at the end of the log, then scroll so the new line is visible.
            self.out.insert(tk.END, text)
            self.out.see(tk.END)

        doc = fitz.open(pdf_path)
        try:
            # Translate the three selection modes into one half-open index range.
            if pages['end'] is None:
                first, stop = 0, doc.page_count
            elif pages['end'] == 0:
                first, stop = pages['start'] - 1, pages['start']
            else:
                first, stop = pages['start'] - 1, pages['end']
            # Clamp to pages that exist so an out-of-range request is reported
            # in the log instead of failing inside the worker thread.
            first = max(first, 0)
            stop = min(stop, doc.page_count)
            if first >= stop:
                log(f'页码超出范围,该PDF共{doc.page_count}页\n')

            for page_number in range(first, stop):
                image_list = doc.load_page(page_number).get_images(full=True)
                error_nb = 0
                for img_index, img_info in enumerate(image_list, start=1):
                    # Best effort per image: one unreadable image must not stop
                    # the rest, so any failure is counted and logged.
                    try:
                        base_image = doc.extract_image(img_info[0])  # img_info[0] is the xref
                        image = Image.open(io.BytesIO(base_image["image"]))
                        if image.mode == "CMYK":
                            # Pillow's PNG writer cannot store CMYK data.
                            image = image.convert("RGB")
                        new_size = (int(image.width * zoom_factor), int(image.height * zoom_factor))
                        image = image.resize(new_size, resample=Image.LANCZOS)
                        image.save(f"{output_folder}/page{page_number + 1}_image{img_index}.png",
                                   format="PNG", dpi=(resolution, resolution))
                    except Exception as e:
                        error_nb += 1
                        log(f'出错:{e}\n')
                log(f'第{page_number + 1}页提取了{len(image_list) - error_nb}张,出错{error_nb}张\n')
        finally:
            # Release the file even when a page fails to load.
            doc.close()

 提取界面中相关参数并开始运行的函数

    def start(self):
        """Extract the embedded images of every selected PDF.

        Reads the page range from ``self.page_input``; an empty entry or one of
        the placeholder texts means "all pages", ``N`` means a single page and
        ``A-B`` an inclusive range.  Invalid input (non-numeric, page below 1,
        or ``A`` greater than ``B``) shows an error dialog and nothing is
        processed.  Each PDF in ``self.files`` gets an output folder named after
        the PDF, inside the chosen save path or next to the PDF.

        :return: None
        """
        # pages['end'] is None -> all pages; 0 -> only page 'start'.
        pages = {'start': 1, 'end': None}
        raw = self.page_input.get().strip()
        if raw not in ('', '默认提取全部', '输入提取页码,如5-12'):
            parts = raw.split('-')
            try:
                if len(parts) == 1:
                    first, last = int(parts[0]), 0
                elif len(parts) == 2:
                    first, last = int(parts[0]), int(parts[1])
                    if last < first:
                        raise ValueError(raw)
                else:
                    raise ValueError(raw)
                if first < 1:
                    raise ValueError(raw)
            except ValueError:
                messagebox.showerror(message='提取页码输入不规范,请检查,样例:2-5(提取第2页到第5页的页面)')
                return
            pages = {'start': first, 'end': last}

        for pdf_path in self.files:
            pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
            chosen = self.save_path.get()
            if chosen != "默认保存在原文件夹" and chosen != "":
                parent = chosen
            else:
                parent = os.path.dirname(pdf_path)
            output_folder = os.path.join(parent, pdf_stem)
            # makedirs also creates a save path that does not exist yet.
            os.makedirs(output_folder, exist_ok=True)
            self.out.insert(tk.END, f'正在处理{pdf_path}\n')
            self.out.insert(tk.END, f'图片保存路径为{output_folder}\n')
            self.out.see(tk.END)
            self.extract_images_from_pdf(pdf_path=pdf_path, output_folder=output_folder, pages=pages)

完整代码

from tkinter import scrolledtext
import tkinter as tk
from tkinter import filedialog
import os
import fitz
import io
import concurrent.futures
import windnd
from tkinter import messagebox
from PIL import Image
import threading
def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread.

    Used for button callbacks so that long PDF processing does not block the
    Tk event loop.

    :param func: callable to execute in the background
    :param args: positional arguments forwarded to ``func``
    :return: the started ``threading.Thread``
    """
    # The daemon flag must be set through the constructor (or the ``daemon``
    # attribute); assigning to ``setDaemon`` only replaces that method with
    # ``True`` and leaves the thread non-daemon.
    worker = threading.Thread(target=func, args=args, daemon=True)
    worker.start()
    return worker


class get_pdf_photo():
    """Tk panel that extracts embedded images from PDFs or renders PDF pages to PNG.

    For each PDF the output goes into a folder named after the PDF, created
    inside the directory given in the save-path entry or, by default, next to
    the PDF itself.
    """

    # Placeholder texts shown in the entries; never to be treated as user input.
    SAVE_PATH_HINT = "默认保存在原文件夹"
    PAGE_ALL_HINT = "默认提取全部"
    PAGE_INPUT_HINT = '输入提取页码,如5-12'
    PAGE_ERROR_MESSAGE = '提取页码输入不规范,请检查,样例:2-5(提取第2页到第5页的页面)'

    def __init__(self, master):
        """Build the widgets inside ``master``.

        :param master: Tk container (the main window) that receives the four
            stacked frames: paths, actions, export settings and log output.
        """
        self.master = master
        self.files = []

        # Frame 1: source file entry (drop target) and save directory.
        frame1 = tk.Frame(master, relief="raised", borderwidth=1)
        tk.Label(frame1, text='文件路径').grid(row=0, column=0, sticky="nsew")
        self.show_path = tk.Entry(frame1)
        self.show_path.grid(row=0, column=1, sticky="nsew")
        windnd.hook_dropfiles(self.show_path, self.get_paths, force_unicode=True)
        tk.Button(frame1, text='选择文件', command=self.choose_files).grid(row=0, column=2, sticky="nsew")
        tk.Label(frame1, text='保存路径').grid(row=1, column=0, sticky="nsew")
        self.save_path = tk.Entry(frame1)
        self.save_path.grid(row=1, column=1, sticky="nsew")
        tk.Button(frame1, text='选择另存路径', command=self.choose_savepath).grid(row=1, column=2, sticky="nsew")

        # Frame 4: page range entry and the two action buttons.
        frame4 = tk.Frame(master, relief="raised", borderwidth=1)
        for column, weight in enumerate((0, 0, 1, 1)):
            frame4.columnconfigure(column, weight=weight)
        frame4.rowconfigure(0, weight=0)
        tk.Label(frame4, text='提取范围').grid(row=0, column=0, sticky='nsew')
        self.page_input = tk.Entry(frame4, relief="raised", borderwidth=2)
        self.page_input.grid(row=0, column=1, sticky='nsew')
        # Run the work through thread_it so the window stays responsive.
        bt4 = tk.Button(frame4, text='提取pdf中的图片', command=lambda: thread_it(self.start))
        bt4.grid(row=0, column=2, sticky='nsew')
        bt5 = tk.Button(frame4, text='将pdf转换为图片', command=lambda: thread_it(self.pdf_to_image))
        bt5.grid(row=0, column=3, sticky='nsew')

        # Frame 2: export settings.
        frame2 = tk.Frame(master)
        for column, weight in enumerate((0, 1, 1)):
            frame2.columnconfigure(column, weight=weight)
        frame2.rowconfigure(0, weight=0)
        tk.Label(frame2, text='导出图片配置:').grid(row=0, column=0, sticky='nsew')
        self.resolution_value = tk.IntVar()
        self.zoom_factor_value = tk.IntVar()
        resolution_slider = tk.Scale(frame2, from_=200, to=800, resolution=200, orient='horizontal',
                                     label='分辨率', variable=self.resolution_value)
        resolution_slider.set(400)  # default resolution
        resolution_slider.grid(row=0, column=1, sticky='nsew')
        zoom_factor_slider = tk.Scale(frame2, from_=1, to=4, resolution=1, orient='horizontal',
                                      label='缩放因子', variable=self.zoom_factor_value)
        zoom_factor_slider.set(2)  # default zoom factor
        zoom_factor_slider.grid(row=0, column=2, sticky='nsew')

        # Frame 3: scrolling log output.
        frame3 = tk.Frame(master, relief="raised", borderwidth=1)
        self.out = scrolledtext.ScrolledText(frame3)
        self.out.pack(fill=tk.BOTH, expand=True)

        # Only the entry column and the log row stretch with the window.
        frame1.rowconfigure(0, weight=0)
        frame1.rowconfigure(1, weight=0)
        frame1.columnconfigure(1, weight=1)
        frame3.rowconfigure(0, weight=1)
        frame3.columnconfigure(0, weight=1)
        for row, frame in enumerate((frame1, frame4, frame2, frame3)):
            frame.grid(row=row, column=0, sticky='nsew')
        master.columnconfigure(0, weight=1)
        for row, weight in enumerate((0, 0, 0, 1)):
            master.rowconfigure(row, weight=weight)

        # Gray placeholder texts and the handlers that manage them.
        self.save_path.insert(0, self.SAVE_PATH_HINT)
        self.save_path.config(fg='gray')
        self.save_path.bind('<FocusIn>', self.on_entry_click)
        self.page_input.insert(0, self.PAGE_ALL_HINT)
        self.page_input.config(fg='gray')
        self.page_input.bind('<FocusIn>', self.page_input_on_entry_click)
        self.page_input.bind('<FocusOut>', self.page_input_on_focus_out)

    # ------------------------------------------------------------------ helpers

    def _log(self, text):
        """Append ``text`` to the end of the log and scroll it into view."""
        # Insert at END rather than at the 'insert' cursor mark, so a click in
        # the log cannot make later messages land in the middle of older ones.
        self.out.insert(tk.END, text)
        self.out.see(tk.END)

    def _show_files(self):
        """Display the currently selected PDF paths in the file entry."""
        self.show_path.delete(0, tk.END)
        self.show_path.config(fg='black')
        self.show_path.insert(0, '; '.join(self.files))

    @classmethod
    def _parse_page_range(cls, text):
        """Turn the page entry text into a ``pages`` dict, or ``None`` if invalid.

        ``{'start': 1, 'end': None}`` selects every page, ``'end': 0`` selects
        only page ``start`` and any other ``end`` the inclusive 1-based range.
        """
        text = text.strip()
        if text in ('', cls.PAGE_ALL_HINT, cls.PAGE_INPUT_HINT):
            return {'start': 1, 'end': None}
        try:
            numbers = [int(part) for part in text.split('-')]
        except ValueError:
            return None
        if len(numbers) == 1:
            first, last = numbers[0], 0
        elif len(numbers) == 2 and numbers[0] <= numbers[1]:
            first, last = numbers
        else:
            return None
        if first < 1:
            return None
        return {'start': first, 'end': last}

    @staticmethod
    def _page_indexes(pages, page_count):
        """Return the zero-based page indexes selected by ``pages``, clamped to the document."""
        if pages['end'] is None:
            return range(page_count)
        first = max(pages['start'], 1) - 1
        stop = first + 1 if pages['end'] == 0 else pages['end']
        return range(first, min(stop, page_count))

    def _requested_pages(self):
        """Read the page entry; show an error dialog and return ``None`` if it is invalid."""
        pages = self._parse_page_range(self.page_input.get())
        if pages is None:
            messagebox.showerror(message=self.PAGE_ERROR_MESSAGE)
        return pages

    def _output_folder(self, pdf_path):
        """Create (if needed) and return the folder that receives the images of ``pdf_path``."""
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        chosen = self.save_path.get()
        if chosen and chosen != self.SAVE_PATH_HINT:
            parent = chosen
        else:
            parent = os.path.dirname(pdf_path)
        folder = os.path.join(parent, stem)
        os.makedirs(folder, exist_ok=True)
        return folder

    def _extract_page_images(self, doc, page_index, output_folder, resolution, zoom_factor):
        """Save every image embedded in one page and log how many succeeded."""
        page_label = page_index + 1
        image_list = doc.load_page(page_index).get_images(full=True)
        error_nb = 0
        for img_index, img_info in enumerate(image_list, start=1):
            # Best effort per image: one unreadable image must not stop the
            # rest, so any failure is counted and logged.
            try:
                base_image = doc.extract_image(img_info[0])  # img_info[0] is the xref
                image = Image.open(io.BytesIO(base_image["image"]))
                if image.mode == "CMYK":
                    # Pillow's PNG writer cannot store CMYK data.
                    image = image.convert("RGB")
                new_size = (int(image.width * zoom_factor), int(image.height * zoom_factor))
                image = image.resize(new_size, resample=Image.LANCZOS)
                image.save(os.path.join(output_folder, f"page{page_label}_image{img_index}.png"),
                           format="PNG", dpi=(resolution, resolution))
            except Exception as e:
                error_nb += 1
                self._log(f'出错:{e}\n')
        self._log(f'第{page_label}页提取了{len(image_list) - error_nb}张,出错{error_nb}张\n')

    # ------------------------------------------------------------ entry widgets

    def page_input_on_focus_out(self, event):
        """Restore the "all pages" placeholder when the page entry is left empty."""
        if not self.page_input.get() or self.page_input.get() == self.PAGE_INPUT_HINT:
            self.page_input.delete(0, tk.END)
            self.page_input.insert(0, self.PAGE_ALL_HINT)
            self.page_input.config(fg='gray')

    def get_paths(self, paths):
        """Drop handler: keep the dropped PDF files and report everything else.

        :param paths: iterable of dropped file paths
        :return: None
        """
        self.files = []
        for path in paths:
            # Compare case-insensitively so "X.PDF" is accepted as well.
            if path.lower().endswith('.pdf'):
                self.files.append(path)
            else:
                messagebox.showinfo(title='Error', message=f'文件{path}不是PDF文件!')
        self._show_files()

    def page_input_on_key(self, event):
        """First key press after focusing: clear the hint and switch to normal text."""
        self.page_input.delete(0, tk.END)
        self.page_input.config(fg='black')
        # One-shot handler: later key presses must not clear the entry again.
        self.page_input.unbind("<Key>")

    def page_input_on_entry_click(self, event):
        """On focus, replace a placeholder with the gray input hint."""
        if self.page_input.get() in (self.PAGE_INPUT_HINT, self.PAGE_ALL_HINT):
            self.page_input.delete(0, tk.END)
            self.page_input.config(fg='gray')  # the hint itself stays gray
            self.page_input.insert(0, self.PAGE_INPUT_HINT)
            self.page_input.icursor(0)  # cursor in front of the hint
            self.page_input.bind("<Key>", self.page_input_on_key)

    def on_entry_click(self, event):
        """On focus, remove the placeholder from the save-path entry."""
        if self.save_path.get() == self.SAVE_PATH_HINT:
            self.save_path.delete(0, tk.END)
            self.save_path.config(fg='black')

    def choose_savepath(self):
        """Ask for an output directory and show it in the save-path entry."""
        path = filedialog.askdirectory()
        if path != '':
            self.save_path.delete(0, tk.END)
            self.save_path.insert('insert', path)
            self.save_path.config(fg='black')

    def choose_files(self):
        """Ask for one or more PDF files and remember them in ``self.files``."""
        # No extra tk.Tk() here: the application already has a root window, and
        # a second hidden root created on every call was never destroyed.
        selected = filedialog.askopenfilenames(filetypes=(('PDF Files', '*.pdf'),))
        if not selected:
            return  # dialog cancelled: keep the previous selection
        self.files = list(selected)
        self._show_files()

    # --------------------------------------------------------------- processing

    def to_be_img(self, pdf_page_number, input_pdf_page, img_path, resolution, zoom_factor):
        """Render one PDF page and save it as a PNG file.

        :param pdf_page_number: page number shown in the log message
        :param input_pdf_page: loaded page object to render
        :param img_path: destination path of the PNG file
        :param resolution: DPI value; scales the image by ``resolution / 72`` and
            is stored as the PNG's dpi metadata
        :param zoom_factor: scale applied when rasterising the page
        :return: None
        """
        zoom_matrix = fitz.Matrix(zoom_factor, zoom_factor)
        pix = input_pdf_page.get_pixmap(matrix=zoom_matrix)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        target_size = (int(pix.width * resolution / 72), int(pix.height * resolution / 72))
        img = img.resize(target_size, resample=Image.LANCZOS)
        img.save(img_path, "PNG", dpi=(resolution, resolution))
        self._log(f'已处理第{pdf_page_number}页。\n')

    def extract_images_from_pdf(self, pdf_path, output_folder, pages):
        """Save the images embedded in the selected pages of one PDF as PNG files.

        :param pdf_path: path of the PDF to read
        :param output_folder: existing folder that receives ``page<N>_image<M>.png``
        :param pages: dict with 1-based ``'start'`` and ``'end'``; an ``end`` of
            ``None`` selects every page, ``0`` selects only page ``start`` and any
            other value selects the inclusive range ``start``..``end``
        :return: None
        """
        resolution = self.resolution_value.get()
        zoom_factor = self.zoom_factor_value.get()
        doc = fitz.open(pdf_path)
        try:
            indexes = self._page_indexes(pages, doc.page_count)
            if not indexes:
                self._log(f'页码超出范围,该PDF共{doc.page_count}页\n')
            for page_index in indexes:
                self._extract_page_images(doc, page_index, output_folder, resolution, zoom_factor)
        finally:
            # Release the file even when a page fails to load.
            doc.close()

    def start(self):
        """Extract the embedded images of every selected PDF.

        Invalid page input shows an error dialog and nothing is processed.

        :return: None
        """
        pages = self._requested_pages()
        if pages is None:
            return
        for pdf_path in self.files:
            # Boundary of the worker thread: report a failing file in the log
            # and continue with the next one instead of dying silently.
            try:
                output_folder = self._output_folder(pdf_path)
                self._log(f'正在处理{pdf_path}\n')
                self._log(f'图片保存路径为{output_folder}\n')
                self.extract_images_from_pdf(pdf_path=pdf_path, output_folder=output_folder, pages=pages)
            except Exception as e:
                self._log(f'出错:{e}\n')

    def pdf_to_image(self):
        """Render the selected pages of every selected PDF to ``<page>.png`` files.

        Invalid page input shows an error dialog and nothing is processed.

        :return: None
        """
        pages = self._requested_pages()
        if pages is None:
            return
        resolution = self.resolution_value.get()
        zoom_factor = self.zoom_factor_value.get()
        for pdf_path in self.files:
            self._log(f'开始转换{pdf_path}\n')
            try:
                save_path = self._output_folder(pdf_path)
                mypdf = fitz.open(pdf_path)
                try:
                    indexes = self._page_indexes(pages, mypdf.page_count)
                    if not indexes:
                        self._log(f'页码超出范围,该PDF共{mypdf.page_count}页\n')
                    # Pages are rendered one after another: this method already
                    # runs off the UI thread, and PyMuPDF documents must not be
                    # used from several threads at once.
                    for page_index in indexes:
                        page_number = page_index + 1
                        img_path = os.path.join(save_path, f'{page_number}.png')
                        self.to_be_img(page_number, mypdf.load_page(page_index), img_path,
                                       resolution, zoom_factor)
                finally:
                    mypdf.close()
            except Exception as e:
                # Worker-thread boundary: show the failure and go on with the next file.
                self._log(f'出错:{e}\n')
                continue
            self._log('转换任务完成\n\n')


def main():
    """Create the main window, attach the PDF tool panel and run the Tk event loop."""
    root = tk.Tk()
    root.title('提取PDF中的图片')
    get_pdf_photo(root)
    root.mainloop()


# Only launch the GUI when executed as a script, not when imported.
if __name__ == '__main__':
    main()


总结

我想要的功能已实现,日常够用了。

  • 8
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值