扫描件PDF怎么转换成可复制的PDF

扫描件PDF基本上都是图片,无法复制其中的内容。那么怎样才能复制内容呢?有人说很多软件都可以做到,但基本上所有的软件都需要购买VIP,有没有免费的呢?
有!我发现WPS里面“转换”——“扫描件识别”,可以转换PDF,而且识别准确性非常高,但是,每个pdf最多只能识别5页,所以,我们需要将需要扫描的原始pdf每隔5页分割成多个pdf,再进行转换,之后再进行合并,就OK了。
网上搜了一下,几乎所有的pdf拆分合并的软件都是要钱的,那么我们可以通过python,自己写一个程序,进行pdf的拆分和合并,完全免费,随意使用。

下面是pdf拆分的代码,桌面程序

import tkinter as tk
from tkinter import filedialog as fd
from PyPDF2 import PdfReader, PdfWriter

def pdf_page_num(path):
    """Return the total number of pages of the PDF file at ``path``."""
    return len(PdfReader(path).pages)

def select_pdf():
    """Ask the user for the PDF to split and record the choice in the UI.

    On a successful selection the path is stored in ``pdf_in``, the hint in
    ``page_range`` is refreshed from the file's page count, and the
    "choose output folder" button is enabled. Cancelling the dialog changes
    nothing.
    """
    pdf_selected = fd.askopenfilename(filetypes=[('PDF文件', '.pdf')])
    # A cancelled dialog yields an empty value; depending on the Tk build that
    # is '' or an empty tuple, so test truthiness instead of comparing to ''.
    if not pdf_selected:
        return
    # Read the page count before touching the UI state, so a file that cannot
    # be opened does not leave its path behind in the entry.
    page_num = pdf_page_num(pdf_selected)
    pdf_in.set(pdf_selected)
    page_range.set("页码范围:1-%s" % page_num)
    button_out['state'] = "normal"


def select_out():
    """Ask for the output folder; on selection store it and enable splitting.

    Cancelling the dialog changes nothing.
    """
    path_save = fd.askdirectory()
    # Cancel may come back as '' or as an empty tuple; both are falsy.
    if not path_save:
        return
    pdf_out.set(path_save)
    button_split['state'] = 'normal'


def _parse_page_ranges(page_text, total_pages):
    """Translate the user's page specification into 1-based inclusive ranges.

    Two forms are accepted:

    * ``/N`` -- consecutive chunks of ``N`` pages covering the whole
      document (the last chunk may be shorter).
    * ``a-b,c,d-e`` -- comma-separated single pages or ``first-last`` ranges.

    Args:
        page_text: Raw text typed by the user.
        total_pages: Number of pages in the source PDF.

    Returns:
        A list of ``(first, last)`` tuples, 1-based and inclusive.

    Raises:
        ValueError: If the text is not a valid specification or a range lies
            outside ``1..total_pages``. The message is meant to be shown to
            the user as-is.
    """
    spec = page_text.strip()
    if spec.startswith('/'):
        try:
            step = int(spec[1:])
        except ValueError:
            raise ValueError("间隔页数必须是整数,例如 /5") from None
        if step < 1:
            raise ValueError("间隔页数必须大于0")
        # Stepping through the first page of each chunk also handles N == 1,
        # which a modulo test against 1 cannot express.
        return [(first, min(first + step - 1, total_pages))
                for first in range(1, total_pages + 1, step)]

    ranges = []
    for part in spec.split(','):
        try:
            if '-' in part:
                first_text, last_text = part.split('-', 1)
                first, last = int(first_text), int(last_text)
            else:
                first = last = int(part)
        except ValueError:
            raise ValueError("无法识别的页码:%s" % part) from None
        # Reject page 0 and reversed or out-of-range spans up front; otherwise
        # index -1 would silently select the last page of the document.
        if not 1 <= first <= last <= total_pages:
            raise ValueError("页码超出范围(1-%s):%s" % (total_pages, part))
        ranges.append((first, last))
    return ranges


def pdf_split():
    """Split the selected PDF into one output file per requested page range.

    Reads the source path, output folder and page specification from the
    UI, validates them, and writes ``<first>-<last>.pdf`` (or ``<page>.pdf``
    for a single page) into the output folder. Problems with the input are
    reported in a message box instead of raising inside the Tk callback.
    """
    # Imported here so the function does not depend on some other module
    # having loaded the tkinter.messagebox submodule as a side effect.
    import os
    from tkinter import messagebox

    select_file_path = pdf_in.get()
    save_path = pdf_out.get()
    page_text = entry_page.get()
    if not select_file_path:
        messagebox.showinfo('提醒', "请选择需要分割的文件!!!")
        return
    if not save_path:
        messagebox.showinfo('提醒', "请选择保存的文件夹!!!")
        return
    if not page_text:
        messagebox.showinfo('提醒', "请输入分割的页码!!!")
        return

    file_reader = PdfReader(select_file_path)
    try:
        page_ranges = _parse_page_ranges(page_text, len(file_reader.pages))
    except ValueError as err:
        messagebox.showinfo('提醒', str(err))
        return

    for first, last in page_ranges:
        file_writer = PdfWriter()
        # Ranges are 1-based inclusive; page indexes are 0-based.
        for page_index in range(first - 1, last):
            file_writer.add_page(file_reader.pages[page_index])
        name = first if first == last else "%s-%s" % (first, last)
        # os.path.join keeps the path valid on non-Windows systems too.
        with open(os.path.join(save_path, "%s.pdf" % name), 'wb') as out:
            file_writer.write(out)

    messagebox.showinfo('成功', "全部拆分成功!!!")


def select_split_file(root):
    """Build the split window's widgets inside ``root``.

    Creates the Tk variables and the widgets for the three steps (choose the
    PDF, enter the page specification, choose the output folder) plus the
    "split" button. The variables and widgets named in the ``global``
    statement are stored as module globals for the callbacks to use.
    """
    global button_split, pdf_in, pdf_out, split_out, page_range, button_out, split_result, entry_page

    # Tk variables backing the entries and the page-range hint label.
    pdf_in = tk.StringVar()
    pdf_out = tk.StringVar()
    split_out = tk.StringVar()
    page_range = tk.StringVar()
    split_result = tk.StringVar()

    # Step 1: source file.
    tk.Label(root, text="1、选择要分割的PDF文件").place(x=10, y=10)
    tk.Entry(root, textvariable=pdf_in, width=65).place(x=10, y=35)
    tk.Button(root, text="选择文件", command=select_pdf).place(x=500, y=32)

    # Step 2: page specification, with usage hint and live page-range label.
    tk.Label(root, text="2、设置分割的页码").place(x=10, y=80)
    tk.Label(
        root,
        text="(可分割为多个PDF,用逗号分隔。例如,1-5,2-10,11,12,如果需要间隔相同页码分割,比如每隔5页,/5)",
    ).place(x=10, y=105)
    tk.Label(root, text="页码范围", textvariable=page_range).place(x=10, y=130)
    entry_page = tk.Entry(root, textvariable=split_out, width=65)
    entry_page.place(x=10, y=155)

    # Step 3: output folder; the button starts disabled.
    tk.Label(root, text='3、选择输出文件夹:').place(x=10, y=205)
    tk.Entry(root, textvariable=pdf_out, width=65).place(x=10, y=230)
    button_out = tk.Button(root, text='选择保存位置', command=select_out,
                           state='disabled')
    button_out.place(x=500, y=230)

    # Action button that runs the split.
    button_split = tk.Button(root, text='执行分割', command=pdf_split,
                             width=20, height=2)
    button_split.place(x=220, y=260)


root = tk.Tk()

# Center a fixed-size 600x500 window on the screen.
sw, sh = root.winfo_screenwidth(), root.winfo_screenheight()
c, d = (sw - 600) / 2, (sh - 500) / 2
root.geometry('600x500+%d+%d' % (c, d))
root.title("PDF拆分")
root.resizable(width=False, height=False)

# Build the widgets, then hand control to the Tk event loop.
select_split_file(root)
root.mainloop()

下面是pdf合并的程序,桌面程序

import tkinter as tk
from tkinter import filedialog as fd
from PyPDF2 import PdfReader, PdfWriter
import os
import natsort

# File-type filter as a list of (label, pattern) pairs.
# NOTE(review): nothing in the visible code references this constant.
file_types = [('PDF文件', '.pdf')]


def pdf_reader(path):
    """Return the names of all entries (files and folders) in ``path``.

    The names are ordered with ``natsort.natsorted``.
    """
    return natsort.natsorted(os.listdir(path))

def pdf_reader_path(path):
    """Return the full paths of the PDF files directly inside ``path``.

    Entries are ordered with ``natsort.natsorted``. Sub-folders and files
    without a ``.pdf`` extension (case-insensitive) are left out, so stray
    files in the folder are never handed to the PDF reader.

    Args:
        path: Folder to scan (not recursive).

    Returns:
        List of ``os.path.join(path, name)`` strings in sorted order.
    """
    paths = []
    for file_name in natsort.natsorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Only regular files that look like PDFs are merge candidates.
        if os.path.isfile(file_path) and file_name.lower().endswith('.pdf'):
            paths.append(file_path)
    return paths

def select_in():
    """Ask for the folder holding the PDFs to merge and list its contents.

    On selection the folder is stored in ``pdf_in``, the "choose output
    folder" button is enabled, and the text box shows the folder's entry
    names joined by commas. Cancelling the dialog changes nothing.
    """
    path_in = fd.askdirectory()
    # Cancel may come back as '' or as an empty tuple; both are falsy.
    if not path_in:
        return
    button_out['state'] = 'normal'
    pdf_in.set(path_in)

    # Replace, rather than append to, the listing left by a previous choice.
    text_names.delete('1.0', tk.END)
    text_names.insert(tk.END, ",".join(pdf_reader(path_in)))


def select_out():
    """Ask for the output folder and store it in ``pdf_out`` when chosen."""
    path_save = fd.askdirectory()
    # Cancel may come back as '' or as an empty tuple; both are falsy.
    if path_save:
        pdf_out.set(path_save)


def pdf_split():
    """Merge the PDFs of the chosen input folder into a single output file.

    Despite its name, this callback performs the merge. It reads the input
    and output folders from the UI, appends the pages of every file
    returned by ``pdf_reader_path`` in order, and writes the result to
    ``合成后文件.pdf`` in the output folder.
    """
    # Imported here so the function does not depend on some other module
    # having loaded the tkinter.messagebox submodule as a side effect.
    from tkinter import messagebox

    path_in = pdf_in.get()
    save_path = pdf_out.get()
    if not path_in:
        messagebox.showinfo('提醒', "请选择需要合并的文件所在文件夹!!!")
        return
    if not save_path:
        messagebox.showinfo('提醒', "请选择保存的文件夹!!!")
        return

    # os.path.join keeps the path valid on non-Windows systems too.
    out_path = os.path.join(save_path, "合成后文件.pdf")
    out_key = os.path.normcase(os.path.abspath(out_path))
    # When the output folder is also the input folder, a result left by an
    # earlier run must not be merged into the new result.
    paths = [candidate for candidate in pdf_reader_path(path_in)
             if os.path.normcase(os.path.abspath(candidate)) != out_key]
    if not paths:
        messagebox.showinfo('提醒', "所选文件夹中没有可合并的文件!!!")
        return

    file_writer = PdfWriter()
    for file in paths:
        print(file)
        # Append every page of this input, in order.
        for page in PdfReader(file).pages:
            file_writer.add_page(page)

    with open(out_path, 'wb') as out:
        file_writer.write(out)

    messagebox.showinfo('成功', "全部合成成功!!!")


def select_split_file(root):
    """Build the merge window's widgets inside ``root``.

    Creates the Tk variables, the input/output folder rows, the "merge"
    button and the text box (with scrollbar) that lists the input folder's
    contents. The variables and widgets named in the ``global`` statement
    are stored as module globals for the callbacks to use.
    """
    global button_split, pdf_in, pdf_out, pdf_names, button_out, split_result, entry_page, text_names

    # Tk variables backing the two folder entries.
    pdf_in = tk.StringVar()
    pdf_out = tk.StringVar()
    pdf_names = tk.StringVar()
    split_result = tk.StringVar()

    # Step 1: folder containing the PDFs to merge.
    tk.Label(root, text='1、选择需要合并的PDF文件所在的文件夹:').place(x=10, y=10)
    tk.Entry(root, textvariable=pdf_in, width=60).place(x=10, y=35)
    tk.Button(root, text='选择合并文件夹', command=select_in).place(x=480, y=32)

    # Step 2: output folder; the button starts disabled.
    tk.Label(root, text='2、选择输出文件夹:').place(x=10, y=80)
    tk.Entry(root, textvariable=pdf_out, width=60).place(x=10, y=105)
    button_out = tk.Button(root, text='选择保存位置', command=select_out,
                           state='disabled')
    button_out.place(x=480, y=105)

    # Action button that runs the merge.
    button_split = tk.Button(root, text='执行合并', command=pdf_split,
                             width=20, height=2)
    button_split.place(x=220, y=160)

    # Listing of the input folder, wired to a vertical scrollbar. The text
    # box is packed first and then positioned with place().
    text_names = tk.Text(root, height=20, width=79)
    text_names.pack(side='left', fill=tk.BOTH, expand=True)
    scrollbar = tk.Scrollbar(root, command=text_names.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    text_names.config(yscrollcommand=scrollbar.set)
    text_names.place(x=10, y=230)


root = tk.Tk()

# Center a fixed-size 600x500 window on the screen.
sw, sh = root.winfo_screenwidth(), root.winfo_screenheight()
c, d = (sw - 600) / 2, (sh - 500) / 2
root.geometry('600x500+%d+%d' % (c, d))
root.title("PDF合并")
root.resizable(width=False, height=False)

# Build the widgets, then hand control to the Tk event loop.
select_split_file(root)
root.mainloop()

其次,如果原始文件含有书签,可以用下面的控制台程序把原始文件的书签复制到合并后的新pdf中。复制后的书签会定位到与原书签相同的目标位置,不会出现同一页有多个书签时点击不跳转的情况。

import PyPDF2 as pdf
from PyPDF2.generic import Fit

def add_bookmark(writer, bookmark_list, realPage, parent):
    """Copy an outline (bookmark tree) into ``writer``, level by level.

    Args:
        writer: Object exposing ``add_outline_item``; the caller passes a
            ``PdfWriter``.
        bookmark_list: One outline level. A nested list is treated as the
            children of the item that precedes it.
        realPage: Mapping from a page object's ``idnum`` to its 0-based
            page index.
        parent: Outline item to attach this level under, or ``None`` for
            the top level.

    Bookmarks whose destination page is not found in ``realPage`` are
    reported on stdout and skipped instead of aborting the whole copy.
    """
    # Children hang under the most recently added item. Starting from
    # ``parent`` keeps a nested list that comes first from hitting an
    # unbound name.
    last_item = parent
    for bookmark in bookmark_list:
        if isinstance(bookmark, list):
            add_bookmark(writer, bookmark, realPage, last_item)
            continue

        title = bookmark.title
        # bookmark.page.idnum is an object id, not a page number; translate
        # it through realPage. getattr covers destinations without a page
        # object.
        page_index = realPage.get(getattr(bookmark.page, "idnum", None))
        if page_index is None:
            print("跳过无法定位页面的书签:%s" % title)
            # Any children of the skipped item are attached at this level.
            last_item = parent
            continue

        # Reuse the original coordinates and zoom so the copied bookmark
        # jumps to the same spot on the page.
        last_item = writer.add_outline_item(
            title, page_index, parent,
            fit=Fit("/XYZ", (bookmark.left, bookmark.top, bookmark.zoom)))

        print(title)


if __name__ == "__main__":
    # Original PDF that carries the outline to copy.
    reader = pdf.PdfReader(R"D:\xxx.pdf")

    # Outline destinations identify pages by idnum, so record which 0-based
    # page index each idnum belongs to.
    realPage = {}
    for pageIndex, page in enumerate(reader.pages):
        realPage[page.indirect_ref.idnum] = pageIndex

    # Copy every page of the OCR'd PDF into a fresh writer.
    reader2 = pdf.PdfReader(R"D:\xxx(OCR).pdf")
    writer2 = pdf.PdfWriter()
    for ocr_page in reader2.pages:
        writer2.add_page(ocr_page)

    # Recreate the original outline on top of the copied pages and save.
    add_bookmark(writer2, reader.outline, realPage, None)
    writer2.write(R"D:\xxx(OCR-书签).pdf")

如果不想执行代码,也可以使用生成好的exe执行

链接:https://pan.baidu.com/s/15Ks2pTFmoIrXcZ-gpP9e2A
提取码:6dza

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值