python搜索pdf内容所在页码_用python合并多个pdf文件并标页码

最新推荐文章于 2024-07-28 19:22:49 发布

Jason Hsiao

最新推荐文章于 2024-07-28 19:22:49 发布

阅读量787

点赞数 1

文章标签： python搜索pdf内容所在页码

本文链接：https://blog.csdn.net/weixin_35935514/article/details/113670744

版权

这篇博客介绍如何利用Python的PyPDF2和reportlab库合并多个PDF文件，并为合并后的PDF添加页码。首先，通过os模块遍历指定目录下的PDF文件，然后使用PyPDF2读取并合并这些文件。接着，使用reportlab创建带有页码的PDF页面，最后将页码添加到每一页中。示例代码展示了详细的合并与添加页码的步骤。

摘要由CSDN通过智能技术生成

合并多个pdf文件

来源某篇博客，忘了地址=_=!

# -*- coding:utf-8*-

# 利用PyPDF2模块合并同一文件夹下的所有PDF文件

# 只需修改存放PDF文件的文件夹变量：file_dir 和输出文件名变量: outfile

import os

from PyPDF2 import PdfFileReader, PdfFileWriter

import time

# 使用os模块的walk函数，搜索出指定目录下的全部PDF文件

# 获取同一目录下的所有PDF文件的绝对路径

def getFileName(filedir):
    """Return the paths of all PDF files found under *filedir*, recursively.

    A file counts as a PDF when its extension is ``.pdf`` in any letter case,
    so ``A.PDF`` is included while a name such as ``fakepdf`` (no dot) is not.
    Sub-directories and file names are visited in sorted order so that the
    result, and therefore the merge order, does not depend on the order in
    which the filesystem happens to list entries.

    Args:
        filedir: Root directory to search.

    Returns:
        A list of ``os.path.join(root, name)`` paths. The list is empty when
        nothing matches or when *filedir* cannot be walked.
    """
    file_list = []
    for root, dirs, files in os.walk(filedir):
        # Sorting in place steers os.walk's top-down descent order.
        dirs.sort()
        for name in sorted(files):
            if os.path.splitext(name)[1].lower() == '.pdf':
                file_list.append(os.path.join(root, name))
    return file_list

# 合并同一目录下的所有PDF文件

def MergePDF(filepath, outfile):
    """Merge every PDF that getFileName() finds under *filepath* into one file.

    The merged document is written to ``os.path.join(filepath, outfile)``.
    Progress (each source path, its page count, the total page count) is
    printed as the merge proceeds.

    Args:
        filepath: Folder that holds the source PDFs; also receives the output.
        outfile: File name of the merged PDF.

    Returns:
        None. When there is nothing to merge, a message is printed and no
        output file is created.
    """
    # Local import so this function does not depend on the file's import block.
    from contextlib import ExitStack

    target = os.path.join(filepath, outfile)
    target_key = os.path.normcase(os.path.abspath(target))

    # The output lives in the folder being scanned, so a previous run's result
    # would otherwise be merged into itself (and truncated when reopened "wb").
    pdf_paths = [
        candidate for candidate in getFileName(filepath)
        if os.path.normcase(os.path.abspath(candidate)) != target_key
    ]
    if not pdf_paths:
        print("没有可以合并的PDF文件！")
        return

    output = PdfFileWriter()
    outputPages = 0

    # The source streams are kept open until output.write() has finished,
    # because PyPDF2 reads page data from them lazily; ExitStack then closes
    # all of them, even if an error interrupts the merge.
    with ExitStack() as stack:
        for pdf_file in pdf_paths:
            print("路径：%s" % pdf_file)

            # Open the source PDF.
            reader = PdfFileReader(stack.enter_context(open(pdf_file, "rb")))

            # Number of pages in this source PDF.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("页数：%d" % pageCount)

            # Append each page to the output document.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))

        print("合并后的总页数:%d." % outputPages)

        # Write the merged document to the target file.
        with open(target, "wb") as outputStream:
            output.write(outputStream)

    print("PDF文件合并完成！")

# 主函数

def main():
    """Merge the PDFs of the configured folder and print the elapsed time.

    Edit ``file_dir`` and ``outfile`` below to point at another folder or to
    change the name of the merged file.
    """
    outfile = "Cheat_Sheets.pdf"  # name of the merged PDF
    file_dir = r'E:\test\ac3'  # folder that holds the source PDFs

    started = time.time()
    MergePDF(file_dir, outfile)
    elapsed = time.time() - started

    print('总共耗时：%s s.' % (elapsed,))


main()

运行时可能会报错；如遇报错，可将 site-packages/PyPDF2/generic.py 中的以下代码注释掉：

6eb4f269adc1

标页码

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

# Usage/help text of the page-numbering script. It is printed by the
# module-level print() below every time this file is executed, regardless of
# the command-line arguments.
helpDoc = '''

Add Page Number to PDF file with Python

Python 给 PDF 添加页码

usage:

python addPageNumberToPDF.py [PDF path]

require:

pip install reportlab pypdf2

Support both Python2/3, But more recommend Python3

tips:

* output file will save at pdfWithNumbers/[PDF path]_page.pdf

* only support A4 size PDF

* tested on Python2/Python3@ubuntu

* more large size of PDF require more RAM

* if segmentation fault, plaese try use Python 3

* if generate PDF document is damaged, plaese try use Python 3

Author:

Lei Yang (ylxx@live.com)

GitHub:

https://gist.github.com/DIYer22/b9ede6b5b96109788a47973649645c1f

'''

print(helpDoc)

import reportlab

from reportlab.lib.units import mm

from reportlab.pdfgen import canvas

from PyPDF2 import PdfFileWriter, PdfFileReader

def createPagePdf(num, tmp):
    """Write a PDF to *tmp* whose pages carry nothing but their page number.

    Page ``i`` (1-based) shows the text ``str(i)``. The resulting file is
    meant to be overlaid page by page onto the document being numbered.

    Args:
        num: Number of numbered pages to generate.
        tmp: Path of the PDF file to create.

    Returns:
        None. The result is the file written at *tmp*.
    """
    c = canvas.Canvas(tmp)
    for i in range(1, num + 1):
        # x = 210 // 2 mm is the horizontal middle of an A4 page (210 mm
        # wide); y = 4 mm is measured from the bottom edge of the page.
        c.drawString((210 // 2) * mm, 4 * mm, str(i))
        c.showPage()
    c.save()
    # The original had a bare `return` here followed by code that reopened
    # the file and returned its first page; that code could never run and has
    # been removed. The function still returns None.

if __name__ == "__main__":
    # Script entry point: stamp a page number on every page of one PDF.
    #
    # The PDF is taken from the first command-line argument, falling back to
    # the hard-coded default below when no argument is given.
    import os
    import sys

    # PDF that should receive page numbers (default when no argument is given).
    path = 'E:\\test\\ac2\\3.pdf'
    if len(sys.argv) == 1:
        if not os.path.isfile(path):
            # A string argument is written to stderr and the exit status is 1.
            sys.exit('PDF not found: %s (pass a PDF path as the first argument)' % path)
    else:
        path = sys.argv[1]

    src_dir, base = os.path.split(path)
    stem, ext = os.path.splitext(base)
    # Intermediate batch files go into a sub-folder next to the source PDF.
    batch_dir = os.path.join(src_dir, 'pdfWithNumbers')

    tmp = "__tmp.pdf"  # scratch PDF that holds only the page numbers

    # Pages per output file; 0 means "do not split".
    batch = 0

    output = PdfFileWriter()
    try:
        with open(path, 'rb') as src:
            pdf = PdfFileReader(src, strict=False)
            n = pdf.getNumPages()
            if batch == 0:
                # With batch == -n, `p % batch` is non-zero for every
                # 0 < p < n, so the split branch below never fires.
                batch = -n

            createPagePdf(n, tmp)

            with open(tmp, 'rb') as ftmp:
                numberPdf = PdfFileReader(ftmp)
                for p in range(n):
                    if not p % batch and p:
                        # A full batch has been collected: flush it to its own
                        # file and start a fresh writer. The folder is created
                        # where the file is actually written.
                        if not os.path.isdir(batch_dir):
                            os.makedirs(batch_dir)
                        newpath = os.path.join(
                            batch_dir, stem + '_page_%d' % (p // batch) + ext)
                        with open(newpath, 'wb') as out:
                            output.write(out)
                        output = PdfFileWriter()

                    print('page: %d of %d' % (p, n))

                    # Overlay the matching number page onto the source page.
                    page = pdf.getPage(p)
                    numberLayer = numberPdf.getPage(p)
                    page.mergePage(numberLayer)
                    output.addPage(page)

                if output.getNumPages():
                    # NOTE(review): the remaining pages are written next to
                    # the source PDF, not into pdfWithNumbers/ as the help
                    # text states. Kept as found; confirm which is intended.
                    # When not splitting, the suffix evaluates to 0 for n > 1
                    # and to 1 for n == 1.
                    newpath = os.path.join(
                        src_dir, stem + '_page_%d' % (p // batch + 1) + ext)
                    with open(newpath, 'wb') as out:
                        output.write(out)
    finally:
        # Remove the scratch file even when numbering failed part-way.
        if os.path.exists(tmp):
            os.remove(tmp)