一、基于 Python 代码将批量的 PDF 转换为 OpenCV 图像
该代码支持多页 PDF 的图像转换
import datetime
import os
import fitz # fitz就是pip install PyMuPDF
import cv2
import numpy as np
def pix_to_image(pix):
    """Convert a PyMuPDF pixmap into an OpenCV-style BGR image.

    The channel reordering is done with NumPy slicing; for RGB input the
    result is the same array that ``cv2.cvtColor(img, cv2.COLOR_RGB2BGR)``
    would produce.

    Args:
        pix: Object exposing ``samples`` (raw interleaved 8-bit pixel data),
            ``height``, ``width``, ``n`` (components per pixel, including
            alpha) and ``alpha`` (truthy when an alpha channel is present).

    Returns:
        A new, writable ``numpy.ndarray`` of dtype ``uint8`` and shape
        ``(height, width, 3)`` with channels in BGR order. An alpha channel,
        if present, is dropped; gray input is replicated to three channels.

    Raises:
        ValueError: If the pixmap is neither gray nor RGB (e.g. CMYK), or if
            the sample buffer does not match ``height * width * n``.
    """
    channels = pix.n
    flat = np.frombuffer(pix.samples, dtype=np.uint8)
    img = flat.reshape(pix.height, pix.width, channels)

    # The alpha component, when present, is always the last one.
    color_channels = channels - 1 if pix.alpha else channels

    if color_channels == 3:
        # Take components 2, 1, 0 (RGB -> BGR); this also skips alpha.
        # copy() yields a contiguous, writable array detached from the
        # read-only sample buffer.
        return img[:, :, 2::-1].copy()
    if color_channels == 1:
        # Gray -> BGR: repeat the single intensity channel three times.
        return np.repeat(img[:, :, :1], 3, axis=2)
    raise ValueError(
        "unsupported pixmap layout: n=%d, alpha=%r" % (channels, bool(pix.alpha))
    )
def pyMuPDF_fitz(pdfPath, zoom_x=4, zoom_y=4, rotate=0):
    """Render every page of a document to a PyMuPDF pixmap.

    Args:
        pdfPath: Path of the document passed to ``fitz.open``.
        zoom_x: Horizontal scale factor of the render matrix. Defaults to 4,
            the value this function previously hard-coded. (The original
            author noted 1.33333333 -> 1056x816 and 2 -> 1584x1224.)
        zoom_y: Vertical scale factor of the render matrix. Defaults to 4.
        rotate: Rotation in degrees applied with ``Matrix.prerotate``.
            Defaults to 0.

    Returns:
        A list with one pixmap per page, in page order, rendered with
        ``alpha=False``.
    """
    # The matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
    # Open the document in a context manager so it is closed even if
    # rendering a page raises.
    with fitz.open(pdfPath) as pdfDoc:
        return [page.get_pixmap(matrix=mat, alpha=False) for page in pdfDoc]
def get_files(path):
    """Return the names of all regular files directly inside ``path``.

    Sub-directories are skipped; the returned names are bare file names
    (not joined with ``path``), in the order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    ]
if __name__ == "__main__":
    # 1. Directory holding the source documents.
    pdfPath_totall = 'C:/code/box_word/PDF_BOX/pdf/'
    # 2. Directory that receives the rendered page images.
    imagePath = 'C:/code/box_word/PDF_BOX/pdf_images/'
    # Create the output directory up front so the image writes below have
    # somewhere to go.
    os.makedirs(imagePath, exist_ok=True)

    # Only regular files: a sub-directory entry cannot be opened as a document.
    file_list = get_files(pdfPath_totall)
    print(file_list)

    for name in file_list:
        pdfPath = pdfPath_totall + name
        print(pdfPath)
        # Render the document to pixmaps; multi-page documents yield one
        # pixmap per page.
        pix_totall = pyMuPDF_fitz(pdfPath)
        print("图像的总数为:", len(pix_totall))
        # Convert each pixmap to an OpenCV image and save a grayscale and a
        # color PNG per page. A distinct loop variable avoids reusing the
        # outer loop's index.
        for page_index, pix in enumerate(pix_totall):
            image_pfd = pix_to_image(pix)
            gray_image = cv2.cvtColor(image_pfd, cv2.COLOR_BGR2GRAY)
            out_prefix = imagePath + name + '_' + str(page_index)
            cv2.imwrite(out_prefix + 'pdf_.png', gray_image)
            cv2.imwrite(out_prefix + 'pdfcolor_.png', image_pfd)