ImageMagick
install
yum install -y ImageMagick
使用
import os, sys
import subprocess
def convert_pdf_to_img(pdf_file, imgs_dir):
    """Convert a PDF to JPEG by invoking ImageMagick's ``convert`` command.

    The output file is placed in ``imgs_dir`` and named after the PDF's
    base name with the extension replaced by ``.jpg``.

    Args:
        pdf_file: Path of the PDF to convert.
        imgs_dir: Directory that receives the JPEG output.

    Raises:
        RuntimeError: If ``convert`` exits with a non-zero status.
    """
    # Swap only the extension. A blanket replace("pdf", "jpg") would also
    # rewrite "pdf" appearing elsewhere in the name (e.g. "pdf_report.pdf")
    # and would leave an upper-case ".PDF" extension untouched.
    stem = os.path.splitext(os.path.basename(pdf_file))[0]
    jpg_file = os.path.join(imgs_dir, f'{stem}.jpg')

    # Arguments are passed as a list (no shell), so the density value must
    # not carry stray whitespace.
    command = ['convert', '-density', '120', pdf_file, '-quality', '95', jpg_file]
    if subprocess.call(command) != 0:
        raise RuntimeError(f'convert_pdf_to_img failed with pdf filename: {pdf_file}')
if __name__ == '__main__':
    # Local import: glob is not among the imports listed for this snippet.
    import glob

    # Match the extension case-insensitively so that upper-case ".PDF" files
    # are actually found on case-sensitive file systems.
    file_list = glob.glob('/path/*.[pP][dD][fF]')
    result_imgs_dir = 'results'
    os.makedirs(result_imgs_dir, exist_ok=True)

    pdfs_to_convert = []
    for pdf_file in file_list:
        # glob already returns full paths, so rename in place. Only the
        # extension is normalised, never "PDF" elsewhere in the path.
        root, ext = os.path.splitext(pdf_file)
        if ext != '.pdf':
            new_name = root + '.pdf'
            os.rename(pdf_file, new_name)
            pdf_file = new_name
        pdfs_to_convert.append(pdf_file)

    for pdf_to_convert in pdfs_to_convert:
        convert_pdf_to_img(pdf_to_convert, result_imgs_dir)
    print(f'**************** convert done ****************')
pdf2image
install
pip install pdf2image
使用
import os, sys
from pdf2image import convert_from_path,convert_from_bytes
import tempfile
def pdf2image2(pdfPath, imagePath):
    """Render each page of a PDF to a JPEG file using pdf2image.

    Pages are written to ``imagePath`` as ``<pdf stem>_conv_<index>.jpg``,
    where ``<index>`` is the zero-based position of the page in the list
    returned by ``convert_from_path``.

    Args:
        pdfPath: Path of the PDF to convert.
        imagePath: Output directory; created if it does not exist.
    """
    os.makedirs(imagePath, exist_ok=True)

    # Use only the file's base name. Splitting the full path on "." keeps the
    # source directory and yields a path like "<imagePath>//path/test_conv_0.jpg".
    pdf_stem = os.path.splitext(os.path.basename(pdfPath))[0]

    with tempfile.TemporaryDirectory() as path:
        images_from_path = convert_from_path(pdfPath, output_folder=path, dpi=96)
        # enumerate supplies the page index directly instead of searching the
        # list for every image.
        for page_index, image in enumerate(images_from_path):
            image.save(os.path.join(imagePath, f'{pdf_stem}_conv_{page_index}.jpg'), 'JPEG')
        print(images_from_path)
if __name__ == '__main__':
    # Demo run: convert the sample PDF into the results directory.
    pdf2image2("/path/test.pdf", '/path/results')
    print(f'************** convert done **************')
PyMuPDF
install
pip install PyMuPDF
使用
import os,sys
import cv2
import fitz
import datetime
def pyMuPDF_fitz(pdfPath, imagePath):
    """Render each page of a PDF to a JPEG file using PyMuPDF and OpenCV.

    Pages are written to ``imagePath`` as ``<pdf stem>_<page index>.jpg``.
    The elapsed time in whole seconds is printed when all pages are done.

    Args:
        pdfPath: Path of the PDF to convert.
        imagePath: Output directory; created if it does not exist.

    Raises:
        OSError: If OpenCV fails to write one of the page images.
    """
    # Local import: numpy is not among the imports listed for this snippet.
    import numpy as np

    startTime_pdf2img = datetime.datetime.now()  # start time

    # cv2.imwrite does not create missing directories.
    os.makedirs(imagePath, exist_ok=True)
    pdf_stem = os.path.splitext(os.path.basename(pdfPath))[0]

    # Zoom factor applied to both axes. Per the original notes, an unscaled
    # page renders at 792x612, 1.33333333 gives 1056x816 and 2 gives 1584x1224.
    zoom_x = 2.53333333
    zoom_y = 2.53333333
    rotate = 0
    # NOTE(review): pageCount / preRotate / getPixmap / getImageData are the
    # legacy camelCase PyMuPDF names; newer releases appear to rename them
    # (page_count / prerotate / get_pixmap / tobytes) - confirm against the
    # installed version.
    mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)  # same for every page

    pdfDoc = fitz.open(pdfPath)
    try:
        for pg in range(pdfDoc.pageCount):
            page = pdfDoc[pg]
            pix = page.getPixmap(matrix=mat, alpha=False)
            # Round-trip through PNG bytes to obtain an OpenCV image array.
            img_data = pix.getImageData("png")
            data_buf = np.frombuffer(img_data, np.uint8)
            img = cv2.imdecode(data_buf, cv2.IMREAD_UNCHANGED)
            out_file = os.path.join(imagePath, f'{pdf_stem}_{pg}.jpg')
            # imwrite reports failure through its return value only.
            if not cv2.imwrite(out_file, img):
                raise OSError(f'failed to write image: {out_file}')
    finally:
        pdfDoc.close()

    endTime_pdf2img = datetime.datetime.now()  # end time
    print('pdf2img时间=',(endTime_pdf2img - startTime_pdf2img).seconds)
if __name__ == "__main__":
    # Demo run: render the sample PDF into the results directory.
    pyMuPDF_fitz(pdfPath="/path/test.pdf", imagePath='/path/results')