Python 代码如下:
import os

# The images exported here are in JP2 format, so cv2 is needed to convert them
import cv2
# numpy wraps the raw image bytes in an array so that cv2 can decode them
import numpy as np
from pypdf import PdfReader
# Extract every embedded image of MyPDF.pdf and save it as
# output/<page>-<image>.png, converting through OpenCV.
reader = PdfReader("MyPDF.pdf")  # open the PDF file
outpath = "output/{}".format  # builds an output path from a file name

# cv2.imwrite does not create missing directories; it only returns False,
# so make sure the target directory exists before writing anything.
os.makedirs("output", exist_ok=True)

for p, page in enumerate(reader.pages):
    try:
        # pypdf can raise ValueError while extracting a page's images, e.g.
        # "not enough image data" when a soft-mask (alpha) stream is shorter
        # than the image size requires. Materialise the list inside the try
        # so the failure is caught here instead of aborting the whole run.
        images = list(page.images)
    except ValueError as exc:
        # NOTE: this skips every image on the affected page, not only the
        # one that failed to decode.
        print("skipping page {}: {}".format(p, exc))
        continue

    for i, img in enumerate(images):
        print("handling page {} image '{}'".format(p, img.name))
        # Wrap the raw bytes in a uint8 array and let OpenCV decode them.
        decoded = cv2.imdecode(
            np.frombuffer(img.data, np.uint8),
            cv2.IMREAD_COLOR,
        )
        if decoded is None:
            # imdecode signals an undecodable buffer by returning None.
            print("could not decode page {} image '{}'".format(p, img.name))
            continue
        target = outpath("{}-{}.png".format(p, i))
        if not cv2.imwrite(target, decoded):
            # imwrite reports failure through its return value only.
            print("could not write '{}'".format(target))
运行时遭遇如下报错:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Untitled-1.ipynb 单元格 6 in 1
----> 1 reader.pages[22].images
File c:\Users\SaltSakya\AppData\Local\Programs\Python\Python310\lib\site-packages\pypdf\_page.py:463, in PageObject.images(self)
461 for obj in x_object:
462 if x_object[obj][IA.SUBTYPE] == "/Image":
--> 463 extension, byte_stream = _xobj_to_image(x_object[obj])
464 if extension is not None:
465 filename = f"{obj[1:]}{extension}"
File c:\Users\SaltSakya\AppData\Local\Programs\Python\Python310\lib\site-packages\pypdf\filters.py:707, in _xobj_to_image(x_object_obj)
705 img = Image.frombytes(mode, size, data)
706 if G.S_MASK in x_object_obj: # add alpha channel
--> 707 alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())
708 img.putalpha(alpha)
709 img_byte_arr = BytesIO()
File c:\Users\SaltSakya\AppData\Local\Programs\Python\Python310\lib\site-packages\PIL\Image.py:2843, in frombytes(mode, size, data, decoder_name, *args)
2840 args = mode
2842 im = new(mode, size)
-> 2843 im.frombytes(data, decoder_name, args)
2844 return im
File c:\Users\SaltSakya\AppData\Local\Programs\Python\Python310\lib\site-packages\PIL\Image.py:798, in Image.frombytes(self, data, decoder_name, *args)
795 s = d.decode(data)
797 if s[0] >= 0:
--> 798 raise ValueError("not enough image data")
799 if s[1] != 0:
800 raise ValueError("cannot decode image data")
ValueError: not enough image data
导航至报错位置,发现问题出在 pypdf 的 `filters.py` 文件中,位置在第 707 行,是处理 alpha 通道(`/SMask`)的那部分代码。
由于我这里实际不需要 alpha 通道,因此直接把这三行代码(第 706–708 行,即 `if G.S_MASK in x_object_obj:` 及其下面的两行)注释掉。
此后可以正确运行。