安装:
apt-get install python-poppler
apt install poppler-utils
pip3 install pdfminer.six
pip3 install pdf2image
pdf_decompose.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import io
import logging
import os
import sys
import time

from pdf2image import convert_from_path, convert_from_bytes
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTText
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
class PDFDecompose(object):
    """Convert the pages of a PDF document into images.

    Two entry points are offered:

    * ``decompose_from_bytes`` hands the raw PDF bytes to pdf2image and
      returns whatever ``convert_from_bytes`` produces.
    * ``decompose_from_file`` renders a file on disk and returns one
      JPEG-encoded ``bytes`` object per page, optionally after checking
      that the document consists mostly of non-text (image) pages.
    """

    # Logger used for timing/outcome messages and conversion errors.
    _logger = logging.getLogger(__name__)
    # Upper bound on the pages / layout objects sampled when classifying.
    _SAMPLE_LIMIT = 10
    # A sample is "mostly X" when strictly more than this fraction is X.
    _MAJORITY_RATIO = 0.8

    def decompose_from_bytes(self, file_bytes, dpi=96):
        """Render an in-memory PDF with pdf2image.

        :param file_bytes: content of the PDF file as bytes
        :param dpi: rendering resolution forwarded to ``convert_from_bytes``
        :return: the list returned by ``convert_from_bytes`` (one image per
            page), or None when the conversion raises
        """
        try:
            return convert_from_bytes(file_bytes, dpi=dpi)
        except Exception as e:
            # Best-effort API: report the failure instead of hiding it, but
            # keep returning None so existing callers are unaffected.
            self._logger.error('PDF Decompose from byte fail, error: %s', e)
            return None

    def decompose_from_file(self, file_name, check_content=False, dpi=96):
        """Render a PDF file on disk into JPEG-encoded page images.

        :param file_name: path of the PDF file
        :param check_content: if True, inspect the document first and only
            render it when it is extractable and mostly made of image pages
        :param dpi: rendering resolution forwarded to pdf2image
        :return: list with one JPEG ``bytes`` object per page. With
            ``check_content=True`` None is returned when the document is
            not extractable, is mostly text, or any error occurs. With
            ``check_content=False`` rendering errors propagate to the caller.
        """
        time_start = time.time()
        if not check_content:
            images = self._to_images(file_name, dpi=dpi)
            self._log_helper('success, pages: {0}'.format(len(images)), time_start)
            return images

        try:
            with open(file_name, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))
                if not document.is_extractable:
                    self._log_helper('fail, can not extract', time_start)
                    return None
                rsrcmgr = PDFResourceManager()
                # The aggregator collects the layout of each processed page.
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                is_image = self._is_image_file(device, document, interpreter)
            if not is_image:
                self._log_helper('fail, no image content', time_start)
                return None
            images = self._to_images(file_name, dpi=dpi)
            self._log_helper('success, pages: {0}'.format(len(images)), time_start)
            return images
        except Exception as e:
            # Covers a missing/unreadable file as well as parse or render
            # failures; the cause is included so the log is actionable.
            self._log_helper(
                'fail, file io error, {0}, error: {1}'.format(file_name, e),
                time_start)
            return None

    def _is_image_file(self, device, document, interpreter):
        """Decide whether the document is predominantly made of image pages.

        At most the first ``_SAMPLE_LIMIT`` pages are laid out; the document
        counts as an image PDF when more than ``_MAJORITY_RATIO`` of the
        sampled pages are not text pages. A document without pages is
        treated as an image PDF.

        :param device: page aggregator that yields the layout of a page
        :param document: parsed PDF document
        :param interpreter: page interpreter bound to ``device``
        :return: True for an image PDF, False otherwise
        """
        page_count = 0
        image_page_count = 0
        for page in PDFPage.create_pages(document):
            if page_count >= self._SAMPLE_LIMIT:
                break
            page_count += 1
            interpreter.process_page(page)
            # Layout object for the page that was just processed.
            layout = device.get_result()
            if not self._is_text_page(layout):
                image_page_count += 1
        if page_count == 0:
            return True
        return self._is_majority(image_page_count, page_count)

    def _is_text_page(self, page):
        """Decide whether a page layout is predominantly text.

        Only the first ``_SAMPLE_LIMIT`` layout objects are inspected; the
        page counts as text when more than ``_MAJORITY_RATIO`` of them are
        ``LTText`` instances. An empty page is not a text page.

        :param page: iterable layout container for one page
        :return: True for a text page, False otherwise
        """
        sample = list(page)[:self._SAMPLE_LIMIT]
        if not sample:
            return False
        text_count = sum(1 for obj in sample if isinstance(obj, LTText))
        return self._is_majority(text_count, len(sample))

    @classmethod
    def _is_majority(cls, hits, total):
        """Return True when ``hits / total`` exceeds ``_MAJORITY_RATIO``.

        True division is required here: floor division would collapse the
        ratio to 0 or 1 and make the threshold meaningless. ``total`` must
        be positive.
        """
        return hits / total > cls._MAJORITY_RATIO

    def _to_images(self, file_name, dpi=96):
        """Render every page of ``file_name`` and encode it as JPEG.

        :param file_name: path of the PDF file
        :param dpi: rendering resolution forwarded to ``convert_from_path``
        :return: list of JPEG-encoded ``bytes``, one per page
        """
        result = []
        for image in convert_from_path(file_name, dpi=dpi):
            buffer = io.BytesIO()
            image.save(buffer, format='JPEG')
            result.append(buffer.getvalue())
        return result

    def _log_helper(self, log_content, start_time_point):
        """Log ``log_content`` together with the time elapsed since start.

        :param log_content: outcome message to record
        :param start_time_point: ``time.time()`` value taken at the start
        """
        consume = time.time() - start_time_point
        self._logger.info('PDF decompose %s, consume: %.3fs', log_content, consume)
test_pdf_to_images.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import sys
import unittest
from pdf_decompose import PDFDecompose
class PDFDecomposeTestCase(unittest.TestCase):
    """Exercise PDFDecompose against the sample file ./decompose.pdf."""

    def setUp(self):
        # A fresh decomposer for every test method.
        self.decomposer = PDFDecompose()

    def test_pdf_decompose_image(self):
        """Render the sample PDF, dump each page to disk, expect two pages."""
        rendered_pages = self.decomposer.decompose_from_file(
            './decompose.pdf', check_content=False)
        for page_no, jpeg_bytes in enumerate(rendered_pages):
            file_label = 'decompose_{0}.jpg'.format(page_no)
            target = os.path.join("./", file_label)
            # Each element is written out unchanged as the file content.
            with open(target, 'wb') as sink:
                sink.write(jpeg_bytes)
        self.assertEqual(len(rendered_pages), 2)
# Start the unittest runner only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()