#!/usr/bin/env python
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
def createPDFDoc(fpath):
fp = open(fpath, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser, password='')
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise "Not extractable"
else:
return document
def createDeviceInterpreter():
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
return device, interpreter
def parse_obj(objs):
for obj in objs:
if isinstance(obj, pdfminer.layout.LTTextBox):
for o in obj._objs:
if isinstance(o,pdfminer.layout.LTTextLine):
text=o.get_text()
if text.strip():
for c in o._objs:
if isinstance(c, pdfminer.layout.LTChar):
print "fontname %s"%c.fontname
# if it's a container, recurse
elif isinstance(obj, pdfminer.layout.LTFigure):
parse_obj(obj._objs)
else:
pass
document=createPDFDoc("/tmp/simple.pdf")
device,interpreter=createDeviceInterpreter()
pages=PDFPage.create_pages(document)
interpreter.process_page(pages.next())
layout = device.get_result()
parse_obj(layout._objs)