一、解析PDF
使用pdfminer解析PDF文件。pdfminer.layout模块提供布局分析参数类LAParams,以及LTTextBox、LTTextLine、LTFigure、LTImage、LTChar等布局对象类型。
示例一:解析LTTextBox
import io

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
def parse(Path, Save_name):
    """Append the text of every horizontal text box in a PDF to a text file.

    Each page is run through pdfminer's layout analysis; every
    ``LTTextBoxHorizontal`` found is printed to stdout and appended to the
    output file, followed by an extra newline.

    Args:
        Path: An open binary file object for the PDF. Despite the name it is
            not a path string; it is handed directly to ``PDFParser``.
        Save_name: Path of the output text file. It is opened in append mode,
            so any existing content is preserved. The file is created even if
            the PDF contains no horizontal text boxes.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction.
    """
    parser = PDFParser(Path)
    document = PDFDocument(parser)
    if not document.is_extractable:
        print('error')
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The aggregator collects the layout tree of each processed page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Open the output once instead of once per text box. io.open with an
    # explicit encoding writes UTF-8 text on both Python 2 and Python 3, so no
    # manual .encode() (which breaks str concatenation on Python 3) is needed.
    with io.open(Save_name, 'a', encoding='utf-8') as f:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    print(results)
                    f.write(results + u"\n")
if __name__ == '__main__':
    # Script entry point: extract the text of the sample PDF into 1.txt.
    # The PDF must be opened in binary mode; the ``with`` block closes the
    # handle once parsing finishes (the original never closed it).
    with open('/local/mnt/workspace/PycharmProject/demo/src/tmp/2019.pdf', 'rb') as pdf_file:
        # The function is defined as ``parse``; calling ``Parse`` raised NameError.
        parse(pdf_file, '/local/mnt/workspace/PycharmProject/demo/src/tmp/1.txt')
示例二:解析更多Layout类型
#!/usr/bin/python
import sys
import os
from binascii import b2a_hex
###
### pdf-miner requirements
###
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar
def with_pdf (pdf_doc, fn, pdf_pwd, *args):
"""Open the pdf document, and apply the function, returning the results"""