编辑(再次):
PDFMiner 已再次更新,当前版本为 20100213。
您可以通过以下方式检查已安装的版本:
>>> import pdfminer
>>> pdfminer.__version__
'20100213'
以下是更新后的版本(附有注释,说明我更改/添加的内容):
def pdf_to_csv(filename):
from cStringIO import StringIO #
from pdfminer.converter import LTTextItem, TextConverter
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
# Pass-through constructor: forwards every positional and keyword argument
# unchanged to TextConverter.__init__ and adds no state of its own.
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item.objs:
if isinstance(child, LTTextItem):
(_,_,x,y) = child.bbox #
line = lines[int(-y)]
line[x] = child.text.en