python pdf模块_用于将PDF转换为文本的Python模块

def pdf_to_csv(filename):

from cStringIO import StringIO

from pdfminer.converter import LTChar, TextConverter

from pdfminer.layout import LAParams

from pdfminer.pdfparser import PDFDocument, PDFParser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

class CsvConverter(TextConverter):
    """TextConverter subclass that emits each page as ';'-separated rows.

    Every LTChar found on a finished page is bucketed into a row keyed by
    its integer-truncated, negated y coordinate; inside a row the
    characters are keyed by their x coordinate. Rows are written to
    ``self.outfp`` one per line.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through: all arguments go to the base-class constructor.
        TextConverter.__init__(self, *args, **kwargs)

    def end_page(self, i):
        """Write the characters of the current page to ``self.outfp``.

        ``i`` is accepted for interface compatibility and is not used here.
        """
        from collections import defaultdict

        rows = defaultdict(dict)

        # NOTE(review): relies on the private ``_objs`` attribute of
        # ``self.cur_item`` -- confirm it exists in the pdfminer version used.
        for item in self.cur_item._objs:
            if not isinstance(item, LTChar):
                continue
            # Only the last two entries of the bbox are used, as x and y.
            # presumably these are the upper-right corner -- verify against
            # pdfminer's LTChar documentation.
            _, _, x_pos, y_pos = item.bbox
            # y is negated before truncation so that an ascending sort of the
            # row keys visits larger y values first.
            row = rows[int(-y_pos)]
            # A later character with the same x in the same row replaces the
            # earlier one.
            row[x_pos] = item._text.encode(self.codec)

        for row_key in sorted(rows):
            cells = rows[row_key]
            ordered_cells = [cells[x_pos] for x_pos in sorted(cells)]
            self.outfp.write(";".join(ordered_cells))
            self.outfp.write("\n")

# ... the following part of the code is a remix of the

# convert() function in the pdfminer/tools/pdf2text module

rsrc = PDFResourceManager()

outfp = StringIO()

device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

# because my test documents are utf-8 (note: utf-8 is the default codec)

doc = PDFDocument()

fp = open(filename, 'rb')

parser = PDFParser(fp)

parser.set_document(doc)

doc.set_parser(parser)

doc.initialize('')

interpreter = PDFPageInterpre

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值