python pdf模块_用于将PDF转换为文本的Python模块

最新推荐文章于 2024-06-03 18:16:38 发布

祈祀

最新推荐文章于 2024-06-03 18:16:38 发布

阅读量282

点赞数

文章标签： python pdf模块

本文链接：https://blog.csdn.net/weixin_29669899/article/details/113963147

版权

def pdf_to_csv(filename):

from cStringIO import StringIO

from pdfminer.converter import LTChar, TextConverter

from pdfminer.layout import LAParams

from pdfminer.pdfparser import PDFDocument, PDFParser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

class CsvConverter(TextConverter):
    """TextConverter variant that writes each page as ';'-joined rows.

    Every LTChar found on the page is bucketed by ``int(-y)`` and keyed
    by ``x``, where ``x`` and ``y`` are the third and fourth components
    of the character's ``bbox``.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through; all arguments go to the base converter.
        TextConverter.__init__(self, *args, **kwargs)

    def end_page(self, i):
        """Write the characters of the current page to ``self.outfp``.

        Rows are emitted in ascending key order, one per output line.
        Within a row the encoded characters are ordered by ascending
        ``x`` and joined with ';'. Two characters that share the same
        row key and ``x`` collapse to the later one.
        """
        from collections import defaultdict

        rows = defaultdict(dict)

        for item in self.cur_item._objs:
            if not isinstance(item, LTChar):
                continue
            _, _, col, row = item.bbox
            cells = rows[int(-row)]
            cells[col] = item._text.encode(self.codec)

        for row_key in sorted(rows):
            cells = rows[row_key]
            ordered = (cells[col] for col in sorted(cells))
            self.outfp.write(";".join(ordered))
            self.outfp.write("\n")

# ... the following part of the code is a remix of the

# convert() function in the pdfminer/tools/pdf2text module

rsrc = PDFResourceManager()

outfp = StringIO()

device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

# because my test documents are utf-8 (note: utf-8 is the default codec)

doc = PDFDocument()

fp = open(filename, 'rb')

parser = PDFParser(fp)

parser.set_document(doc)

doc.set_parser(parser)

doc.initialize('')

interpreter = PDFPageInterpre

确定要放弃本次机会？

福利倒计时

: :

立减 ¥

普通VIP年卡可用

立即使用

祈祀

关注关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python pdf模块_用于将PDF转换为文本的Python模块

def pdf_to_csv(filename):from cStringIO import StringIOfrom pdfminer.converter import LTChar, TextConverterfrom pdfminer.layout import LAParamsfrom pdfminer.pdfparser import PDFDocument, PDFParserfrom...
复制链接

扫一扫