Python 打开 PDF 文件:Python 3 使用 PDFMiner 读取 PDF 文件时,如何保存 LTImage 类型的对象(即图片)?

def parse_lt_objs(lt_objs, page_number, images_folder, text=None):
    """Collect the text and image references found in a sequence of LT* objects.

    Args:
        lt_objs: Iterable of layout objects (a page layout, or the children
            of an ``LTFigure`` when called recursively).
        page_number: Page number, used in image file names and error messages.
        images_folder: Folder that extracted images are written to.
        text: Unused. Kept (with a non-mutable default) so that existing
            callers which pass a fourth argument keep working.

    Returns:
        All collected fragments joined with newlines.
    """
    text_content = []

    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # Plain text: keep it as-is.
            text_content.append(lt_obj.get_text())
        elif isinstance(lt_obj, LTImage):
            # An image: save it to the designated folder and note its place
            # in the text.
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # Use an HTML-style <img /> tag to mark the position of the
                # image within the text.
                # NOTE(review): the tag markup was stripped from the original
                # listing (only empty strings remained); confirm the exact tag.
                text_content.append(
                    '<img src="' + os.path.join(images_folder, saved_file) + '" />'
                )
            else:
                # Written with sys.stderr.write so it works on Python 2 and 3,
                # and reports the object's repr rather than the bound method.
                sys.stderr.write(
                    "Error saving image on page %s %r\n" % (page_number, lt_obj)
                )
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children.
            # NOTE(review): the separator literal was mangled in the original
            # listing; a single newline is assumed here.
            text_content.append('\n')
            # Iterate the figure itself instead of reading ``lt_obj.objs``:
            # that attribute was reported as missing, while layout containers
            # are iterable over their children.
            text_content.append(parse_lt_objs(lt_obj, page_number, images_folder))

    return '\n'.join(text_content)

def save_image(lt_image, page_number, images_folder):
    """Try to save the image data of an LTImage object to ``images_folder``.

    Args:
        lt_image: Object exposing ``stream`` (with ``get_rawdata()``) and
            ``name``.
        page_number: Page number; becomes the prefix of the file name.
        images_folder: Folder the image file is written into.

    Returns:
        The file name (``<page>_<name><ext>``) if the image was written,
        otherwise ``None``.
    """
    if not lt_image.stream:
        return None

    # Read the raw bytes once and reuse them for both type detection and
    # writing.
    # NOTE(review): get_rawdata() presumably returns the bytes as stored in
    # the PDF (not decoded) - confirm against the pdfminer version in use.
    raw_data = lt_image.stream.get_rawdata()
    if not raw_data:
        # Nothing to inspect or write; avoids slicing None below.
        return None

    file_ext = determine_image_type(raw_data[0:4])
    if not file_ext:
        # Unrecognised image format.
        return None

    file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
    if write_file(images_folder, file_name, raw_data, flags='wb'):
        return file_name
    return None

def determine_image_type(stream_first_4_bytes):
    """Return the image file extension matching the leading magic bytes.

    Args:
        stream_first_4_bytes: The first bytes of the image data (a bytes-like
            object; four bytes are enough).

    Returns:
        ``'.jpeg'``, ``'.png'``, ``'.gif'`` or ``'.bmp'`` when the magic
        number is recognised, otherwise ``None``.
    """
    if not stream_first_4_bytes:
        return None

    header = bytes(stream_first_4_bytes)

    # Compare raw bytes directly: hex-encoding first yields bytes on
    # Python 3, which cannot be matched against str literals.
    signatures = (
        (b'\xff\xd8', '.jpeg'),  # hex ffd8
        (b'\x89PNG', '.png'),    # hex 89504e47
        (b'GIF8', '.gif'),       # hex 47494638
        (b'BM', '.bmp'),         # hex 424d
    )
    for magic, extension in signatures:
        if header.startswith(magic):
            return extension
    return None

def write_file(folder, filename, filedata, flags='w'):
    """Write ``filedata`` to ``filename`` inside ``folder``.

    Args:
        folder: Target folder; nothing is written unless it already exists.
        filename: Name of the file to create inside ``folder``.
        filedata: Data to write (text or bytes, matching ``flags``).
        flags: Mode passed to ``open``: 'w' for text, 'wb' for binary;
            use 'a' instead of 'w' to append.

    Returns:
        ``True`` if the data was written, ``False`` if the folder does not
        exist or an ``IOError`` occurred.
    """
    if not os.path.isdir(folder):
        return False

    try:
        # The context manager closes the file even when write() fails.
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError:
        # Best-effort write: failure is reported through the return value.
        return False
    return True

按照文档来说这个应该是可以的

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值