pdfplumber
pdfplumber demo: https://github.com/jsvine/pdfplumber/tree/master/examples
安装: https://blog.csdn.net/blmoistawinde/article/details/82051915
camelot
https://github.com/atlanhq/camelot
- pdf转图片
import sys, fitz
import os
import datetime
def pyMuPDF_fitz(pdfPath, imagePath, pdf_num):
    """Render every page of a PDF to a PNG file inside ``imagePath``.

    Args:
        pdfPath: Path of the PDF file handed to ``fitz.open``.
        imagePath: Directory that receives the PNG files; it is created
            when it does not exist yet.
        pdf_num: Identifier interpolated into each output file name,
            ``images_<pdf_num><page index>.png``.

    Returns:
        None. The output directory and the elapsed whole seconds are
        printed to stdout.
    """
    startTime_pdf2img = datetime.datetime.now()  # start time
    print("imagePath="+imagePath)

    # Zoom factor applied to each axis. Per the original author's notes:
    # 1.33333333 --> 1056x816, 2 --> 1584x1224, and the unscaled default
    # image is 792x612.
    zoom_x = 2
    zoom_y = 2
    rotate = 0

    pdfDoc = fitz.open(pdfPath)
    try:
        # Create the output directory once, not once per page.
        os.makedirs(imagePath, exist_ok=True)

        # NOTE(review): pageCount / preRotate / getPixmap / writePNG are the
        # legacy camelCase PyMuPDF names; newer releases use snake_case
        # (page_count, prerotate, get_pixmap, save) -- confirm against the
        # installed PyMuPDF version before upgrading.
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
        for pg in range(pdfDoc.pageCount):
            page = pdfDoc[pg]
            pix = page.getPixmap(matrix=mat, alpha=False)
            # NOTE(review): pdf_num and the page index are joined without a
            # separator, so e.g. (1, 11) and (11, 1) map to the same file
            # name. Kept as-is because callers may rely on these names.
            pix.writePNG(imagePath+'/'+'images_%s%s.png' % (pdf_num, pg))
    finally:
        # Release the document handle even when rendering fails.
        pdfDoc.close()

    elapsed = datetime.datetime.now() - startTime_pdf2img
    # total_seconds() also accounts for the days component, unlike .seconds.
    print('pdf2img时间=', int(elapsed.total_seconds()))
- region 存储为xml
# encoding: utf-8
from xml.dom.minidom import Document
def node_content(doc, name, content):
    """Create an element called ``name`` whose only child is a text node.

    Args:
        doc: DOM document used as the node factory.
        name: Tag name of the new element.
        content: Text placed inside the element.

    Returns:
        The new element; it is not attached to any parent.
    """
    element = doc.createElement(name)
    element.appendChild(doc.createTextNode(content))
    return element
def parent_node_content(doc, name, content, parent_node):
    """Create a text-bearing child element and append it to ``parent_node``.

    Args:
        doc: DOM document used as the node factory.
        name: Tag name of the child element.
        content: Text placed inside the child element.
        parent_node: Node that receives the new child as its last child.
    """
    parent_node.appendChild(node_content(doc, name, content))
def create_object(doc, xmin, ymin, xmax, ymax, name='test'):
    """Build an ``<object>`` element describing one bounding box.

    The element contains ``name``, ``pose``, ``truncated`` and ``difficult``
    children followed by a ``bndbox`` child holding the four coordinates.

    Args:
        doc: DOM document used as the node factory.
        xmin: Left edge of the box.
        ymin: Top edge of the box.
        xmax: Right edge of the box.
        ymax: Bottom edge of the box.
        name: Text of the ``<name>`` child. Defaults to ``'test'``, the
            value that used to be hard-coded.

    Returns:
        The new ``<object>`` element; it is not attached to any parent.
    """
    # Named obj_node so the builtin ``object`` is not shadowed.
    obj_node = doc.createElement('object')
    attributes = (
        ('name', name),
        ('pose', 'Unspecified'),
        ('truncated', '0'),
        ('difficult', '0'),
    )
    for tag, value in attributes:
        parent_node_content(doc, tag, value, obj_node)

    bndbox = doc.createElement('bndbox')
    coordinates = (
        ('xmin', xmin),
        ('ymin', ymin),
        ('xmax', xmax),
        ('ymax', ymax),
    )
    for tag, value in coordinates:
        # Text nodes need str data; str() leaves strings unchanged and lets
        # callers pass numeric coordinates directly.
        parent_node_content(doc, tag, str(value), bndbox)

    obj_node.appendChild(bndbox)
    return obj_node
def create_xml(save_dir, region, img, img_path, img_filename):
    """Write an annotation XML file for one image and its boxes.

    The document is laid out like a Pascal VOC annotation: an
    ``annotation`` root with ``folder``, ``filename``, ``path``, ``source``,
    ``size`` and ``segmented`` children, then one ``object`` per box.

    Args:
        save_dir: Path of the XML file to write (despite the name, it is
            passed straight to ``open``).
        region: Iterable of boxes; each box is an iterable of points whose
            first two items are the x and y coordinates.
        img: Object with a ``shape`` attribute ordered as
            (height, width[, channels]).
        img_path: Text stored in the ``path`` element.
        img_filename: Text stored in the ``filename`` element.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    doc = Document()
    annotation = doc.createElement('annotation')
    doc.appendChild(annotation)

    # Simple text-only children of the root.
    parent_node_content(doc, 'folder', 'image', annotation)
    parent_node_content(doc, 'filename', img_filename, annotation)
    parent_node_content(doc, 'path', img_path, annotation)

    source = doc.createElement('source')
    parent_node_content(doc, 'database', 'Unknown', source)
    annotation.appendChild(source)

    # shape is (h, w, c); a 2-D shape has no channel axis, so record a
    # depth of 1 instead of failing on the missing third entry.
    img_size = img.shape
    depth = img_size[2] if len(img_size) > 2 else 1
    size = doc.createElement('size')
    parent_node_content(doc, 'width', str(img_size[1]), size)
    parent_node_content(doc, 'height', str(img_size[0]), size)
    parent_node_content(doc, 'depth', str(depth), size)
    annotation.appendChild(size)

    parent_node_content(doc, 'segmented', '0', annotation)

    # One <object> per box: the axis-aligned extent of its points, with the
    # y range widened by one unit at the top and at the bottom.
    for box in region:
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        obj_node = create_object(
            doc,
            str(min(xs)),
            str(min(ys) - 1),
            str(max(xs)),
            str(max(ys) + 1),
        )
        annotation.appendChild(obj_node)

    with open(save_dir, 'w', encoding='utf8') as fh:
        doc.writexml(fh, indent='', addindent='\t', newl='\n', encoding='UTF-8')
    print('xml创建成功。%s' % img_filename)