PDF文件处理

pdfplumber

pdfplumber demo:https://github.com/jsvine/pdfplumber/tree/master/examples
安装:https://blog.csdn.net/blmoistawinde/article/details/82051915

camelot

https://github.com/atlanhq/camelot

  1. pdf转图片

import sys, fitz
import os
import datetime

def pyMuPDF_fitz(pdfPath, imagePath, pdf_num):
    """Render every page of a PDF to a PNG file with PyMuPDF (``fitz``).

    Page ``pg`` (zero-based) is written to
    ``<imagePath>/images_<pdf_num><pg>.png``. The target directory and the
    elapsed whole seconds are printed to stdout.

    Args:
        pdfPath: Path of the PDF file to open.
        imagePath: Directory that receives the PNG files; created if missing.
        pdf_num: Identifier interpolated into every output file name.
    """
    startTime_pdf2img = datetime.datetime.now()  # start time

    print("imagePath="+imagePath)

    # A zoom factor of 2 on each axis doubles the rendered width and height.
    # Per the original author's note, an unscaled page rendered at 792x612,
    # a factor of 1.33333333 gave 1056x816 and a factor of 2 gives 1584x1224.
    zoom_x = 2
    zoom_y = 2
    rotate = 0
    # The transform is the same for every page, so build it once.
    mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

    pdfDoc = fitz.open(pdfPath)
    try:
        # Create the output directory once instead of re-checking per page;
        # exist_ok avoids the check-then-create race.
        os.makedirs(imagePath, exist_ok=True)

        for pg in range(pdfDoc.pageCount):
            page = pdfDoc[pg]
            pix = page.getPixmap(matrix=mat, alpha=False)
            # Write the rendered page into the output directory.
            pix.writePNG(imagePath+'/'+'images_%s%s.png' % (pdf_num, pg))
    finally:
        # Release the document handle even if rendering or writing fails.
        pdfDoc.close()

    endTime_pdf2img = datetime.datetime.now()  # end time
    # ``.seconds`` is the whole-seconds component of the elapsed timedelta.
    print('pdf2img时间=',(endTime_pdf2img - startTime_pdf2img).seconds)
  2. region 存储为xml
# encoding: utf-8
from xml.dom.minidom import Document


def node_content(doc, name, content):
    """Create an element named *name* whose only child is the text *content*.

    The element is created through *doc* but is not attached to any parent;
    the caller decides where to insert the returned node.
    """
    element = doc.createElement(name)
    element.appendChild(doc.createTextNode(content))
    return element


def parent_node_content(doc, name, content, parent_node):
    """Create a text-bearing child element and append it to *parent_node*.

    The child is built by ``node_content(doc, name, content)``; nothing is
    returned.
    """
    parent_node.appendChild(node_content(doc, name, content))


def create_object(doc, xmin, ymin, xmax, ymax):
    """Build an ``<object>`` element describing one bounding box.

    The element carries fixed ``name``/``pose``/``truncated``/``difficult``
    children followed by a ``<bndbox>`` holding the four coordinates. The
    coordinates are used directly as text-node content. The element is
    returned unattached.
    """
    obj_node = doc.createElement('object')
    fixed_fields = (
        ('name', 'test'),
        ('pose', 'Unspecified'),
        ('truncated', '0'),
        ('difficult', '0'),
    )
    for tag, value in fixed_fields:
        parent_node_content(doc, tag, value, obj_node)

    box_node = doc.createElement('bndbox')
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    for tag, value in zip(corner_tags, (xmin, ymin, xmax, ymax)):
        parent_node_content(doc, tag, value, box_node)

    obj_node.appendChild(box_node)
    return obj_node


def create_xml(save_dir, region, img, img_path, img_filename):
    """Write an annotation XML file describing the boxes found in an image.

    The document has an ``<annotation>`` root with ``folder``, ``filename``,
    ``path``, ``source``, ``size`` and ``segmented`` children, followed by one
    ``<object>`` per box (the layout resembles a Pascal VOC annotation).

    Args:
        save_dir: Despite the name, this is passed straight to ``open()`` and
            is therefore the path of the XML file to write.
        region: Iterable of boxes; each box is an iterable of points where
            index 0 is the x coordinate and index 1 is the y coordinate.
        img: Object exposing ``shape`` as ``(height, width[, channels])``.
        img_path: Text stored in the ``<path>`` element.
        img_filename: Text stored in the ``<filename>`` element.

    Raises:
        ValueError: If a box in *region* contains no points.
    """
    doc = Document()
    annotation = doc.createElement('annotation')
    doc.appendChild(annotation)

    parent_node_content(doc, 'folder', 'image', annotation)
    parent_node_content(doc, 'filename', img_filename, annotation)
    parent_node_content(doc, 'path', img_path, annotation)

    source = doc.createElement('source')
    parent_node_content(doc, 'database', 'Unknown', source)
    annotation.appendChild(source)

    # shape is (h, w, c) for colour images; a 2-D shape (grayscale) has no
    # channel axis, so report a depth of 1 instead of raising IndexError.
    img_size = img.shape
    depth = img_size[2] if len(img_size) > 2 else 1
    size = doc.createElement('size')
    parent_node_content(doc, 'width', str(img_size[1]), size)
    parent_node_content(doc, 'height', str(img_size[0]), size)
    parent_node_content(doc, 'depth', str(depth), size)
    annotation.appendChild(size)

    parent_node_content(doc, 'segmented', '0', annotation)

    for box in region:
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        # The axis-aligned bounding box is padded by one pixel vertically only.
        obj_node = create_object(
            doc,
            str(min(xs)),
            str(min(ys) - 1),
            str(max(xs)),
            str(max(ys) + 1),
        )
        annotation.appendChild(obj_node)

    with open(save_dir, 'w', encoding='utf8') as fh:
        doc.writexml(fh, indent='', addindent='\t', newl='\n', encoding='UTF-8')
    print('xml创建成功。%s' % img_filename)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值