Python PDF 文件解析及二次处理 实例


Python开源的组件完全可以完成PDF文件的各种需求。

以下代码完成对 PDF 中化学分子式的区域标记,后期可以把这一区域中的所有对象转换成一张图片,以便在转换成其它文档(如 Word、HTML)时,这些化学公式仍是完整的。

 

#-------------------------------------------------------------------------------
# Name:        pdftest
# Purpose:
#
# Author:      Administrator
#
# Created:     28-03-2013
# Copyright:   (c) Administrator 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------
#coding:UTF-8

import pdfminer
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *


from reportlab.pdfgen import canvas
from reportlab.graphics.shapes import Circle, Drawing, Group, Line, Rect, String


from pyPdf import PdfFileWriter, PdfFileReader

import sys,os,getopt,re,StringIO


# Margins that ExtenRect adds around a rectangle: Cont_MinX is applied to the
# left and right edges, Cont_MinY to the top and bottom edges.
# NOTE(review): presumably expressed in PDF user-space units -- confirm.
Cont_MinX = 8
Cont_MinY = 8

class myRect:
    """Axis-aligned rectangle normalised from two opposite corners.

    ``rect`` is indexed as ``(x0, y0, x1, y1)``.  The corners may be given in
    any order; after construction ``left <= right`` and ``bottom <= top``.
    """

    def __init__(self,rect):
        xs = (rect[0], rect[2])
        ys = (rect[1], rect[3])
        # Normalise so that callers never have to care about corner order.
        self.left, self.right = min(xs), max(xs)
        self.bottom, self.top = min(ys), max(ys)

    def __str__(self):
        # Rendered as the tuple (left, top, right, bottom).
        edges = (self.left, self.top, self.right, self.bottom)
        return str(edges)


class myGroupObj:
    """Wrapper that attaches grouping bookkeeping to a payload object.

    Attributes set by the constructor:
      data     -- the wrapped object, stored as given
      group    -- id of the group the object currently belongs to (starts at 0)
      index    -- identifier of this object (starts at 0)
      groupidx -- list of indexes collected for this object's group (starts empty)
      page     -- page number associated with the object (starts at 0)
    """

    def __init__(self,obj):
        self.data = obj
        self.group = 0
        self.index = 0
        self.groupidx = []
        self.page = 0

    def __str__(self):
        # __str__ must return a string; returning the payload itself raises
        # TypeError whenever the payload is not already a str.
        return str(self.data)

def ExtenRect(tmpRect):
    """Return a new myRect equal to ``tmpRect`` grown by the module margins.

    The left/right edges move outwards by ``Cont_MinX`` and the top/bottom
    edges by ``Cont_MinY``.  ``tmpRect`` itself is not modified.
    """
    margin_x, margin_y = Cont_MinX, Cont_MinY
    grown = (
        tmpRect.left - margin_x,
        tmpRect.top + margin_y,
        tmpRect.right + margin_x,
        tmpRect.bottom - margin_y,
    )
    return myRect(grown)

def IsRectCross(tmpRect,rect):
    """Return True when the two rectangles overlap once both are extended.

    Both arguments are first grown with ExtenRect, so rectangles that merely
    lie close to each other also count as crossing.  Rectangles whose extended
    edges only touch (share a boundary but no area) do not count.

    Parameters:
      tmpRect, rect -- objects exposing left/right/top/bottom attributes.

    Returns:
      bool -- True if the extended rectangles share some area.
    """
    a = ExtenRect(tmpRect)
    b = ExtenRect(rect)

    # Interval-overlap test per axis.  Checking only whether an endpoint of one
    # rectangle lies strictly inside the other misses rectangles whose extents
    # coincide on an axis (identical boxes, or stacked boxes with the same
    # left/right), so compare the intervals themselves instead.
    overlap_x = a.left < b.right and b.left < a.right
    overlap_y = a.bottom < b.top and b.bottom < a.top
    return overlap_x and overlap_y

def ConvertToGroup(tmpGroups,Page):
    """Wrap every item of ``tmpGroups`` in a myGroupObj, one group per item.

    Items are numbered from 1 in iteration order.  Each wrapper starts in a
    group of its own: ``group`` equals ``index`` and ``groupidx`` holds only
    that index.

    Parameters:
      tmpGroups -- iterable of objects to wrap (rectangles in this script).
      Page      -- value stored in each wrapper's ``page`` attribute.

    Returns:
      list of myGroupObj; an empty list when ``tmpGroups`` is empty, so the
      result can always be iterated or passed to len().
    """
    GroupObjs = []
    for position, rect in enumerate(tmpGroups, 1):
        tmpObj = myGroupObj(rect)
        tmpObj.index = position
        tmpObj.group = position
        tmpObj.groupidx.append(position)
        tmpObj.page = Page
        GroupObjs.append(tmpObj)
    return GroupObjs

def UpdateGroups(GroupObjs):
    """Run one merging pass over ``GroupObjs``.

    Every object is compared with the objects that follow it in the list.
    When two objects are in different groups and their ``data`` rectangles
    cross (per IsRectCross), the later object is moved into the earlier
    object's group and its index is recorded in the earlier object's
    ``groupidx``.  Only that single later object is relabelled, so callers
    are expected to repeat the pass until it reports no change.

    Parameters:
      GroupObjs -- list of objects with ``group``, ``index``, ``groupidx``
                   and ``data`` attributes; modified in place.

    Returns:
      bool -- True if at least one object changed group during this pass.
    """
    bRet = False
    for i, tmpObj in enumerate(GroupObjs):
        # Start after position i: an object always shares its own group, so
        # comparing it with itself can never trigger a merge.
        for tmpObjEx in GroupObjs[i + 1:]:
            if tmpObjEx.group != tmpObj.group and IsRectCross(tmpObj.data, tmpObjEx.data):
                tmpObjEx.group = tmpObj.group
                tmpObj.groupidx.append(tmpObjEx.index)
                bRet = True

    return bRet

def GroupRects(tmpGroups):
    """Collapse objects that share a group into one object per group.

    The input is sorted by ``group`` (stable, so input order is kept inside a
    group).  The first object of each group is kept as the representative and
    its ``data`` rectangle is widened IN PLACE to the bounding box of every
    rectangle in that group.

    Parameters:
      tmpGroups -- iterable of objects with a ``group`` attribute and a
                   ``data`` rectangle exposing left/top/right/bottom.

    Returns:
      list with one representative object per group, ordered by group.
    """
    tmpRects = []
    for obj in sorted(tmpGroups, key=lambda d: d.group):
        # Decide "new group" from the collected representatives rather than
        # from a numeric sentinel, so a legitimate group id of 0 cannot be
        # mistaken for "nothing seen yet".
        if not tmpRects or tmpRects[-1].group != obj.group:
            tmpRects.append(obj)
            continue

        curRect = obj.data
        rect = tmpRects[-1].data
        rect.left = min(rect.left, curRect.left)
        rect.top = max(rect.top, curRect.top)
        rect.right = max(rect.right, curRect.right)
        rect.bottom = min(rect.bottom, curRect.bottom)

    return tmpRects

def main():
    """Mark clusters of curve/line objects on every page of demo.pdf.

    For each page the bounding boxes of LTCurve/LTLine layout objects are
    collected, merged into groups of nearby rectangles, and each group is
    outlined and numbered on a reportlab overlay.  The overlay is merged onto
    the page read through pyPdf and the result is written to demo-test.pdf.
    Group details are also printed to standard output.
    """

    global Cont_MinX,Cont_MinY
    # pdfminer side: parse demo.pdf and wire the parser and document together.
    doc = PDFDocument()
    fp = open("demo.pdf", 'rb')
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    # An empty string is passed as the password.
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The aggregator collects the layout of each processed page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # NOTE(review): j is assigned here but not used anywhere else in this function.
    j=0

    # pyPdf side: the same file is opened a second time for page merging.
    # NOTE(review): this file handle is never closed explicitly.
    output = PdfFileWriter()
    input1 = PdfFileReader(file("demo.pdf", "rb"))

    for i, page in enumerate(doc.get_pages()):

        if page is not None:
            Curves=[]
            interpreter.process_page(page)
            layout = device.get_result()

            # From here on `page` is the pyPdf page with the same index, not
            # the pdfminer page yielded by the loop.
            page = input1.getPage(i)

            # Overlay canvas sized like the page's media box, written to memory.
            width, height = page.mediaBox.getWidth(),page.mediaBox.getHeight()
            packet = StringIO.StringIO()
            can = canvas.Canvas(packet,pagesize=(width, height))

            for x in layout:

                #print "\n Page "+ str(i+1) + " Text:"
                if isinstance(x, LTTextBoxHorizontal) or isinstance(x, LTTextBox):
                    # Text boxes are ignored.
                    #print "Text", x.bbox,x.get_text()
                    pass
                else:
                    # Only curve/line objects contribute a rectangle.
                    if isinstance(x, LTCurve) or isinstance(x, LTLine):
                        Curves.append(myRect(x.bbox))

            if len(Curves)>0:


                # Sort by left edge, then wrap each rectangle in its own group
                # tagged with the 1-based page number.
                Curves = sorted(Curves,key = lambda d:d.left)
                Curves = ConvertToGroup(Curves,i+1)


                # Alternate merge passes and collapsing until a pass changes
                # nothing.  GroupRects widens rectangles in place, so objects
                # still referenced by Curves may be modified by this loop.
                tmpGroups = Curves
                while True:
                    if not UpdateGroups(tmpGroups):
                        break
                    tmpGroups = GroupRects(tmpGroups)

                print "\n Page "+ str(i+1) + " Rects:"
                for obj in Curves:
                    print obj.index,obj.group,obj.data
##                    can.setStrokeColorRGB(0.5,0.5,0.3)
##                    can.rect(obj.data.left,obj.data.bottom,
##                        abs(obj.data.left-obj.data.right),
##                        abs(obj.data.bottom-obj.data.top),
##                        fill=0)

                print "\n Page "+ str(i+1) + " Group Rects:"
                for obj in tmpGroups:
                    print obj.group,obj.data,obj.groupidx


                    # Outline the group's rectangle, extended by the module
                    # margins, and label it with the group id at its
                    # lower-left corner.
                    can.setStrokeColorRGB(0.2,0.2,1)
                    tmpRect = ExtenRect(obj.data)
                    can.rect(tmpRect.left,tmpRect.bottom,
                        abs(tmpRect.right-tmpRect.left),
                        abs(tmpRect.top-tmpRect.bottom),
                        fill=0)
                    can.setFillColorRGB(0.2,0.2,1)
                    can.drawString(tmpRect.left,tmpRect.bottom,str(obj.group))

                    #ClipPdf(tmpRect,obj.page,obj.group)

                # save() is only reached when the page had curve rectangles.
                can.save()

            # Merge the overlay only when something was written to the buffer.
            # NOTE(review): relies on the `len` attribute of StringIO.StringIO
            # -- confirm it is available in the targeted Python version.
            if packet.len>0:
                packet.seek(0)
                input2 = PdfFileReader(packet)
                page.mergePage(input2.getPage(0))

            output.addPage(page)

        #print "End PAGE " + str(i+1) + "\n"
        #break

    device.close()
    fp.close()

    # Write every collected page to the result file.
    outputStream = file("demo-test.pdf", "wb")
    output.write(outputStream)
    outputStream.close()

    #print outfp.getvalue()
    #pass


def ClipPdf(tmpRect,pageidx,group):
    """Write one page of demo.pdf, cropped to ``tmpRect``, to its own file.

    The page's media box is set to the rectangle and the page is saved as
    ``demo-P<pageidx>-G<group>.pdf`` in the current directory.

    Parameters:
      tmpRect -- object exposing left/right/top/bottom attributes.
      pageidx -- 1-based page number in demo.pdf.
      group   -- group identifier, used only in the output file name.
    """
    out_name = "demo-P" + str(pageidx) + "-G" + str(group)+".pdf"

    # The source file has to stay open until the writer has finished, so the
    # output is written inside the same `with` block; both handles are closed
    # even if an error occurs.
    with open('demo.pdf', 'rb') as source:
        pdf = PdfFileReader(source)
        page = pdf.pages[pageidx-1]

        # Lower-left is the minimum corner and upper-right the maximum corner.
        page.mediaBox.lowerLeft = (tmpRect.left,tmpRect.bottom)
        page.mediaBox.upperRight = (tmpRect.right,tmpRect.top)

        output = PdfFileWriter()
        output.addPage(page)

        with open(out_name, "wb") as outputStream:
            output.write(outputStream)



# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
## Leftover ad-hoc experiment, kept disabled.
## NOTE(review): GroupRect and MergeRect are not defined in the visible code.
##    global MergeRect
##    GroupRect([myRect((1,2,3,4)),myRect((3,2,4,4)),myRect((12,2,11,4)),myRect((2,2,1,4)),myRect((13,2,15,4))])
##    for tmpData in MergeRect:
##        print tmpData


 

下图是处理完以后的效果,其中的蓝色文字和矩形框是处理完以后增加的。

 

 

后续处理:可以把区域中的对象转换成一个图片,最后替换原有的多个对象。

 

 

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值