Python开源的组件完全可以完成PDF文件的各种需求。
以下代码完成对 PDF 中化学分子式的区域标记,后期可以把这一区域中的所有对象转换成一张图片,以便在转换成其它文档(如 WORD、HTML)时,这些化学公式仍是完整的。
#-------------------------------------------------------------------------------
# Name: pdftest
# Purpose:
#
# Author: Administrator
#
# Created: 28-03-2013
# Copyright: (c) Administrator 2013
# Licence: <your licence>
#-------------------------------------------------------------------------------
#coding:UTF-8
import pdfminer
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from reportlab.pdfgen import canvas
from reportlab.graphics.shapes import Circle, Drawing, Group, Line, Rect, String
from pyPdf import PdfFileWriter, PdfFileReader
import sys,os,getopt,re,StringIO
# Horizontal margin that ExtenRect adds to the left and right edges of a
# rectangle (presumably in PDF user-space units -- confirm).
Cont_MinX = 8
# Vertical margin that ExtenRect adds to the top and bottom edges of a
# rectangle (presumably in PDF user-space units -- confirm).
Cont_MinY = 8
class myRect:
    """Axis-aligned rectangle normalised from two opposite corners.

    ``rect`` is an indexable ``(x0, y0, x1, y1)`` sequence.  The corners may
    be given in any order: ``left``/``bottom`` always receive the smaller
    coordinate and ``right``/``top`` the larger one.
    """

    def __init__(self, rect):
        x_pair = (rect[0], rect[2])
        y_pair = (rect[1], rect[3])
        self.left = min(x_pair)
        self.right = max(x_pair)
        self.top = max(y_pair)
        self.bottom = min(y_pair)

    def __str__(self):
        # A tuple's str() and repr() are the same text, so this renders as
        # "(left, top, right, bottom)".
        edges = (self.left, self.top, self.right, self.bottom)
        return repr(edges)
class myGroupObj:
    """Wrapper that attaches grouping bookkeeping to an arbitrary object.

    Attributes:
        data: the wrapped object, stored as given.
        group: id of the group this object currently belongs to (0 until a
            caller assigns one).
        index: identifier of this object (0 until a caller assigns one).
        groupidx: list of indices recorded for this object's group; starts
            empty.
        page: page number the object came from (0 until a caller assigns one).
    """

    def __init__(self, obj):
        self.data = obj
        self.group = 0
        self.index = 0
        self.groupidx = []
        self.page = 0

    def __str__(self):
        # __str__ must return a string; returning the wrapped object itself
        # raises TypeError whenever ``data`` is not already a str.
        return str(self.data)
def ExtenRect(tmpRect):
    """Return a new myRect that is ``tmpRect`` grown by the module margins.

    The left/right edges move outwards by ``Cont_MinX`` and the top/bottom
    edges by ``Cont_MinY``.  ``tmpRect`` itself is not modified.
    """
    grown_corners = (
        tmpRect.left - Cont_MinX,
        tmpRect.top + Cont_MinY,
        tmpRect.right + Cont_MinX,
        tmpRect.bottom - Cont_MinY,
    )
    return myRect(grown_corners)
def IsRectCross(tmpRect, rect):
    """Return True when the two rectangles overlap after margin extension.

    Both arguments are first enlarged with ExtenRect, so rectangles that are
    merely close to each other also count as crossing.  Rectangles that only
    touch along an edge after extension do not count.

    The overlap is decided with the standard interval test on each axis.
    Checking whether an edge of one rectangle lies strictly inside the other
    misses rectangles whose horizontal or vertical extents are exactly equal
    (for example two identical boxes).
    """
    first = ExtenRect(tmpRect)
    second = ExtenRect(rect)
    overlaps_x = first.left < second.right and second.left < first.right
    overlaps_y = first.bottom < second.top and second.bottom < first.top
    return overlaps_x and overlaps_y
def ConvertToGroup(tmpGroups, Page):
    """Wrap every rectangle in ``tmpGroups`` in its own single-member group.

    Args:
        tmpGroups: iterable of rectangle objects.
        Page: page number stored on every created wrapper.

    Returns:
        A list of myGroupObj, one per input rectangle and in input order.
        Each wrapper gets a 1-based ``index``, a ``group`` equal to that
        index and a ``groupidx`` list holding just that index.  An empty
        input yields an empty list (not None), so callers can always iterate
        over the result.
    """
    GroupObjs = []
    for position, rect in enumerate(tmpGroups, 1):
        wrapped = myGroupObj(rect)
        wrapped.index = position
        wrapped.group = position
        wrapped.groupidx.append(position)
        wrapped.page = Page
        GroupObjs.append(wrapped)
    return GroupObjs
def UpdateGroups(GroupObjs):
    """Run one merge pass over ``GroupObjs`` and report whether anything changed.

    Every object is compared with each object that follows it in the list.
    When the two are in different groups and their rectangles cross
    (IsRectCross), the later object is moved into the earlier object's group
    and its ``index`` is appended to the earlier object's ``groupidx``.

    Args:
        GroupObjs: list of objects exposing ``group``, ``index``,
            ``groupidx`` and ``data`` (the rectangle).

    Returns:
        True if at least one object changed group during this pass,
        otherwise False.
    """
    changed = False
    for position, anchor in enumerate(GroupObjs):
        # Start after ``anchor``: comparing an object with itself can never
        # trigger a merge because the group ids are equal.
        for candidate in GroupObjs[position + 1:]:
            # ``anchor.group`` is re-read on purpose; it may have been
            # changed earlier in this same pass.
            if candidate.group != anchor.group and IsRectCross(anchor.data, candidate.data):
                candidate.group = anchor.group
                # NOTE(review): only the candidate's own index is recorded;
                # indices the candidate had collected itself are not carried
                # over -- confirm whether groupidx should be transitive.
                anchor.groupidx.append(candidate.index)
                changed = True
    return changed
def GroupRects(tmpGroups):
    """Collapse grouped objects into one representative object per group.

    The objects are ordered by ``group`` (stable sort, so input order is kept
    inside a group).  The first object of every group becomes that group's
    representative, and its ``data`` rectangle is enlarged IN PLACE to the
    bounding box of all rectangles in the group.

    Args:
        tmpGroups: iterable of objects exposing ``group`` and ``data``, where
            ``data`` has ``left``/``top``/``right``/``bottom`` attributes.

    Returns:
        List of the representative objects, ordered by group id.  The input
        sequence itself is not reordered.
    """
    representatives = []
    curGroup = None
    for obj in sorted(tmpGroups, key=lambda d: d.group):
        # ``not representatives`` guards the first object, so a group id that
        # happens to equal the initial sentinel can never index an empty list.
        if not representatives or obj.group != curGroup:
            representatives.append(obj)
        else:
            member = obj.data
            bounds = representatives[-1].data
            bounds.left = min(bounds.left, member.left)
            bounds.top = max(bounds.top, member.top)
            bounds.right = max(bounds.right, member.right)
            bounds.bottom = min(bounds.bottom, member.bottom)
        curGroup = obj.group
    return representatives
def main():
    """Mark clusters of vector curves in ``demo.pdf`` and write ``demo-test.pdf``.

    For every page:
      1. pdfminer lays the page out and the bounding boxes of all
         LTCurve/LTLine objects are collected (text boxes are skipped).
      2. The boxes are merged into groups: UpdateGroups joins boxes that
         cross, GroupRects collapses each group to its bounding box, and the
         two steps repeat until a pass makes no change.
      3. A reportlab overlay is drawn with one blue rectangle (grown by
         ExtenRect) and the group number per group, then merged onto the
         page with pyPdf.

    The per-object and per-group rectangles are also printed to stdout.
    """
    # The pyPdf reader's stream is kept open until the output has been
    # written: the pages added to ``output`` come from this reader, and pyPdf
    # presumably still reads from the stream while writing -- confirm.
    with open("demo.pdf", "rb") as reader_stream:
        input1 = PdfFileReader(reader_stream)
        output = PdfFileWriter()

        with open("demo.pdf", 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for i, miner_page in enumerate(doc.get_pages()):
                if miner_page is None:
                    continue

                interpreter.process_page(miner_page)
                layout = device.get_result()

                # Same page, but as a pyPdf object that can be merged/written.
                page = input1.getPage(i)
                width, height = page.mediaBox.getWidth(), page.mediaBox.getHeight()
                packet = StringIO.StringIO()
                can = canvas.Canvas(packet, pagesize=(width, height))

                # Collect bounding boxes of vector graphics only.
                Curves = []
                for x in layout:
                    if isinstance(x, (LTTextBoxHorizontal, LTTextBox)):
                        continue
                    if isinstance(x, (LTCurve, LTLine)):
                        Curves.append(myRect(x.bbox))

                if len(Curves) > 0:
                    Curves = sorted(Curves, key=lambda d: d.left)
                    Curves = ConvertToGroup(Curves, i + 1)
                    tmpGroups = Curves
                    # Collapsing after every pass leaves one object per group,
                    # so each changing pass shortens the list and the loop
                    # terminates.
                    while UpdateGroups(tmpGroups):
                        tmpGroups = GroupRects(tmpGroups)

                    print("\n Page " + str(i + 1) + " Rects:")
                    for obj in Curves:
                        print("%s %s %s" % (obj.index, obj.group, obj.data))

                    print("\n Page " + str(i + 1) + " Group Rects:")
                    for obj in tmpGroups:
                        print("%s %s %s" % (obj.group, obj.data, obj.groupidx))
                        can.setStrokeColorRGB(0.2, 0.2, 1)
                        tmpRect = ExtenRect(obj.data)
                        can.rect(tmpRect.left, tmpRect.bottom,
                                 abs(tmpRect.right - tmpRect.left),
                                 abs(tmpRect.top - tmpRect.bottom),
                                 fill=0)
                        can.setFillColorRGB(0.2, 0.2, 1)
                        can.drawString(tmpRect.left, tmpRect.bottom, str(obj.group))
                    can.save()

                # Only merge when an overlay was actually rendered.
                if packet.len > 0:
                    packet.seek(0)
                    input2 = PdfFileReader(packet)
                    page.mergePage(input2.getPage(0))
                output.addPage(page)

            device.close()

        with open("demo-test.pdf", "wb") as outputStream:
            output.write(outputStream)
def ClipPdf(tmpRect, pageidx, group):
    """Write one page of ``demo.pdf``, cropped to ``tmpRect``, to its own file.

    Args:
        tmpRect: rectangle with ``left``/``top``/``right``/``bottom``
            attributes that becomes the page's media box.
        pageidx: 1-based page number inside ``demo.pdf``.
        group: group id; only used to build the output file name
            ``demo-P<pageidx>-G<group>.pdf``.
    """
    out_name = "demo-P" + str(pageidx) + "-G" + str(group) + ".pdf"
    # The source stream stays open until the output is written because the
    # page object still belongs to the reader.
    with open('demo.pdf', 'rb') as source:
        pdf = PdfFileReader(source)
        output = PdfFileWriter()
        page = pdf.pages[pageidx - 1]
        # lowerLeft takes the minimum corner and upperRight the maximum one;
        # assigning them the other way round yields an inverted media box.
        page.mediaBox.lowerLeft = (tmpRect.left, tmpRect.bottom)
        page.mediaBox.upperRight = (tmpRect.right, tmpRect.top)
        output.addPage(page)
        with open(out_name, "wb") as outputStream:
            output.write(outputStream)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
## global MergeRect
## GroupRect([myRect((1,2,3,4)),myRect((3,2,4,4)),myRect((12,2,11,4)),myRect((2,2,1,4)),myRect((13,2,15,4))])
## for tmpData in MergeRect:
## print tmpData
下图是处理完以后的效果,其中的蓝色文字和矩形框是处理完以后增加的。
后续处理:可以把区域中的对象转换成一个图片,最后替换原有的多个对象。