我终于发现 pyPdf 可以帮上忙。我把代码贴在这里,以防它能帮到别人。
# (1) Function that locates a string in a PDF.
def fnPDF_FindText(xFile, xString):
    """Return the index of the first page whose text matches ``xString``.

    The text extracted from each page is reduced to ASCII (characters that
    cannot be encoded are dropped) and lower-cased before it is searched, so
    an upper-case literal in ``xString`` cannot match.

    Args:
        xFile: Path of the PDF file in which to look.
        xString: The string to look for. It is passed to ``re.search`` and is
            therefore interpreted as a regular expression.

    Returns:
        The zero-based index of the first matching page, or -1 when no page
        matches.
    """
    import re

    import pyPdf

    # ``open`` inside ``with`` replaces the bare ``file(...)`` call so the
    # handle is closed on every exit path instead of being leaked.
    with open(xFile, "rb") as pdfStream:
        pdfDoc = pyPdf.PdfFileReader(pdfStream)
        for i in range(pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            content1 = content.encode("ascii", "ignore").lower()
            if re.search(xString, content1) is not None:
                # Stop at the first hit; later pages are not examined.
                return i
    return -1
# (2) Function that extracts the pages of interest into a new PDF.
def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
    """Copy a range of pages from one PDF file into a new PDF file.

    Args:
        xFileNameOriginal: Path of the PDF file to read pages from.
        xFileNameOutput: Path of the PDF file to write (opened in "wb" mode,
            so an existing file is overwritten).
        xPageStart: Zero-based index of the first page to copy.
        xPageEnd: Index one past the last page to copy (exclusive).
    """
    from pyPdf import PdfFileReader, PdfFileWriter

    output = PdfFileWriter()
    # The source stream is kept open until the output has been written, in
    # case the reader still needs it while writing (the original never closed
    # it at all); ``with`` now guarantees it is closed afterwards.
    with open(xFileNameOriginal, "rb") as inputStream:
        pdfOne = PdfFileReader(inputStream)
        for i in range(xPageStart, xPageEnd):
            output.addPage(pdfOne.getPage(i))
        # The output file is opened only after the pages were collected, so a
        # bad page index does not leave an empty output file behind.
        with open(xFileNameOutput, "wb") as outputStream:
            output.write(outputStream)
我希望这对其他人有帮助。