1.使用PyPDF2:
from PyPDF2.pdf import PdfFileReader,PdfFileWriter
将文件夹中的pdf.py复制粘贴到D:\python3.6.5\Lib中:
使用pyPdf会报错如下:
2.利用上篇文章下载到的两篇pdf合并,会报错:
Traceback (most recent call last):
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 484, in readFromStream
    return NameObject(name.decode('utf-8'))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcb in position 8: invalid continuation byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/python tests/ZQfd_paiming/test.py", line 70, in <module>
    dl.merge_pdf()
  File "D:/python tests/ZQfd_paiming/test.py", line 60, in merge_pdf
    output.write(outputStream)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 482, in write
    self._sweepIndirectReferences(externalReferenceMap, self._root)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, data[i])
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 577, in _sweepIndirectReferences
    newobj = data.pdf.getObject(data)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 1611, in getObject
    retval = readObject(self.stream, self)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 66, in readObject
    return DictionaryObject.readFromStream(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 579, in readFromStream
    value = readObject(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 60, in readObject
    return NameObject.readFromStream(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 492, in readFromStream
    raise utils.PdfReadError("Illegal character in Name Object")
PyPDF2.utils.PdfReadError: Illegal character in Name Object
3.原来的pdf是1.5版本。先将pdf转换为word,再在wps中将word导出为pdf,这时得到的pdf是1.7版本,合并时不再报错。目前还不能确定问题是否由pdf版本引起;从报错信息看,更可能是pdf内部的编码问题(Name对象中含有无法按utf-8解码的字节),但没有找到查看pdf编码的方式。
# -*- coding:utf-8*-
from PyPDF2.pdf import PdfFileReader,PdfFileWriter
# Merge every PDF listed in infilelist, in list order, into one output file.
infilelist = ['D:/python tests/ZQfd_paiming/pdf/12.pdf',
              'D:/python tests/ZQfd_paiming/pdf/12.pdf']
pdffilewriter = PdfFileWriter()
# Source files must stay open until the merged file has been written
# (PdfFileWriter.write() still reads objects from the readers' streams),
# so they are collected here and closed explicitly afterwards instead of
# being left dangling.
_open_inputs = []
try:
    for infile in infilelist:
        _handle = open(infile, 'rb')
        _open_inputs.append(_handle)
        pdfreader = PdfFileReader(_handle)
        numpages = pdfreader.getNumPages()
        for i in range(numpages):
            pageobj = pdfreader.getPage(i)
            pdffilewriter.addPage(pageobj)
    # The context manager flushes and closes the output even if write() fails.
    with open('D:/python tests/ZQfd_paiming/pdf/zong.pdf', 'wb') as _outfile:
        pdffilewriter.write(_outfile)
finally:
    for _handle in _open_inputs:
        _handle.close()
#-*- coding:utf-8 -*-
from PyPDF2 import PdfFileReader,PdfFileWriter
import os
def split_pdf_1(pdf, start, end):
    """Copy pages [start, end) of ``pdf`` into a new PDF next to it.

    Page numbers are zero-based. The output path is the input path with its
    final extension replaced by ``_1_.pdf``.

    Args:
        pdf: Path of the source PDF file.
        start: Index of the first page to copy (inclusive).
        end: Index one past the last page to copy (exclusive).

    Returns:
        None. For an invalid page range a short message is printed and no
        file is written.
    """
    # Validate what can be validated before touching the file system.
    if start < 0:
        print('start<0')
        return
    if start > end:
        print('start>end')
        return
    with open(pdf, 'rb') as source:
        reader = PdfFileReader(source)
        page_count = reader.getNumPages()
        if end > page_count:
            print('end>page_count')
            return
        output = PdfFileWriter()
        for i in range(start, end):
            output.addPage(reader.getPage(i))
        # splitext strips only the final extension, so dots elsewhere in the
        # path (directory names, "report.v2.pdf") are preserved.
        out_name = os.path.splitext(pdf)[0] + '_1_.pdf'
        # Written while the source is still open: the writer reads page
        # content from the source stream during write().
        with open(out_name, 'wb') as target:
            output.write(target)
def split_pdf_2(pdf, num):
    """Split ``pdf`` into ``num`` consecutive parts.

    Each part receives ``page_count // num`` pages, except the last one,
    which also takes the remainder. If ``num`` exceeds the page count the
    leading parts are therefore empty. Part ``i`` is written to the input
    path with its final extension replaced by ``_2_<i>.pdf``.

    Args:
        pdf: Path of the source PDF file.
        num: Number of parts to produce; must be at least 2.

    Returns:
        None. If ``num`` is below 2 a message is printed and nothing is
        written.
    """
    if num < 2:
        print('切分份数需要大于1')
        return
    # splitext strips only the final extension, so other dots in the path
    # are preserved.
    stem = os.path.splitext(pdf)[0]
    with open(pdf, 'rb') as source:
        reader = PdfFileReader(source)
        page_count = reader.getNumPages()
        per_page = page_count // num  # pages in every part but the last
        for i in range(num):
            first = per_page * i
            # The final part runs to the end so leftover pages are not lost.
            last = page_count if i == num - 1 else per_page * (i + 1)
            output = PdfFileWriter()
            for j in range(first, last):
                output.addPage(reader.getPage(j))
            with open(stem + '_2_' + str(i) + '.pdf', 'wb') as target:
                output.write(target)
def split_pdf_3(pdf, per_page):
    """Split ``pdf`` into consecutive parts of ``per_page`` pages each.

    The last part holds whatever pages remain and may be shorter. No empty
    trailing part is produced when the page count is an exact multiple of
    ``per_page``. Part ``i`` is written to the input path with its final
    extension replaced by ``_3_<i>.pdf``.

    Args:
        pdf: Path of the source PDF file.
        per_page: Number of pages per part; must be at least 1.

    Returns:
        None. If ``per_page`` is below 1 or larger than the page count a
        message is printed and nothing is written.
    """
    # Reject non-positive sizes up front; they cannot describe a chunk and
    # would otherwise fail with a division/range error further down.
    if per_page < 1:
        print('per_page<1')
        return
    # splitext strips only the final extension, so other dots in the path
    # are preserved.
    stem = os.path.splitext(pdf)[0]
    with open(pdf, 'rb') as source:
        reader = PdfFileReader(source)
        page_count = reader.getNumPages()
        if per_page > page_count:
            print('per_page>page_count')
            return
        # Stepping by per_page yields every full chunk plus, only when pages
        # are left over, one final shorter chunk.
        for i, first in enumerate(range(0, page_count, per_page)):
            last = min(first + per_page, page_count)
            output = PdfFileWriter()
            for j in range(first, last):
                output.addPage(reader.getPage(j))
            with open(stem + '_3_' + str(i) + '.pdf', 'wb') as target:
                output.write(target)
def find_pdf(filepath):
    """Return the paths of all PDF files found under ``filepath``.

    The directory tree is traversed with ``os.walk``; a file counts as a PDF
    when its name ends in ``.pdf`` (case-insensitive). Other files are
    ignored.

    Args:
        filepath: Root directory to search.

    Returns:
        A list of paths, each built by joining the containing directory and
        the file name, in the order ``os.walk`` reports them.
    """
    pdf_list = []
    for root, _dirs, files in os.walk(filepath):
        for name in files:
            # Only PDFs are wanted; the directory may hold other files too.
            if name.lower().endswith('.pdf'):
                pdf_list.append(os.path.join(root, name))
    return pdf_list
def merge_pdf(filepath, outfile):
    """Merge the files returned by ``find_pdf(filepath)`` into ``outfile``.

    Files are appended in the order ``find_pdf`` returns them. Each source
    path and its page count are printed, followed by the total page count
    and a final "finished" line.

    Args:
        filepath: Directory that is searched for source files.
        outfile: Path of the merged PDF to write.

    Returns:
        None.
    """
    output = PdfFileWriter()
    outputPages = 0
    # The destination may live inside the searched directory. Opening it for
    # writing truncates it, so it must never also be used as a source.
    target_key = os.path.normcase(os.path.abspath(outfile))
    # Sources have to stay open until the merged file is written because the
    # writer reads page content from them during write(); they are closed in
    # the finally clause.
    sources = []
    try:
        for each in find_pdf(filepath):
            if os.path.normcase(os.path.abspath(each)) == target_key:
                continue
            print(each)
            # Read the source PDF.
            handle = open(each, "rb")
            sources.append(handle)
            reader = PdfFileReader(handle)
            # Number of pages in this source.
            page_count = reader.getNumPages()
            outputPages += page_count
            print(page_count)
            # Append every page to the output document.
            for iPage in range(page_count):
                output.addPage(reader.getPage(iPage))
        print("All Pages Number:" + str(outputPages))
        # Write the merged document; the stream is closed even on failure.
        with open(outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for handle in sources:
            handle.close()
    print("finished")
if __name__=='__main__':
    # Demo run. According to the original notes the three sample files have
    # 4, 8 and 16 pages respectively.

    # Extract pages [1,3) of zong.pdf into a new file ending in "_1_.pdf".
    split_pdf_1('D:/python tests/ZQfd_paiming/pdf/zong.pdf',1,3)

    # Cut the 8-page zong8.pdf into 3 parts of 2, 2 and 4 pages.
    split_pdf_2('D:/python tests/ZQfd_paiming/pdf/zong8.pdf',3)

    # Cut the 16-page zong16.pdf into 3-page parts: 3,3,3,3,3 and 1 pages.
    split_pdf_3('D:/python tests/ZQfd_paiming/pdf/zong16.pdf',3)

    # Merge everything found in the pdf directory into zongZZ.pdf.
    merge_pdf('D:/python tests/ZQfd_paiming/pdf/','D:/python tests/ZQfd_paiming/pdf/zongZZ.pdf')