【Python】Python合并(指定目录)+切分PDF文件

1.使用PyPDF2:

from PyPDF2.pdf import PdfFileReader,PdfFileWriter
将文件夹中的pdf.py复制粘贴到D:\python3.6.5\Lib中:
使用pyPdf会报错如下:


2.利用上篇文章下载到的两篇pdf合并,会报错:

Traceback (most recent call last):
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 484, in readFromStream
    return NameObject(name.decode('utf-8'))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcb in position 8: invalid continuation byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/python tests/ZQfd_paiming/test.py", line 70, in <module>
    dl.merge_pdf()
  File "D:/python tests/ZQfd_paiming/test.py", line 60, in merge_pdf
    output.write(outputStream)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 482, in write
    self._sweepIndirectReferences(externalReferenceMap, self._root)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, data[i])
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
    self._sweepIndirectReferences(externMap, realdata)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
    value = self._sweepIndirectReferences(externMap, value)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 577, in _sweepIndirectReferences
    newobj = data.pdf.getObject(data)
  File "D:\python3.6.5\lib\PyPDF2\pdf.py", line 1611, in getObject
    retval = readObject(self.stream, self)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 66, in readObject
    return DictionaryObject.readFromStream(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 579, in readFromStream
    value = readObject(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 60, in readObject
    return NameObject.readFromStream(stream, pdf)
  File "D:\python3.6.5\lib\PyPDF2\generic.py", line 492, in readFromStream
    raise utils.PdfReadError("Illegal character in Name Object")
PyPDF2.utils.PdfReadError: Illegal character in Name Object

3.原来的pdf是1.5版本。先将该pdf转换为word,再在wps中把word导出为pdf,得到的pdf是1.7版本,此时合并不再报错。不过不能确定问题是否真由pdf版本引起:从上面的报错(NameObject按utf-8解码时遇到字节0xcb)来看,更像是pdf内部名称对象的编码问题,只是目前还没有找到查看pdf内部编码的方法。
# -*- coding:utf-8*-
from PyPDF2.pdf import PdfFileReader,PdfFileWriter
# Merge every page of the listed PDFs, in list order, into one output file.
infilelist=['D:/python tests/ZQfd_paiming/pdf/12.pdf',
            'D:/python tests/ZQfd_paiming/pdf/12.pdf']
pdffilewriter=PdfFileWriter()
# The writer still reads from the input streams while writing the merged
# document, so every input file has to stay open until the write is done.
open_inputs=[]
try:
    for infile in infilelist:
        stream=open(infile,'rb')
        open_inputs.append(stream)
        pdfreader=PdfFileReader(stream)
        for i in range(pdfreader.getNumPages()):
            pdffilewriter.addPage(pdfreader.getPage(i))
    # Write once, after all inputs have been collected (not once per input),
    # and let the context manager flush and close the output file.
    with open('D:/python tests/ZQfd_paiming/pdf/zong.pdf','wb') as outfile:
        pdffilewriter.write(outfile)
finally:
    for stream in open_inputs:
        stream.close()
#-*- coding:utf-8 -*-
from PyPDF2 import PdfFileReader,PdfFileWriter
import os
def split_pdf_1(pdf,start,end):     # extract the page range [start,end)
    """Copy pages ``start`` .. ``end - 1`` of ``pdf`` into a new PDF file.

    Page indices are zero-based and ``end`` is exclusive. The result is
    written next to the source file as ``<source without extension>_1_.pdf``.
    If the range is invalid a short message is printed and nothing is written.

    Args:
        pdf: Path of the source PDF file.
        start: Index of the first page to copy.
        end: Index one past the last page to copy.
    """
    # Checks that do not need the file run first, before anything is opened.
    if start<0:
        print('start<0')
        return
    if start>end:
        print('start>end')
        return
    # splitext strips only the final extension, so dots elsewhere in the path
    # (for example in a directory name) are preserved in the output name.
    out_name=os.path.splitext(pdf)[0]+'_1_.pdf'
    with open(pdf,'rb') as source:
        reader=PdfFileReader(source)
        page_count=reader.getNumPages()
        if end>page_count:
            print('end>page_count')
            return
        output=PdfFileWriter()
        for i in range(start,end):
            output.addPage(reader.getPage(i))
        # The source must still be open here: pages are read while writing.
        with open(out_name,'wb') as target:
            output.write(target)

def split_pdf_2(pdf,num):           # split into a given number of parts
    """Split ``pdf`` into ``num`` consecutive parts.

    Every part receives ``page_count // num`` pages; the last part also takes
    whatever pages remain. Part ``i`` is written next to the source file as
    ``<source without extension>_2_<i>.pdf``. If ``num`` is smaller than 2 a
    message is printed and nothing is written.

    Args:
        pdf: Path of the source PDF file.
        num: Number of parts to produce.
    """
    if num<2:
        print('切分份数需要大于1')
        return
    # splitext strips only the final extension, so dots elsewhere in the path
    # (for example in a directory name) are preserved in the output names.
    base=os.path.splitext(pdf)[0]
    with open(pdf,'rb') as source:
        reader=PdfFileReader(source)
        page_count=reader.getNumPages()
        per_page=page_count//num        # pages per part
        for i in range(num):
            first=per_page*i
            # The last part runs to the end of the document.
            last=per_page*(i+1) if i!=num-1 else page_count
            output=PdfFileWriter()
            for j in range(first,last):
                output.addPage(reader.getPage(j))
            with open(base+'_2_'+str(i)+'.pdf','wb') as target:
                output.write(target)

def split_pdf_3(pdf,per_page):       # split into chunks of per_page pages
    """Split ``pdf`` into consecutive chunks of ``per_page`` pages each.

    The last chunk holds the remaining pages and may be shorter. Chunk ``i``
    is written next to the source file as
    ``<source without extension>_3_<i>.pdf``. No empty trailing file is
    produced when the page count is an exact multiple of ``per_page``.
    If ``per_page`` is below 1 or exceeds the page count, a message is printed
    and nothing is written.

    Args:
        pdf: Path of the source PDF file.
        per_page: Number of pages per output file.
    """
    # Guard against zero/negative sizes before touching the file.
    if per_page<1:
        print('per_page<1')
        return
    # splitext strips only the final extension, so dots elsewhere in the path
    # (for example in a directory name) are preserved in the output names.
    base=os.path.splitext(pdf)[0]
    with open(pdf,'rb') as source:
        reader=PdfFileReader(source)
        page_count=reader.getNumPages()
        if per_page>page_count:
            print('per_page>page_count')
            return
        # Stepping by per_page yields exactly the non-empty chunks.
        for i,first in enumerate(range(0,page_count,per_page)):
            output=PdfFileWriter()
            for j in range(first,min(first+per_page,page_count)):
                output.addPage(reader.getPage(j))
            with open(base+'_3_'+str(i)+'.pdf','wb') as target:
                output.write(target)

def find_pdf(filepath):             # collect all PDF files below a directory
    """Return the paths of all PDF files found under ``filepath``.

    The directory tree is traversed with ``os.walk``. Only files whose name
    ends in ``.pdf`` (case-insensitive) are returned. Directories and file
    names are visited in sorted order so the result is deterministic.

    Args:
        filepath: Root directory of the search.

    Returns:
        A list of paths, each joined from the walked directory and file name.
    """
    pdf_list=[]
    for root,dirs,files in os.walk(filepath):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith('.pdf'):
                pdf_list.append(os.path.join(root,name))
    return pdf_list

def merge_pdf(filepath,outfile):
    """Merge the files returned by ``find_pdf(filepath)`` into ``outfile``.

    Each input path and its page count are printed, followed by the total
    number of pages and a final ``finished`` line. A file that is the same
    path as ``outfile`` is skipped, so a result left in the searched directory
    by an earlier run is not merged into itself.

    Args:
        filepath: Directory that is searched for input files.
        outfile: Path of the merged PDF to write.
    """
    output=PdfFileWriter()
    outputPages=0
    target=os.path.normcase(os.path.abspath(outfile))
    # The writer still reads from the input streams while writing, so the
    # inputs stay open until the merged file is complete.
    open_inputs=[]
    try:
        for each in find_pdf(filepath):
            # Opening outfile with "wb" truncates it; it must not be an input.
            if os.path.normcase(os.path.abspath(each))==target:
                continue
            print(each)
            # Open the source PDF.
            stream=open(each,"rb")
            open_inputs.append(stream)
            reader=PdfFileReader(stream)

            # Number of pages in this source PDF.
            page_count=reader.getNumPages()
            outputPages+=page_count
            print(page_count)

            # Append every page to the output document.
            for iPage in range(page_count):
                output.addPage(reader.getPage(iPage))

        print("All Pages Number:"+str(outputPages))
        # Finally write the merged PDF; the context manager closes it.
        with open(outfile,"wb") as outputStream:
            output.write(outputStream)
    finally:
        for stream in open_inputs:
            stream.close()
    print("finished")

if __name__=='__main__':
    # Demo run against local sample files; the page counts below are the ones
    # stated by the author for these files.
    four_page_pdf='D:/python tests/ZQfd_paiming/pdf/zong.pdf'          # 4 pages
    eight_page_pdf='D:/python tests/ZQfd_paiming/pdf/zong8.pdf'        # 8 pages
    sixteen_page_pdf='D:/python tests/ZQfd_paiming/pdf/zong16.pdf'     # 16 pages
    pdf_dir='D:/python tests/ZQfd_paiming/pdf/'
    merged_pdf='D:/python tests/ZQfd_paiming/pdf/zongZZ.pdf'

    # Extract pages [1,3) of zong.pdf; the result is saved with a "_1_" suffix.
    split_pdf_1(four_page_pdf,1,3)
    # Split the 8-page file into 3 parts of 2, 2 and 4 pages.
    split_pdf_2(eight_page_pdf,3)
    # Split the 16-page file into chunks of 3 pages: 3,3,3,3,3,1.
    split_pdf_3(sixteen_page_pdf,3)
    # Merge everything found in the sample directory into zongZZ.pdf.
    merge_pdf(pdf_dir,merged_pdf)


  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值