PdfFileWriter().write() UnicodeEncodeError: 'latin-1' codec can't encode characters in position 8-9

最新推荐文章于 2022-12-03 20:20:01 发布

qq_42345446

最新推荐文章于 2022-12-03 20:20:01 发布

阅读量2.9k

点赞数 2

分类专栏：报错 python 文章标签：报错

本文链接：https://blog.csdn.net/qq_42345446/article/details/100049918

版权

python 同时被 2 个专栏收录

15 篇文章 0 订阅

订阅专栏

报错

7 篇文章 0 订阅

订阅专栏

这个 Python 小脚本是用来将 PDF 转化成图片的。

看样子有两种方法：一是 Wand，二是 pdf2image。
我用的pdf2image，因为这个pdf大小不定，少的几十页，多的可能有几千页，
在用pdf2image时大致有两种写法，第一种是：

from pdf2image import convert_from_path
# Convert a.pdf at 500 DPI to JPEG, writing the results into the "output"
# folder with "ok" as the output file name, using thread_count=4.
convert_from_path(
    'a.pdf',
    dpi=500,
    output_folder="output",
    fmt="JPEG",
    output_file="ok",
    thread_count=4,
)

这种转换是直接写入到磁盘上的，因此不会占用太多内存。

另一种写法是：
from pdf2image import convert_from_path
pages = convert_from_path('pdf_file', 500)
# Give every page its own file name; saving each page under one fixed name
# would overwrite the previous page and leave only the last one on disk.
for page_index, page in enumerate(pages):
    page.save('out_{}.jpg'.format(page_index), 'JPEG')
    
但这种写法会占用大量内存：一是因为convert_from_path的默认格式是ppm，二是因为若不指定输出目录（output_folder），转换结果默认全部保存在内存中。

很显然第一种适合我，但是第一种运行竟然什么都没有输出，也没有报错，也没有结果，what the fc**
在这里插入图片描述

有个小伙伴是这么写的：

from PyPDF2 import PdfFileReader, PdfFileWriter
from pdf2image import convert_from_path, convert_from_bytes
import os
import tempfile
import cv2
import re
def split_pdf(infile, out_path, temp):
    """
    Split a PDF into single-page PDFs, one page at a time.

    This is a generator. For each page it overwrites ``temp`` with a PDF
    that contains only that page and then yields the page's 0-based index.
    The module-level ``number`` is set to the same index before each write.

    :param infile: path of the PDF file to split
    :param out_path: directory that is created if it does not exist yet
                     (the single-page PDF itself is written to ``temp``)
    :param temp: path of the file that receives the current single-page PDF
    :return: generator yielding the 0-based index of each page
    """
    global number

    if not os.path.exists(out_path):
        os.makedirs(out_path)

    with open(infile, 'rb') as source:
        reader = PdfFileReader(source)

        # getNumPages() is the number of pages in this PDF.
        for i in range(reader.getNumPages()):
            number = i  # published as a module global for code that reads it

            # getPage(i) is used as-is. Per the original author's note, an
            # earlier "- 1" here made index 0 map to the last page and
            # shifted every other page by one.
            writer = PdfFileWriter()
            writer.addPage(reader.getPage(i))

            # Close the output file before yielding so the consumer reads a
            # completely written single-page PDF.
            with open(temp, 'wb') as single_page:
                writer.write(single_page)
            yield i

def turn_picture(in_File, out_Path, dpi=10):
    """
    Convert every page of a PDF into a PNG image stored in ``out_Path``.

    Each page is first written to a single-page PDF by ``split_pdf`` and
    then rendered with ``convert_from_path``. The intermediate PDF lives in
    a temporary directory that is removed when the conversion finishes.

    Images are named ``<prefix>_<page index>.png``, where ``<prefix>`` is the
    leading run of characters of the input file name matched by the pattern
    ``[0-9|a-z]*`` (it can be empty) and the page index is 0-based.

    :param in_File: path of the PDF file to convert
    :param out_Path: directory that receives the PNG images
    :param dpi: resolution passed to ``convert_from_path`` (default 10)
    :return: None
    """
    # The prefix depends only on the input file name, so compute it once.
    # NOTE(review): files whose names start with the same (or an empty)
    # prefix end up with colliding image names - choose names with care.
    prefix = re.compile("[0-9|a-z]*").findall(os.path.basename(in_File))[0]

    # One temporary directory holds the intermediate single-page PDF for the
    # whole run instead of relying on a machine-specific absolute path.
    with tempfile.TemporaryDirectory() as path:
        temp = os.path.join(path, 'temp.pdf')
        for i in split_pdf(in_File, out_Path, temp):
            images = convert_from_path(temp, dpi=dpi)
            for index, img in enumerate(images):
                # Use the index yielded by split_pdf rather than a shared
                # global counter.
                page_no = i + index
                img.save(os.path.join(out_Path, prefix + "_" + str(page_no) + ".png"))
                print('已经转化' + str(page_no + 1))

if __name__ == '__main__':
    pdf_dir = '/home/zh/local/flowChart/pdfs/'
    out_Path = '/home/zh/local/flowChart/pic/'  # output folder for the generated images

    # Convert every entry found in the input directory, printing its path
    # before the conversion starts.
    for entry in os.listdir(pdf_dir):
        in_File = os.path.join(pdf_dir, entry)
        print(in_File)
        turn_picture(in_File, out_Path)

竟然能运行……但是报错：

/home/zh/local/flowChart/pdfs/2015年全国大学生数学建模竞赛A题优秀论文太阳影子定位模型教程.pdf
Traceback (most recent call last):
  File "/home/zh/local/flowChart/code/PdfToPic.py", line 130, in <module>
    turn_picture(in_File, out_Path)
  File "/home/zh/local/flowChart/code/PdfToPic.py", line 113, in turn_picture
    for i in split_pdf(in_File, out_Path, temp):
  File "/home/zh/local/flowChart/code/PdfToPic.py", line 101, in split_pdf
    writer.write(outfile)
  File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/pdf.py", line 501, in write
    obj.writeToStream(stream, key)
  File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/generic.py", line 549, in writeToStream
    value.writeToStream(stream, encryption_key)
  File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/generic.py", line 472, in writeToStream
    stream.write(b_(self))
  File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/utils.py", line 238, in b_
    r = s.encode('latin-1')
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 8-9: ordinal not in range(256)

参考 http://www.aiuxian.com/article/p-1985272.html 把源码改了，
然后竟然行了。。。。。
在这里插入图片描述