这个小python是用来将pdf转化成图片的,
看样子有两种方法:一是Wand,二是pdf2image。
我用的pdf2image,因为这个pdf大小不定,少的几十页,多的可能有几千页,
用pdf2image时大致有两种写法,第一种是:
# Variant 1: hand pdf2image an output folder so the rendered pages are
# written to disk as files instead of being returned in memory.
# The positional arguments are the PDF path, 500 and "output"; presumably 500
# is the DPI and "output" is the output folder — TODO confirm against the
# pdf2image signature.
# NOTE(review): nothing here creates the "output" folder; presumably it must
# already exist, which may explain a run that silently produces no files —
# verify.
from pdf2image import convert_from_path
convert_from_path('a.pdf', 500, "output",fmt="JPEG",output_file="ok",thread_count=4)
这种转换是直接写入到磁盘上的,因此不会占用太多内存。
另一种写法是:
# Variant 2: let pdf2image return the rendered pages and save each one
# ourselves.
from pdf2image import convert_from_path

pages = convert_from_path('pdf_file', 500)
for index, page in enumerate(pages):
    # One file per page: a fixed file name would be overwritten on every
    # iteration and only the last page would survive.
    page.save('out_{}.jpg'.format(index), 'JPEG')
但这种写法会占用大量内存,因为convert_from_path的默认格式是ppm,其次若不指定输出则默认是写入到内存中的。
很显然第一种适合我,但是第一种运行竟然什么都没有输出,也没有报错,也没有结果,what the fc**
有个小伙伴是这么写的:
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdf2image import convert_from_path, convert_from_bytes
import os
import tempfile
import cv2
import re
def split_pdf(infile, out_path, temp):
    """Split a PDF into single pages, handing them out one at a time.

    This is a generator. Each step overwrites ``temp`` with a one-page PDF
    containing the next page of ``infile`` and then yields that page's
    zero-based index, so the caller has to consume ``temp`` before asking
    for the next page.

    :param infile: path of the PDF file to split
    :param out_path: output directory; created if it does not exist yet
    :param temp: path of the scratch PDF that receives the current page
    :return: generator yielding the zero-based index of the page that was
        just written to ``temp``
    """
    # Kept for backward compatibility: the module-level ``number`` mirrors
    # the index of the page currently being written, and other code in this
    # module may read it.
    global number
    os.makedirs(out_path, exist_ok=True)
    # The source stream stays open for the whole loop because pages are
    # fetched from the reader while it is iterated.
    with open(infile, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        for page_index in range(reader.getNumPages()):
            number = page_index
            writer = PdfFileWriter()
            # Pages are addressed from 0: the first page is index 0.
            writer.addPage(reader.getPage(page_index))
            with open(temp, 'wb') as page_stream:
                writer.write(page_stream)
            yield page_index
def turn_picture(in_File, out_Path, dpi=10):
    """Render every page of a PDF to its own PNG file in ``out_Path``.

    The PDF is processed one page at a time: ``split_pdf`` writes the
    current page to a scratch PDF, which is then rendered with
    ``convert_from_path``. The scratch PDF lives in a temporary directory
    that is removed when the function returns.

    Output files are named ``<prefix>_<page index>.png``. ``<prefix>`` is the
    leading run of digits, lowercase ASCII letters or ``|`` in the file name
    of ``in_File``; when that run is empty, the file name without its
    extension is used instead so that different PDFs do not overwrite each
    other's images.

    :param in_File: path of the PDF file to convert
    :param out_Path: directory that receives the PNG files
    :param dpi: resolution passed to ``convert_from_path`` (default 10)
    :return: None
    """
    base_name = os.path.basename(in_File)
    # The pattern can match the empty string, so findall always returns at
    # least one item and [0] is the match at the start of the name.
    prefix = re.compile("[0-9|a-z]*").findall(base_name)[0]
    if not prefix:
        prefix = os.path.splitext(base_name)[0]

    with tempfile.TemporaryDirectory() as scratch_dir:
        temp = os.path.join(scratch_dir, 'temp.pdf')
        for i in split_pdf(in_File, out_Path, temp):
            images = convert_from_path(temp, dpi=dpi)
            # Numbering restarts from the page index yielded by split_pdf,
            # so the first page is saved as <prefix>_0.png.
            page_no = i
            for img in images:
                img.save(os.path.join(out_Path, prefix + "_" + str(page_no) + ".png"))
                page_no += 1
                print('已经转化' + str(page_no))
if __name__ == '__main__':
    # Convert every PDF found in the input folder into PNG images.
    pdf_dir = '/home/zh/local/flowChart/pdfs/'
    out_Path = '/home/zh/local/flowChart/pic/'  # output folder for the images
    for file_name in sorted(os.listdir(pdf_dir)):
        # Skip anything that is not a PDF; feeding other directory entries
        # to the PDF reader would abort the whole run.
        if not file_name.lower().endswith('.pdf'):
            continue
        in_File = os.path.join(pdf_dir, file_name)
        print(in_File)
        turn_picture(in_File, out_Path)
竟然能运行,,,但是报错
/home/zh/local/flowChart/pdfs/2015年全国大学生数学建模竞赛A题优秀论文太阳影子定位模型教程.pdf
Traceback (most recent call last):
File "/home/zh/local/flowChart/code/PdfToPic.py", line 130, in <module>
turn_picture(in_File, out_Path)
File "/home/zh/local/flowChart/code/PdfToPic.py", line 113, in turn_picture
for i in split_pdf(in_File, out_Path, temp):
File "/home/zh/local/flowChart/code/PdfToPic.py", line 101, in split_pdf
writer.write(outfile)
File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/pdf.py", line 501, in write
obj.writeToStream(stream, key)
File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/generic.py", line 549, in writeToStream
value.writeToStream(stream, encryption_key)
File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/generic.py", line 472, in writeToStream
stream.write(b_(self))
File "/home/zh/anaconda3/lib/python3.7/site-packages/PyPDF2/utils.py", line 238, in b_
r = s.encode('latin-1')
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 8-9: ordinal not in range(256)
参考http://www.aiuxian.com/article/p-1985272.html把源码改了,
然后竟然行了。。。。。