博主目前从事 Python 爬虫工作,当前任务需要将零散的图片逐张转成 PDF,然后再合并成一份 PDF 报告。
一开始想了很多办法,在网上搜索并参考了许多类似的文章,也查阅了官方文档,总算把这件事情搞定了。
闲话少说,直接上代码!
import os
import time
import json
import pdfkit
import shutil
import hashlib
import requests
from PIL import Image
from reportlab.pdfgen import canvas
from PyPDF2 import PdfFileReader, PdfFileWriter
from .common_def import file_path
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
class Img2Pdf(object):
    """Download images, convert each one to a single-page PDF and merge them.

    Intended call order: ``down`` -> ``run`` -> ``imgtopdf`` -> ``merge``.
    ``delete`` is optional cleanup and must only be called once every other
    file operation has finished.
    """

    def __init__(self, img_dir="C:\\data\\img\\"):
        """Create a converter.

        Args:
            img_dir: Directory that downloaded images are written to. The
                default is the location that used to be hard-coded in
                ``down``, so ``Img2Pdf()`` keeps working unchanged.
        """
        self.img_dir = img_dir

    def down(self, url):
        """Download ``url`` into ``img_dir`` and return the local file path.

        The file name is the MD5 hex digest of the URL plus ``.jpg``, so the
        same URL always maps to the same file.

        Args:
            url: Address of the image to fetch.

        Returns:
            The path of the saved image, or ``None`` when the request fails,
            the server answers with an error status, or the file cannot be
            written.
        """
        # exist_ok replaces the racy "exists() then makedirs()" pair.
        os.makedirs(self.img_dir, exist_ok=True)
        name = hashlib.md5(url.encode('utf8')).hexdigest() + ".jpg"
        new_filename = os.path.join(self.img_dir, name)

        # Network failures (timeout, DNS, refused connection) are reported
        # the same way as write failures instead of escaping to the caller.
        try:
            res = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print('图片下载错误', e)
            return None

        # Same check as the Response truthiness test, but explicit.
        if not res.ok:
            return None

        try:
            with open(new_filename, 'wb') as f:
                f.write(res.content)
        except OSError as e:
            print('图片下载错误', e)
            return None
        return new_filename

    def run(self, path_file):
        """Return the PDF output path for the image ``path_file``.

        The PDF goes into a ``pdf`` directory that sits next to the directory
        holding the image (``<parent>/img/a.jpg`` -> ``<parent>/pdf/a.pdf``).
        That directory is created when it does not exist yet.

        Args:
            path_file: Path of an existing image file.

        Returns:
            The output path as a string, or ``None`` when ``path_file`` is
            not an existing file.
        """
        if not os.path.isfile(path_file):
            return None
        filepath, tempfilename = os.path.split(path_file)
        filename = os.path.splitext(tempfilename)[0]
        # Plain "/" concatenation is kept so returned paths stay identical
        # to what callers received before.
        new_path = os.path.dirname(filepath) + "/pdf"
        os.makedirs(new_path, exist_ok=True)
        return new_path + '/' + filename + '.pdf'

    def imgtopdf(self, f_img, f_pdf):
        """Write the image ``f_img`` to ``f_pdf`` as a one-page PDF.

        The page size equals the image size in pixels, so the image fills
        the page exactly.

        Args:
            f_img: Path of the source image.
            f_pdf: Path of the PDF to create.

        Returns:
            ``f_pdf``.
        """
        # Only the dimensions are needed; closing the image right away
        # releases the file handle so the file can be deleted later.
        with Image.open(f_img) as img:
            w, h = img.size
        c = canvas.Canvas(f_pdf, pagesize=(w, h))
        c.drawImage(f_img, 0, 0, w, h)
        c.save()
        return f_pdf

    def merge(self, files, outfn):
        """Concatenate the PDFs in ``files`` into ``outfn``.

        Args:
            files: Paths of the PDFs to merge, in output order.
            outfn: Path of the merged PDF to write.

        Returns:
            ``outfn``.
        """
        pdf_output = PdfFileWriter()
        in_pdfs = []
        try:
            for infn in files:
                in_pdf = open(infn, 'rb')
                in_pdfs.append(in_pdf)
                pdf_input = PdfFileReader(in_pdf)
                for i in range(pdf_input.getNumPages()):
                    pdf_output.addPage(pdf_input.getPage(i))
            # The inputs stay open until the output is written, as in the
            # original ordering (presumably because page data is read
            # lazily from the source streams - confirm in PyPDF2 docs).
            with open(outfn, 'wb') as f:
                pdf_output.write(f)
        finally:
            # Close every input even when reading or writing failed.
            for in_pdf in in_pdfs:
                in_pdf.close()
        return outfn

    def delete(self, path):
        """Best-effort recursive removal of the directory ``path``.

        A missing path is ignored. A failure to remove it is reported on
        stdout but never raised, so cleanup cannot break the caller.

        Args:
            path: Directory to remove together with its contents.
        """
        if not os.path.exists(path):
            return
        try:
            shutil.rmtree(path)
        except OSError as e:
            print('目录删除错误', e)
所有的功能都封装在了一个类中。
本人不喜欢多写注释,各位多担待。各方法的功能与方法名的字面意思一致,调用顺序与定义顺序相同,即 down → run → imgtopdf → merge → delete。
比如我在 Scrapy 框架的 pipeline 文件中是这样调用的:
class ImageToPdfPipeline(object):
    """Scrapy item pipeline that converts an item's images into one PDF."""

    def __init__(self):
        self.img = Img2Pdf()

    def process_item(self, item, spider):
        """Download the item's images and merge them into a single PDF.

        Items with fewer than three image URLs under ``'img'`` are passed
        through untouched. Otherwise every image that downloads successfully
        is converted to a one-page PDF, the pages are merged into the path
        given by ``file_path(item)``, and ``item['download_status']`` is set
        to ``1``.

        Args:
            item: The scraped item; read via ``item.get('img')``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines always receive an item
            rather than ``None``.
        """
        # "or []" also covers a missing key and an explicit None value,
        # both of which used to crash in len().
        urls = item.get('img') or []
        if len(urls) < 3:
            return item

        pdfs = []
        for url in urls:
            img_path = self.img.down(url)
            if not img_path:
                continue
            pdf_path = self.img.run(img_path)
            if not pdf_path:
                # No output location could be derived; skip this image.
                continue
            pdfs.append(self.img.imgtopdf(img_path, pdf_path))

        # Merge only when at least one page exists, so an empty PDF is never
        # written and reported as a successful download.
        if pdfs and self.img.merge(pdfs, file_path(item)):
            item['download_status'] = 1
        return item
其中 file_path() 方法用于指定合并后 PDF 文件的保存路径,请根据自己的项目自行实现。
需要注意的一点:删除文件(即调用 delete() 方法)必须等所有文件操作都已完成之后才能进行。