专题五《Python操作文件(txt,csv,word,pdf,图片等)》这篇博客已经介绍了基本的文件操作;本篇主要介绍PDF的操作,并结合Django做成一个实际的案例。
项目介绍
我们的开发思路如下:设计一个表单,让用户上传PDF文件并输入需要提取的页码。服务器收到PDF文件后,使用PyPDF2读取用户上传的文件,提取所需的页面,然后通过FileResponse将新生成的PDF文件经浏览器返回给用户。合并与替换功能的流程与此类似。项目包含以下功能:
页面提取
页面合并
页面替换
需要模块
PyPDF2(示例代码使用PdfFileReader、PdfFileWriter、PdfFileMerger等旧版接口,需安装3.0之前的版本)
django
项目创建
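django-admin startproject mysite
cd mysite
python manage.py startapp pdf
创建应用后,需要在mysite/settings.py的INSTALLED_APPS中加入'pdf';模板放在pdf/templates/pdf/目录下时,Django才能找到它们。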
设计URL
# mysite/urls.py
from django.contrib import admin
from django.urls import include, path

# Project-level routes: the Django admin, plus everything under pdf/ which is
# delegated to the pdf app's own URLconf.
urlpatterns = [
    path('admin/', admin.site.urls),
    path('pdf/', include('pdf.urls')),
]
# pdf/urls.py
from django.urls import path

from . import views

# URL namespace, so templates can reverse routes as 'pdf:<name>'.
app_name = 'pdf'

# Every route takes an uploaded PDF through a form and answers with the
# processed document.
urlpatterns = [
    # Extract individual pages.
    path(
        'extract/single/',
        views.pdf_single_page_extract,
        name='pdf_single_page_extract',
    ),
    # Extract a contiguous range of pages.
    path(
        'extract/range/',
        views.pdf_range_extract,
        name='pdf_range_extract',
    ),
    # Merge several PDFs into one.
    path(
        'merge/',
        views.pdf_merge,
        name='pdf_merge',
    ),
    # Replace one page of a PDF.
    path(
        'replace/',
        views.pdf_replace,
        name='pdf_replace',
    ),
]
Form表单信息
#pdf/forms.py
from django import forms
class PdfExtractForm(forms.Form):
    """Form shared by the extraction views: one uploaded file plus a page text."""

    file = forms.FileField(label="Upload PDF Document")
    page = forms.CharField(label="Page Number", max_length=20)

    def clean_file(self):
        """Return the uploaded file, rejecting names that do not end in .pdf.

        Raises:
            forms.ValidationError: if the file name (case-insensitively) lacks
                the ``.pdf`` extension.
        """
        uploaded = self.cleaned_data.get('file')
        has_pdf_extension = uploaded.name.lower().endswith('.pdf')
        if has_pdf_extension:
            return uploaded
        raise forms.ValidationError("Only pdf documents are allowed. ")
class PdfMergeForm(forms.Form):
    """Form for merging up to five PDFs; only the first file is required."""

    file1 = forms.FileField(label="PDF file 1")
    file2 = forms.FileField(label="PDF file 2", required=False)
    file3 = forms.FileField(label="PDF file 3", required=False)
    file4 = forms.FileField(label="PDF file 4", required=False)
    file5 = forms.FileField(label="PDF file 5", required=False)

    def clean(self):
        """Attach an error to every supplied file whose name lacks .pdf.

        Mirrors the extension check in ``PdfExtractForm.clean_file`` so a
        non-PDF upload is reported on the form instead of reaching PyPDF2.
        """
        cleaned_data = super().clean()
        for field_name in ('file1', 'file2', 'file3', 'file4', 'file5'):
            # Optional slots left empty (and fields that already failed
            # validation) have no usable value here and are skipped.
            uploaded = cleaned_data.get(field_name)
            if uploaded and not uploaded.name.lower().endswith('.pdf'):
                self.add_error(field_name, "Only pdf documents are allowed. ")
        return cleaned_data
class PdfReplaceForm(forms.Form):
    """Form for replacing one page of a PDF with the contents of another PDF."""

    file1 = forms.FileField(label="Replacement page")
    file2 = forms.FileField(label="PDF document to be replaced")
    # Page numbers are 1-based, so zero and negative values are rejected here.
    page = forms.IntegerField(label="Replace page number", min_value=1)

    def clean(self):
        """Attach an error to either upload whose name lacks .pdf.

        Mirrors the extension check in ``PdfExtractForm.clean_file`` so a
        non-PDF upload is reported on the form instead of reaching PyPDF2.
        """
        cleaned_data = super().clean()
        for field_name in ('file1', 'file2'):
            # A field that already failed validation is absent from
            # cleaned_data and is skipped.
            uploaded = cleaned_data.get(field_name)
            if uploaded and not uploaded.name.lower().endswith('.pdf'):
                self.add_error(field_name, "Only pdf documents are allowed. ")
        return cleaned_data
视图函数
#包
import io
import os
import zipfile

import PyPDF2
from django.http import FileResponse
from django.shortcuts import render

from pdf.forms import PdfExtractForm, PdfMergeForm, PdfReplaceForm
def pdf_single_page_extract(request):
    """Extract individual pages from an uploaded PDF and return them as a zip.

    A non-POST request renders an empty ``PdfExtractForm``. A POST expects a
    PDF upload and a comma-separated list of 1-based page numbers (for example
    ``3`` or ``1,3,5``). Each requested page becomes its own single-page PDF
    inside ``extracted_pages.zip``, which is sent back as a download.

    Unparseable or out-of-range page numbers, and any other form error,
    re-render the form with the error attached.
    """
    if request.method == 'POST':
        form = PdfExtractForm(request.POST, request.FILES)
        if form.is_valid():
            # Legacy (pre-3.0) PyPDF2 API, matching the rest of this module.
            pdf_reader = PyPDF2.PdfFileReader(form.cleaned_data['file'])
            total_pages = pdf_reader.getNumPages()

            # Always split on commas and drop blanks, so "12", "3," and
            # "1, 3, 5" are each read as whole page numbers.
            tokens = [t.strip() for t in form.cleaned_data['page'].split(',')]
            try:
                requested = [int(t) for t in tokens if t]
            except ValueError:
                requested = []

            # Drop repeats (keeping first-seen order) so the archive never
            # contains two entries with the same name.
            page_numbers = []
            for number in requested:
                if number not in page_numbers:
                    page_numbers.append(number)

            if not page_numbers:
                form.add_error(
                    'page',
                    "Enter page numbers separated by commas, e.g. 1 or 3,5,9.",
                )
            elif min(page_numbers) < 1 or max(page_numbers) > total_pages:
                form.add_error(
                    'page',
                    "Page numbers must be between 1 and {}.".format(total_pages),
                )
            else:
                # Build everything in memory: fixed file names on disk would be
                # shared, and overwritten, by concurrent requests.
                archive = io.BytesIO()
                with zipfile.ZipFile(archive, 'w') as zf:
                    for page_num in page_numbers:
                        writer = PyPDF2.PdfFileWriter()
                        # PyPDF2 page indexes are 0-based.
                        writer.addPage(pdf_reader.getPage(page_num - 1))
                        page_buffer = io.BytesIO()
                        writer.write(page_buffer)
                        zf.writestr(
                            'extracted_page_{}.pdf'.format(page_num),
                            page_buffer.getvalue(),
                        )
                archive.seek(0)
                # content_type must be passed to the response; assigning
                # response['content_type'] only adds an unrelated header.
                response = FileResponse(archive, content_type="application/zip")
                response['Content-Disposition'] = 'attachment; filename="extracted_pages.zip"'
                return response
    else:
        form = PdfExtractForm()
    # Reached for non-POST requests and for POSTs with errors; in the latter
    # case the bound form carries the messages to display.
    return render(request, 'pdf/pdf_extract.html', {'form': form})
def pdf_range_extract(request):
    """Extract a contiguous page range from an uploaded PDF.

    A non-POST request renders an empty ``PdfExtractForm``. A POST expects a
    PDF upload and a 1-based inclusive range written as ``start-end`` (for
    example ``2-8``). The selected pages are returned as a single PDF download
    named ``extracted_pages.pdf``.

    A malformed range, a range outside the document, or any other form error
    re-renders the form with the error attached.
    """
    if request.method == 'POST':
        form = PdfExtractForm(request.POST, request.FILES)
        if form.is_valid():
            # Legacy (pre-3.0) PyPDF2 API, matching the rest of this module.
            pdf_reader = PyPDF2.PdfFileReader(form.cleaned_data['file'])
            total_pages = pdf_reader.getNumPages()

            # partition() never raises: a missing '-' leaves end_text empty,
            # which int() then rejects below.
            start_text, _, end_text = form.cleaned_data['page'].partition('-')
            try:
                page_start, page_end = int(start_text), int(end_text)
            except ValueError:
                form.add_error('page', "Enter a page range such as 2-8.")
            else:
                if 1 <= page_start <= page_end <= total_pages:
                    writer = PyPDF2.PdfFileWriter()
                    # PyPDF2 page indexes are 0-based; the range is inclusive.
                    for page_index in range(page_start - 1, page_end):
                        writer.addPage(pdf_reader.getPage(page_index))
                    # Write to a fresh in-memory buffer. Appending to a file
                    # on disk would concatenate a second PDF onto output left
                    # behind by an earlier request for the same range.
                    output = io.BytesIO()
                    writer.write(output)
                    output.seek(0)
                    # content_type must be passed to the response; assigning
                    # response['content_type'] only adds an unrelated header.
                    response = FileResponse(output, content_type="application/pdf")
                    response['Content-Disposition'] = 'attachment; filename="extracted_pages.pdf"'
                    return response
                form.add_error(
                    'page',
                    "The range must lie between 1 and {} and start before "
                    "it ends.".format(total_pages),
                )
    else:
        form = PdfExtractForm()
    # Reached for non-POST requests and for POSTs with errors; in the latter
    # case the bound form carries the messages to display.
    return render(request, 'pdf/pdf_range_extract.html', {'form': form})
def pdf_merge(request):
    """Merge up to five uploaded PDFs, in field order, into one document.

    A non-POST request renders an empty ``PdfMergeForm``. A valid POST returns
    the merged document as a download named ``merged_file.pdf``; empty optional
    slots are skipped. An invalid POST re-renders the bound form so its
    validation errors are shown.
    """
    if request.method == 'POST':
        form = PdfMergeForm(request.POST, request.FILES)
        if form.is_valid():
            field_names = ('file1', 'file2', 'file3', 'file4', 'file5')
            uploads = [form.cleaned_data[name] for name in field_names]

            # Legacy (pre-3.0) PyPDF2 API, matching the rest of this module.
            merger = PyPDF2.PdfFileMerger()
            # Merge in memory: a fixed file name on disk would be shared, and
            # overwritten, by concurrent requests.
            merged = io.BytesIO()
            try:
                for upload in uploads:
                    if upload:
                        merger.append(PyPDF2.PdfFileReader(upload))
                merger.write(merged)
            finally:
                # Release the merger's internal copies of the inputs.
                merger.close()
            merged.seek(0)
            # content_type must be passed to the response; assigning
            # response['content_type'] only adds an unrelated header.
            response = FileResponse(merged, content_type="application/pdf")
            response['Content-Disposition'] = 'attachment; filename="merged_file.pdf"'
            return response
        # Invalid POST: fall through with the bound form so errors are shown
        # (replacing it with a blank form would silently discard them).
    else:
        form = PdfMergeForm()
    return render(request, 'pdf/pdf_merge.html', {'form': form})
def pdf_replace(request):
    """Replace one page of an uploaded PDF with the pages of another PDF.

    A non-POST request renders an empty ``PdfReplaceForm``. A valid POST takes
    the document in ``file2``, removes the 1-based page ``page`` and inserts
    every page of ``file1`` in its place, returning the result as a download
    named ``replaced_file.pdf``.

    A page number outside the document, or any other form error, re-renders
    the bound form with the error attached.
    """
    if request.method == 'POST':
        form = PdfReplaceForm(request.POST, request.FILES)
        if form.is_valid():
            # Legacy (pre-3.0) PyPDF2 API, matching the rest of this module.
            replacement = PyPDF2.PdfFileReader(form.cleaned_data['file1'])
            document = PyPDF2.PdfFileReader(form.cleaned_data['file2'])
            page = form.cleaned_data['page']
            total_pages = document.getNumPages()

            if 1 <= page <= total_pages:
                # Assemble the result with a single writer rather than going
                # through part_1/part_2 files on disk: nothing is left open,
                # nothing is shared between requests, and replacing the first
                # or last page needs no special case.
                writer = PyPDF2.PdfFileWriter()
                # Pages before the replaced one (0-based indexes 0..page-2).
                for page_index in range(page - 1):
                    writer.addPage(document.getPage(page_index))
                # Every page of the replacement document.
                for page_index in range(replacement.getNumPages()):
                    writer.addPage(replacement.getPage(page_index))
                # Pages after the replaced one (0-based indexes page..end).
                for page_index in range(page, total_pages):
                    writer.addPage(document.getPage(page_index))

                output = io.BytesIO()
                writer.write(output)
                output.seek(0)
                # content_type must be passed to the response; assigning
                # response['content_type'] only adds an unrelated header.
                response = FileResponse(output, content_type="application/pdf")
                response['Content-Disposition'] = 'attachment; filename="replaced_file.pdf"'
                return response
            form.add_error(
                'page',
                "Page number must be between 1 and {}.".format(total_pages),
            )
        # Invalid POST: fall through with the bound form so errors are shown
        # (replacing it with a blank form would silently discard them).
    else:
        form = PdfReplaceForm()
    return render(request, 'pdf/pdf_replace.html', {'form': form})
模板页面
#base.html
<!DOCTYPE html>
{% load static %}
{# Base layout: child templates fill the title, content and footer blocks. #}
<html lang="en">
<head>
  <title>{% block title %}Django PDF提取, 合并与替换{% endblock %} </title>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  {# Bootstrap CSS version kept in step with the Bootstrap JS loaded below. #}
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css">
</head>
<body>
  <!-- Page content -->
  <main>
    <div class="container">
      {% block content %}
      {% if error_message %}<p><strong>{{ error_message}}</strong></p>{% endif %}
      {% endblock %}
    </div>
  </main>
  <footer class="footer">
    {% block footer %}{% endblock %}
  </footer>
  <!--End of Footer-->
  <!-- Bootstrap core JavaScript
  ================================================== -->
  <script src="https://code.jquery.com/jquery-3.3.1.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
</body>
</html>
#pdf_extract.html
{% extends "pdf/base.html" %}
{% block content %}
{# Navigation between the four PDF tools. #}
<p>
  <a href="{% url 'pdf:pdf_single_page_extract' %}">PDF Single Page Extract(单页提取)</a> |
  <a href="{% url 'pdf:pdf_range_extract' %}">PDF Page Range Extract(页面范围提取)</a> |
  <a href="{% url 'pdf:pdf_merge' %}">PDF Merge(合并)</a> |
  <a href="{% url 'pdf:pdf_replace' %}">PDF Replace(替换)</a>
</p>
<h3>PDF Single Page Extract (单页提取)</h3>
<p>输入页面号码,如1或者3,5,9。 Enter page numbers such as 1, or 1, 3, 5.</p>
{{ form.non_field_errors }}
{# multipart/form-data is required for the file upload to reach request.FILES. #}
<form method="post" enctype="multipart/form-data" action="">{% csrf_token %}
  {{ form.as_p }}
  <button type="submit" class="btn btn-primary">Extract</button>
</form>
{% endblock %}
#pdf_range_extract.html
{% extends "pdf/base.html" %}
{% block content %}
{# Navigation between the four PDF tools. #}
<p>
  <a href="{% url 'pdf:pdf_single_page_extract' %}">PDF Single Page Extract(单页提取)</a> |
  <a href="{% url 'pdf:pdf_range_extract' %}">PDF Page Range Extract(页面范围提取)</a> |
  <a href="{% url 'pdf:pdf_merge' %}">PDF Merge(合并)</a> |
  <a href="{% url 'pdf:pdf_replace' %}">PDF Replace(替换)</a>
</p>
<h3>PDF Page Range Extract (页面范围提取)</h3>
<p>请输入页面范围, 如2-8, 3-10。Please enter page range such as 2-8, 3-10.</p>
{{ form.non_field_errors }}
{# multipart/form-data is required for the file upload to reach request.FILES. #}
<form method="post" enctype="multipart/form-data" action="">{% csrf_token %}
  {{ form.as_p }}
  <button type="submit" class="btn btn-primary">Extract</button>
</form>
{% endblock %}
#pdf_merge.html
{% extends "pdf/base.html" %}
{% block content %}
{# Navigation between the four PDF tools. #}
<p>
  <a href="{% url 'pdf:pdf_single_page_extract' %}">PDF Single Page Extract(单页提取)</a> |
  <a href="{% url 'pdf:pdf_range_extract' %}">PDF Page Range Extract(页面范围提取)</a> |
  <a href="{% url 'pdf:pdf_merge' %}">PDF Merge(合并)</a> |
  <a href="{% url 'pdf:pdf_replace' %}">PDF Replace(替换)</a>
</p>
<h3>PDF Merge (合并) </h3>
<p>请按顺序上传PDF文件。Please upload pdf documents by order.</p>
{# multipart/form-data is required for the file uploads to reach request.FILES. #}
<form method="post" enctype="multipart/form-data" action="">{% csrf_token %}
  {{ form.as_p }}
  <button type="submit" class="btn btn-primary">Merge</button>
</form>
{% endblock %}
#pdf_replace.html
{% extends "pdf/base.html" %}
{% block content %}
{# Navigation between the four PDF tools. #}
<p>
  <a href="{% url 'pdf:pdf_single_page_extract' %}">PDF Single Page Extract(单页提取)</a> |
  <a href="{% url 'pdf:pdf_range_extract' %}">PDF Page Range Extract(页面范围提取)</a> |
  <a href="{% url 'pdf:pdf_merge' %}">PDF Merge(合并)</a> |
  <a href="{% url 'pdf:pdf_replace' %}">PDF Replace(替换)</a>
</p>
<h3>PDF Replace(替换)</h3>
<p>一次仅能替换一个PDF页面。You can only replace 1 single page at a time.</p>
{# multipart/form-data is required for the file uploads to reach request.FILES. #}
<form method="post" enctype="multipart/form-data" action="">{% csrf_token %}
  {{ form.as_p }}
  <button type="submit" class="btn btn-primary">Replace</button>
</form>
{% endblock %}