背景
已知每个学生的扣分及扣分原因,
需要在每个学生的 PDF 报告的特定位置添加批注。
实现功能
- 查找pdf上特定文字所在页面
- 在该页面添加批注
代码
部分代码借鉴自他人的实现,引用来源待补充。
# -*- coding: utf-8 -*-
import pandas as pd
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter,A4 #纸大小
from reportlab.pdfbase import pdfmetrics #可以写中文
from reportlab.pdfbase.ttfonts import TTFont
import pandas as pd
import pdfplumber
import os
import re
#%% read files
# Root folder of the semester; each class is expected to have its own
# sub-folder here.
path = r'D:\AAAAA Files\Season2021-2022\下学期'
# Keep only sub-directories: the score sheet (.xlsx) loaded later in this
# script lives in this same folder, so a raw os.listdir() may return it as
# the "first class".  os.listdir() order is also arbitrary, so sort to make
# the pick deterministic.
classes_ = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry))
)
# Only the first class folder is processed by this script.
class_ = classes_[0]
class_dir = os.path.join(path, class_)

# One row per entry (file or folder) found in the class folder.
s_files = pd.DataFrame(os.listdir(class_dir), columns=['origin_name'])
# The first 7 characters of the entry name are used as the student id.
s_files['stdid'] = s_files['origin_name'].str.slice(0, 7)
s_files['isfile'] = s_files['origin_name'].apply(
    lambda s: os.path.isfile(os.path.join(class_dir, s)))
s_files['ispdf'] = s_files['origin_name'].apply(lambda s: s[-4:] == '.pdf')
# Keep folders (searched for a PDF later) and stand-alone PDF files; drop
# every other kind of file.
s_files = s_files[s_files['isfile'].eq(False) | s_files['ispdf'].eq(True)]
#%%
# Load the grading sheet, keep the three columns this script uses, drop rows
# with any missing value among them, give the columns readable names
# (学号 = student id, 分数 = score, 评价 = remark) and add an 'isfinished'
# flag initialised to False.
name_table = (
    pd.read_excel(r'D:\AAAAA Files\Season2021-2022\下学期\学生名单-期末作业打分表.xlsx')
    [['学生期末作业成绩', 'Unnamed: 4', 'Unnamed: 8']]
    .dropna()
    .rename(columns={
        '学生期末作业成绩': '学号',
        'Unnamed: 4': '分数',
        'Unnamed: 8': '评价',
    })
    .assign(isfinished=False)
)
#%%
# Register the SimSun TrueType font under the name "SimSun" so that the
# annotation text (which contains Chinese characters) can be drawn with it.
pdfmetrics.registerFont(
    TTFont(
        name="SimSun",
        filename=r"D:\AAAAA Files\Season2021-2022\注\Simsun.ttf",
    )
)
#%%
def markpdf(filepath, isfile, stdid):
    """Write a student's remark and deduction onto their PDF report.

    The report is searched page by page for one of the marker phrases; the
    first matching page gets the annotation stamped near its bottom-left
    corner, and the result is saved next to the original as
    ``<original name>-批改完成.pdf``.  The original file is left untouched.

    Relies on the module-level ``path``, ``class_`` and ``name_table`` and on
    a font registered under the name "SimSun".

    Args:
        filepath: Name of the entry inside the class folder; either a PDF
            file or a folder that contains the report.
        isfile: True when ``filepath`` is a file, False when it is a folder.
        stdid: Student id used to look up the row in ``name_table``.

    Returns:
        ``stdid`` on success, ``-1`` when no suitable PDF is found, no page
        contains a marker phrase, or the student is not in ``name_table``.
    """
    # --- locate the report PDF -------------------------------------------
    pdf_path = None
    if not isfile:
        # Folder submission: take the first PDF whose name does not contain
        # '代码' (such files are skipped, as in the single-file case below).
        folder = os.path.join(path, class_, filepath)
        for file_ in os.listdir(folder):
            if file_[-4:] == '.pdf' and '代码' not in file_:
                pdf_path = os.path.join(folder, file_)
                break
        if pdf_path is None:
            return -1
    elif filepath[-4:] == '.pdf' and '代码' not in filepath:
        pdf_path = os.path.join(path, class_, filepath)
    else:
        print(filepath+'未完成')
        return -1

    # --- find the page that contains one of the marker phrases -----------
    pagei = -1
    # The context manager closes the PDF on every path, including the early
    # return below when nothing matches.
    with pdfplumber.open(pdf_path) as pdf:
        pagenum = len(pdf.pages)
        for i, pdf_page in enumerate(pdf.pages):
            # extract_text() may yield None for a page without text.
            pagetext = pdf_page.extract_text() or ''
            if re.search('.*?(文字一|文字二|文字三).*?', pagetext) is not None:
                pagei = i
                break
    if pagei == -1:
        print(filepath+'未完成')
        return -1

    # --- look up the student's score and remark --------------------------
    temp = name_table[name_table['学号'] == stdid]
    if temp.shape[0] == 0:
        print(filepath+'未完成')
        return -1
    remark = temp['评价'].values[0]
    # Deduction is 25 minus the recorded score (25 is presumably the full
    # mark for this task -- TODO confirm).
    deduction = 25 - temp['分数'].values[0]

    # --- build a one-page overlay PDF holding the annotation -------------
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=A4)
    can.setFont("SimSun", 10)
    # ReportLab colour components are in the 0..1 range, not 0..255.
    can.setFillColorRGB(1, 0, 0)
    # drawString() draws a single line and does not break on "\n", so the
    # two lines of the annotation are drawn separately, 12pt apart.
    can.drawString(80, 50, "任务二:%s " % remark)
    can.drawString(80, 38, "扣分情况:-%s" % deduction)
    can.save()
    # Rewind the buffer so it can be read back as a PDF.
    packet.seek(0)
    new_pdf = PdfFileReader(packet)

    # --- merge the overlay into the matching page and write the copy -----
    # The source file must stay open until the output has been written,
    # because the reader pulls page data from it on demand.
    with open(pdf_path, "rb") as source_stream:
        existing_pdf = PdfFileReader(source_stream)
        output = PdfFileWriter()
        for i in range(pagenum):
            page = existing_pdf.getPage(i)
            if i == pagei:
                page.mergePage(new_pdf.getPage(0))
            output.addPage(page)
        with open('%s-批改完成.pdf' % pdf_path[:-4], "wb") as outputStream:
            output.write(outputStream)
    packet.close()
    return stdid
#%%
# Annotate every candidate entry and collect the ids that were processed
# successfully (markpdf returns -1 on failure).
readed = []
for _, f in s_files.iterrows():
    try:
        student_id = int(f['stdid'])
    except ValueError:
        # Entry name does not start with a numeric student id; skip it
        # instead of aborting the whole run.
        print(f['origin_name'] + '跳过(文件名不以学号开头)')
        continue
    stdid = markpdf(f['origin_name'], f['isfile'], student_id)
    if stdid > 0:
        readed.append(stdid)

# Build the "done" table.  Use the same dtype as name_table's key so the
# merge also works when nothing was processed, and drop duplicate ids (one
# student may have several entries) so the left merge cannot duplicate
# rows of name_table.
readed = pd.DataFrame({
    '学号': pd.Series(readed, dtype=name_table['学号'].dtype).drop_duplicates()
})
readed['完成'] = True
# Students without an annotated report end up with NaN in '完成'.
name_table = pd.merge(name_table, readed, on='学号', how='left')