#小结#
发票样本
提取
金额: 125.03,位于“(小写)¥”文本之后,正则表达式 amount_pattern = r'\(小写\)¥(\d+\.\d{2})'
月份: 202401,位于“账单月:”文本之后,正则表达式 bill_month_pattern = r'账单月:(\d{6})'
完整示例:
import os
import re
from pathlib import PureWindowsPath

import pdfplumber
# Compiled once at import time instead of on every loop iteration.
_AMOUNT_RE = re.compile(r'\(小写\)¥(\d+\.\d{2})')
_BILL_MONTH_RE = re.compile(r'账单月:(\d{6})')


def _first_group(pattern, text, field, filename):
    """Return group 1 of the first match of *pattern* in *text*, else raise ValueError."""
    match = pattern.search(text)
    if match is None:
        raise ValueError(
            "{} not found on the first page of {!r}".format(field, filename)
        )
    return match.group(1)


def _user_from_filename(filename):
    """Return the base file name without a trailing '.pdf' (any letter case)."""
    # PureWindowsPath splits on both '\\' and '/', and works on any platform.
    base = PureWindowsPath(filename).name
    if base.lower().endswith('.pdf'):
        base = base[:-len('.pdf')]
    return base


def readinvoice(pdf_dir):
    """Extract amount, bill month and user name from each invoice PDF.

    The file names come from ``get_pdf(pdf_dir)``, which is defined outside
    this block. Only the first page of each PDF is read.

    Args:
        pdf_dir: Directory handed to ``get_pdf`` to list the PDF files.

    Returns:
        A list with one dict per PDF, in the order ``get_pdf`` returned them.
        Each dict has the keys ``"amount"`` (digits captured after
        "(小写)¥"), ``"billm"`` (six digits captured after "账单月:") and
        ``"user"`` (the file's base name without the ``.pdf`` suffix).

    Raises:
        ValueError: If the amount or the bill month cannot be found in the
            first page's text. The message names the field and the file.
    """
    pdfs = []
    # Loop so that several PDFs can be read in one batch.
    for filename in get_pdf(pdf_dir):
        with pdfplumber.open(filename) as pdf:
            # Fall back to "" so a page that yields no text is reported as a
            # missing field below rather than failing inside the regex call.
            pdf_text = pdf.pages[0].extract_text() or ""
        pdfs.append({
            "amount": _first_group(_AMOUNT_RE, pdf_text, "amount", filename),
            "billm": _first_group(
                _BILL_MONTH_RE, pdf_text, "bill month", filename
            ),
            "user": _user_from_filename(filename),
        })
    return pdfs
# Parse every invoice PDF found under the given folder and print each
# extracted record on its own line.
pdfs = list(readinvoice(r"C:\phonepdf"))
for pdfi in pdfs:
    print(pdfi)
从 Outlook 收件箱中获取指定发件人的未读邮件,并将其中的 PDF 发票附件保存到指定路径
def runoutlook(sender="10086@139.com", save_dir=r"C:\phonepdf"):
    """Save PDF attachments from unread inbox mail sent by *sender*.

    Walks the default Outlook inbox through the MAPI namespace. For every
    unread mail item whose sender address equals *sender*, each attachment
    whose file name ends in ".pdf" (any letter case) is saved into
    *save_dir*, and the message is then marked as read.

    Args:
        sender: Sender e-mail address to match exactly.
        save_dir: Directory that receives the attachments; it is created if
            it does not exist yet.
    """
    ol_folder_inbox = 6  # OlDefaultFolders.olFolderInbox
    ol_mail = 43         # OlObjectClass.olMail

    os.makedirs(save_dir, exist_ok=True)

    # Create the Outlook application object and open its MAPI namespace.
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    inbox = outlook.GetDefaultFolder(ol_folder_inbox)

    for message in inbox.Items:
        # The inbox can also hold non-mail items (e.g. meeting requests);
        # skip them before reading mail-only properties such as the sender.
        if getattr(message, "Class", None) != ol_mail:
            continue
        # Only unread mail from the requested sender is processed.
        if message.SenderEmailAddress != sender or not message.UnRead:
            continue

        for attachment in message.Attachments:
            file_name = attachment.FileName
            # Compare case-insensitively so that "X.PDF" is saved as well.
            if file_name.lower().endswith('.pdf'):
                # os.path.join yields exactly one separator between the
                # directory and the file name.
                attachment.SaveAsFile(os.path.join(save_dir, file_name))

        # Mark the message as read so it is not picked up again on the next run.
        message.UnRead = False

    print("附件保存完成")