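一、安装所需的库:pdfplumber、pandas、openpyxl 需要用 pip 安装(例如 pip install pdfplumber pandas openpyxl),其余为 Python 标准库。用到的导入如下: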
import os
import re
import logging
import tkinter as tk
from tkinter import filedialog
from datetime import datetime
import pdfplumber
import pandas as pd
from logging.handlers import RotatingFileHandler
二、可以兼容多种发票类型:有的发票只有一个较长的发票号码,有的同时带有发票号码、发票代码和校验码,目前都可以抓取(校验码本身的提取在代码中已被注释掉,暂不输出)。
三、代码如下:
import os
import re
import logging
import tkinter as tk
from tkinter import filedialog
from datetime import datetime
import pdfplumber
import pandas as pd
from logging.handlers import RotatingFileHandler
# Extract invoice information from PDF files
class ExtraInfo():
    """Extract key fields from invoice PDFs and export them to an Excel sheet.

    The first page of every PDF in a folder is converted to text with
    pdfplumber, a set of regular expressions pulls out the invoice fields,
    and the resulting rows are written with pandas/openpyxl.
    """

    # Header row of the exported sheet; every data row has exactly this many items.
    COLUMNS = ["文件路径", "文件修改时间", "购方名称", "销方名称", "购方识别号", "销方识别号",
               "开票项目名称", "开票日期", "发票号码", "发票代码", "价税合计"]

    _LOG_FILE = "run.log"
    _LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s"

    # Patterns are compiled once instead of on every file.
    # NOTE(review): the separator classes additionally accept the full-width
    # colon (U+FF1A) and full-width yen sign (U+FFE5); this is a strict superset
    # of what was matched before -- confirm against real invoice text.
    _FLAGS = re.I | re.S
    _NAME_RE = re.compile(r"名\s{0,1}称\s{0,1}[:,\uff1a]\s{0,1}(.*?)\s", _FLAGS)
    _TAX_ID_RE = re.compile(r"纳税人识别号[:, \uff1a]\s{0,1}(.*?)\s", _FLAGS)
    _ITEM_RE = re.compile(r"税\s{0,1}额\s{0,1}(.*?)\s", _FLAGS)
    _DATE_RE = re.compile(r"开票日期\s{0,1}[:,\uff1a]\s{0,1}(.*?)日", _FLAGS)
    _NUMBER_RE = re.compile(r"发票号码\s{0,1}[:,\uff1a]\s{0,1}(.*?)\s", _FLAGS)
    _CODE_RE = re.compile(r"发票代码\s{0,1}[:,\uff1a]\s{0,1}(.*?)\s", _FLAGS)
    # Tried in order; the first pattern that yields a match wins.
    _TOTAL_RES = (
        re.compile(r".小写.[\s]{0,1}[\xa5,\uffe5]\s{0,1}(.*?)\s", _FLAGS),
        re.compile(r"小写\(cid:61\)[\xa5,\uffe5]\s{0,1}(.*?)\s", _FLAGS),
        re.compile(r"大写.\s{0,1}(.*?)\s", _FLAGS),
    )

    def __init__(self) -> None:
        """Configure the root logger to write to a size-limited ``run.log``."""
        logger = logging.getLogger()
        logger.setLevel("INFO")
        log_path = os.path.abspath(self._LOG_FILE)
        # Creating several ExtraInfo objects must not attach the same file
        # handler repeatedly, otherwise every record is written multiple times.
        already_attached = any(
            isinstance(handler, RotatingFileHandler)
            and getattr(handler, "baseFilename", None) == log_path
            for handler in logger.handlers
        )
        if not already_attached:
            # 1 MiB cap; backupCount must be non-zero for maxBytes to take effect.
            file_handler = RotatingFileHandler(self._LOG_FILE, maxBytes=1024 * 1024,
                                               backupCount=1, encoding="utf-8")
            file_handler.setFormatter(logging.Formatter(self._LOG_FORMAT))
            logger.addHandler(file_handler)
        self.logger = logger

    @staticmethod
    def extra_invoice_info(file_path):
        """Return the text of the first page of ``file_path``.

        Best effort: any error while opening or reading the PDF is printed and
        an empty string is returned, so one broken file does not stop a batch.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                # Guard against a non-string result for pages without text.
                text = pdf.pages[0].extract_text() or ""
        except Exception as e:
            print(f"出现异常:{file_path}-{e}")
            return ""
        print("Extracted Text:", text)
        return text

    @staticmethod
    def parse_invoice_text(text):
        """Parse invoice text into the nine extracted column values.

        Returns a list of strings in the order of ``COLUMNS[2:]``: buyer name,
        seller name, buyer tax id, seller tax id, item name, invoice date,
        invoice number, invoice code and total amount. Fields that are not
        found are empty strings. Spaces and apostrophes are removed from the
        extracted values.
        """
        text = text or ""

        def pick(matches, index=0):
            # The first match is taken as the buyer, the second as the seller.
            return matches[index] if len(matches) > index else ""

        names = ExtraInfo._NAME_RE.findall(text)
        tax_ids = ExtraInfo._TAX_ID_RE.findall(text)
        dates = ExtraInfo._DATE_RE.findall(text)

        totals = []
        for pattern in ExtraInfo._TOTAL_RES:
            totals = pattern.findall(text)
            if totals:
                break

        raw_fields = [
            pick(names), pick(names, 1),
            pick(tax_ids), pick(tax_ids, 1),
            pick(ExtraInfo._ITEM_RE.findall(text)),
            # The date pattern stops before the closing "日", so add it back.
            (dates[0] + "日") if dates else "",
            pick(ExtraInfo._NUMBER_RE.findall(text)),
            pick(ExtraInfo._CODE_RE.findall(text)),
            pick(totals),
        ]
        return [value.replace(" ", "").replace("'", "") for value in raw_fields]

    def analyze_invoice_info(self, pdf_dir):
        """Build one data row per PDF file found directly inside ``pdf_dir``.

        Each row has ``len(COLUMNS)`` items: the file path, its modification
        time as a string, then the values from ``parse_invoice_text``. Files
        that cannot be processed still produce a full-width row with empty
        fields. The path and the timestamp are stored unmodified.
        """
        ls_data = []
        empty_fields = [""] * (len(self.COLUMNS) - 2)
        # Sorted so that the row order is stable between runs.
        for entry in sorted(os.listdir(pdf_dir)):
            if not entry.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(pdf_dir, entry)
            # Pre-initialised so the error path never touches unbound names.
            text = ""
            modified = ""
            fields = list(empty_fields)
            try:
                text = self.extra_invoice_info(pdf_path)
                modified = str(datetime.fromtimestamp(int(os.path.getmtime(pdf_path))))
                fields = self.parse_invoice_text(text)
            except Exception as e:
                # Per-file boundary: record the problem and keep going.
                self.logger.warning("%s: %s", pdf_path, e)
                self.logger.warning([text])
                fields = list(empty_fields)
            else:
                self.logger.info([text])
            ret = [pdf_path, modified, *fields]
            self.logger.info(ret)
            ls_data.append(ret)
        print("ls_data:", ls_data)
        return ls_data

    def to_excel(self, save_path, pdf_dir):
        """Analyze every PDF in ``pdf_dir`` and write the rows to ``save_path``.

        Raises:
            ValueError: if a data row does not have ``len(COLUMNS)`` items.
        """
        ls_data = self.analyze_invoice_info(pdf_dir)
        # Check the number of items in every row
        num_columns = len(self.COLUMNS)
        if not all(len(row) == num_columns for row in ls_data):
            raise ValueError(f"Number of columns in data rows doesn't match the expected number ({num_columns}).")
        # Build the DataFrame
        df = pd.DataFrame(ls_data, columns=self.COLUMNS)
        # The context manager saves and closes the workbook via the public API.
        with pd.ExcelWriter(save_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Sheet1', startrow=0, startcol=0, index=False)
# Pop-up window used to choose the invoice folder
class PopUp():
    """Small Tk window that lets the user pick a folder and submit it."""

    def __init__(self) -> None:
        """Create the root window and the label showing the current choice."""
        self.folder_path = ""
        self.root = tk.Tk()
        self.folder_label = tk.Label(self.root, text="请选择文件夹")
        self.folder_label.pack(padx=5, pady=5)

    def browse_folder(self):
        """Open the directory chooser and remember the selected folder.

        A cancelled dialog yields an empty value; in that case the previous
        selection is kept so that ``folder_path`` always agrees with the label.
        """
        selected = filedialog.askdirectory()
        print(selected)
        if selected:
            self.folder_path = selected
            self.folder_label.config(text=self.folder_path)

    def pop_up(self):
        """Show the window, block until it is closed, and return the folder.

        Returns the selected folder path, or an empty string when the window
        was submitted or closed without a selection.
        """
        self.root.title("上传文件夹")
        self.root.geometry("300x150")
        browse_button = tk.Button(self.root, text="浏览", command=self.browse_folder)
        browse_button.pack(padx=5, pady=5)
        # "Submit" simply destroys the window, which ends mainloop().
        upload_button = tk.Button(self.root, text="提交", command=self.root.destroy)
        upload_button.pack(padx=5, pady=5)
        self.root.mainloop()
        return self.folder_path
if __name__ == '__main__':
    # Ask the user for the folder that holds the invoice PDFs.
    pop = PopUp()
    pdf_dir = pop.pop_up()
    if not pdf_dir:
        # Nothing was selected; stop cleanly instead of failing in os.listdir("").
        raise SystemExit("未选择文件夹")
    extra = ExtraInfo()
    # Relative path: the workbook is created in the current working directory.
    # NOTE(review): the file name spelling is kept as-is for compatibility.
    save_path = r"ouput.xlsx"
    extra.to_excel(save_path, pdf_dir)
四、运行结束后会生成 Excel 文件 ouput.xlsx,保存在运行脚本时的当前工作目录下(通常就是代码所在的目录)。
PS:把所有发票 PDF 都放在同一个文件夹里,里面最好不要有其他文件;运行程序后点击"浏览"选择这个文件夹,再点击"提交"就可以啦~ 撒花