首先创建发票实体类文件 models/invoice.py(后面的识别脚本通过 `from models.invoice import ...` 导入它)。
代码如下:
class InvoicePurchaser:
    """Purchaser (buyer) details printed on an invoice.

    Every field has an empty-string class-level default, so reading a field
    that was never assigned yields "" instead of raising AttributeError.
    The defaults live on the class, not the instance, so ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """

    # Name
    name: str = ""
    # Taxpayer identification number
    identification_number: str = ""
    # Address and telephone
    address_telephone: str = ""
    # Bank of deposit and account number
    bank_account: str = ""
class InvoiceSeller:
    """Seller details printed on an invoice.

    Every field has an empty-string class-level default, so reading a field
    that was never assigned yields "" instead of raising AttributeError.
    The defaults live on the class, not the instance, so ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """

    # Name
    name: str = ""
    # Taxpayer identification number
    identification_number: str = ""
    # Address and telephone
    address_telephone: str = ""
    # Bank of deposit and account number
    bank_account: str = ""
class Invoice:
    """Invoice details.

    Text fields default to "" and the two party fields default to None, so
    reading a field that was never assigned no longer raises AttributeError.
    The defaults live on the class, not the instance, so ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """

    # Machine code
    machine_code: str = ""
    # Invoice code
    invoice_code: str = ""
    # Invoice number
    invoice_number: str = ""
    # Date of issue
    invoice_date: str = ""
    # Check code
    check_code: str = ""
    # Amount
    amount: str = ""
    # Tax amount
    tax: str = ""
    # Tax rate
    tax_rate: str = ""
    # Total of amount and tax
    total_amount: str = ""
    # Payee
    payee: str = ""
    # Reviewer
    review: str = ""
    # Issuer (drawer) of the invoice
    drawer: str = ""
    # Purchaser details; None until one is attached.
    # The annotation is quoted so it is not evaluated when the class is created.
    invoicePurchaser: "InvoicePurchaser | None" = None
    # Seller details; None until one is attached.
    invoiceSeller: "InvoiceSeller | None" = None
然后通过 CnOcr 识别发票图片,并把识别出的文本解析为上面定义的发票实体,代码如下:
from cnocr import CnOcr
from models.invoice import Invoice, InvoicePurchaser, InvoiceSeller
import json
# Module-level OCR engine used by ocr_invoice(); it is constructed once at
# import time with no arguments, i.e. every CnOcr parameter keeps its default.
ocr = CnOcr()
# (label, separator, Invoice attribute) for labels handled once per text line.
# Order matters: the first label found in a line wins, as in an if/elif chain.
_INVOICE_FIELDS = (
    ('机器编号:', ':', 'machine_code'),
    ('发票代码:', ':', 'invoice_code'),
    ('发票号码:', ':', 'invoice_number'),
    ('开票日期:', ':', 'invoice_date'),
    ('校验码:', ':', 'check_code'),
    ('收款人:', ':', 'payee'),
    ('复核:', ':', 'review'),
    ('开票人:', ':', 'drawer'),
    ('(小写)¥', '¥', 'total_amount'),
)

# (label, attribute) for labels shared by the purchaser and the seller.
# These are only tried when none of the labels above matched the line.
_PARTY_FIELDS = (
    ('称:', 'name'),
    ('纳税人识别号:', 'identification_number'),
    ('地址、电话:', 'address_telephone'),
    ('开户行及账号:', 'bank_account'),
)


def ocr_invoice(img_path) -> Invoice:
    """Run OCR on an invoice image and parse the text into an Invoice.

    Each OCR result's ``'text'`` is matched against the known labels. A value
    is stored only when splitting the text on the separator yields exactly
    two parts; otherwise the line is ignored.

    For labels shared by both parties (name, taxpayer identification number,
    address/telephone, bank account), the first occurrence is stored on the
    purchaser and the second on the seller; later occurrences are ignored.
    This assumes the OCR engine returns the purchaser's lines before the
    seller's -- TODO confirm against the OCR output order.

    Args:
        img_path: Image passed unchanged to ``ocr.ocr``.

    Returns:
        An Invoice with ``invoicePurchaser`` and ``invoiceSeller`` attached.
        Only fields whose label was recognised are assigned; ``amount``,
        ``tax`` and ``tax_rate`` are never assigned by this function.
    """
    invoice = Invoice()
    purchaser = InvoicePurchaser()
    seller = InvoiceSeller()
    parties = (purchaser, seller)

    # How many times each shared label has been seen so far.
    seen = {attr: 0 for _, attr in _PARTY_FIELDS}

    for item in ocr.ocr(img_path):
        text = item['text']

        for label, separator, attr in _INVOICE_FIELDS:
            if label in text:
                parts = text.split(separator)
                if len(parts) == 2:
                    setattr(invoice, attr, parts[1])
                break
        else:
            for label, attr in _PARTY_FIELDS:
                if label in text:
                    # Count the occurrence even if the value turns out to be
                    # unusable, so the purchaser/seller pairing stays aligned.
                    seen[attr] += 1
                    parts = text.split(':')
                    if len(parts) == 2 and seen[attr] <= len(parties):
                        setattr(parties[seen[attr] - 1], attr, parts[1])
                    break

    invoice.invoicePurchaser = purchaser
    invoice.invoiceSeller = seller
    return invoice
if __name__ == "__main__":
    # Script entry point: parse the sample invoice image and print it as JSON.
    parsed = ocr_invoice('./images/fapiao.jpg')
    # Objects json cannot serialise natively are dumped via their __dict__;
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    print(json.dumps(parsed, default=lambda obj: obj.__dict__, ensure_ascii=False))