python: 用百度API读取增值税发票信息

# encoding: utf-8
# 版权所有 2023 涂聚文有限公司
# 许可信息查看:
# 描述:
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/9/30 6:56
# User      : geovindu
# Product   : PyCharm
# Project   : pythonTkinterDemo
# File      : BaiduOCRAPI.py
# explain   : 学习
 
import os
import base64
import requests
import pandas as pd
import json
 
 
 
class BaiduOCR(object):
    """
    Read VAT invoice information (PDF or image files) through the Baidu OCR API.

    Typical use: obtain a token with ``getAccessToken``, send a file with
    ``getContent`` (PDF) or ``getContentPng`` (image), then flatten the JSON
    answer with ``getUsefulInfo``.
    """

    # Placeholder credentials. Replace them (or pass your own to __init__)
    # with the values of your Baidu AI application; do not commit real keys.
    AppID = "40226401"
    APIKey = "geovindu"
    SecretKey = "geovindu"

    # Endpoints used by this class.
    TokenUrl = "https://aip.baidubce.com/oauth/2.0/token"
    VatInvoiceUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the caller forever.
    RequestTimeout = 30

    def __init__(self, AppID=None, APIKey=None, SecretKey=None):
        """
        Create a client.

        :param AppID: application id; defaults to the class-level ``AppID``
        :param APIKey: API key; defaults to the class-level ``APIKey``
        :param SecretKey: secret key; defaults to the class-level ``SecretKey``
        """
        cls = type(self)
        self.AppID = cls.AppID if AppID is None else AppID
        self.APIKey = cls.APIKey if APIKey is None else APIKey
        self.SecretKey = cls.SecretKey if SecretKey is None else SecretKey

    def getAccessToken(self):
        """
        Request an access token using ``self.APIKey`` and ``self.SecretKey``.

        :return: the access token as a string
        :raises RuntimeError: when the answer contains no ``access_token``
            (previously the literal string "None" was returned, which only
            failed later, inside the OCR call)
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        payload = requests.post(
            self.TokenUrl, params=params, timeout=self.RequestTimeout
        ).json()
        token = payload.get("access_token")
        if not token:
            # NOTE(review): the error field names are assumed from the usual
            # OAuth error layout; the whole payload is used as a fallback.
            reason = payload.get("error_description") or payload.get("error") or payload
            raise RuntimeError("Baidu access token request failed: %s" % (reason,))
        return str(token)

    @staticmethod
    def _encodeFile(path):
        """Return the base64-encoded bytes of the file at ``path``."""
        # The context manager closes the handle even when reading fails.
        with open(path, 'rb') as handle:
            return base64.b64encode(handle.read())

    def _recognize(self, accessToken, fieldName, path):
        """POST the file at ``path`` as form field ``fieldName``; return the decoded JSON."""
        response = requests.post(
            self.VatInvoiceUrl,
            params={"access_token": accessToken},
            data={fieldName: self._encodeFile(path)},
            headers={'content-type': 'application/x-www-form-urlencoded'},
            timeout=self.RequestTimeout,
        )
        return response.json()

    def getContent(self, accessToken, pdfFile):
        """
        Run VAT invoice recognition on a PDF file.

        :param accessToken: token returned by ``getAccessToken``
        :param pdfFile: path of the PDF file to upload
        :return: the decoded JSON answer of the service
        """
        return self._recognize(accessToken, "pdf_file", pdfFile)

    def getContentPng(self, accessToken, pngFile):
        """
        Run VAT invoice recognition on an image file.

        :param accessToken: token returned by ``getAccessToken``
        :param pngFile: path of the image file to upload
        :return: the decoded JSON answer of the service
        """
        return self._recognize(accessToken, "image", pngFile)

    @staticmethod
    def _firstWord(items):
        """Return the ``word`` of the first entry of ``items``, or '' when there is none."""
        if not items:
            return ''
        return items[0].get('word', '')

    def getUsefulInfo(self, content, pdf_name):
        """
        Flatten a recognition answer into a single-level dict (one table row).

        Fields missing from the answer, and empty commodity lists, produce
        empty strings so that one incomplete invoice does not abort a batch.

        :param content: decoded JSON answer from ``getContent`` / ``getContentPng``
        :param pdf_name: file name stored under the key '发票文件名'
        :return: dict mapping the Chinese column titles to the recognised values
        :raises KeyError: when ``content`` has no 'words_result' entry
        """
        try:
            words_result = content['words_result']
        except KeyError:
            raise KeyError(
                "'words_result' missing from OCR response: %s"
                % (content.get('error_msg', content),)
            ) from None

        def field(key):
            return words_result.get(key, '')

        info = {'发票文件名': pdf_name,
                '发票号码': str(field('InvoiceNum')),
                '开票日期': field('InvoiceDate'),
                # Commodity fields are lists of rows; only the first row is kept.
                '货物名称': self._firstWord(words_result.get('CommodityName')),
                '未税金额': self._firstWord(words_result.get('CommodityAmount')),
                '货物税率': self._firstWord(words_result.get('CommodityTaxRate')),
                '货物税额': self._firstWord(words_result.get('CommodityTax')),
                '合计金额': field('TotalAmount'),
                '合计税额': field('TotalTax'),
                # 'AmountInFiguers' (sic) is the key the original code reads.
                '价税合计(小写)': field('AmountInFiguers'),
                '价税合计(大写)': field('AmountInWords'),
                '销售方名称': field('SellerName'),
                '销售方纳税人识别号': field('SellerRegisterNum'),
                '销售方银行及账户': field('SellerBank'),
                '销售方地址及电话': field('SellerAddress')}
        return info

调用:用京东多张发票测试成功

    # Batch-recognise every PDF in the invoice folder and export one row per
    # invoice to an Excel workbook.
    ocr = Common.BaiduOCRAPI.BaiduOCR()
    invoiceDir = "invoice/"

    # Compare real extensions case-insensitively: a bare split(".") would also
    # accept a file literally named "pdf" and would miss "X.PDF". Sorting makes
    # the row order reproducible (os.listdir order is arbitrary).
    pdfFilelist = sorted(
        name for name in os.listdir(invoiceDir)
        if os.path.splitext(name)[1].lower() == ".pdf"
    )

    # One token serves the whole batch; request it only when there is work.
    access_token = ocr.getAccessToken() if pdfFilelist else None

    infolist = []
    for pdfFile in pdfFilelist:
        # splitext gives the name as a string; split(".")[:-1] produced a list.
        pdfName = os.path.splitext(pdfFile)[0]
        print(pdfFile)
        content = ocr.getContent(access_token, os.path.join(invoiceDir, pdfFile))
        info = ocr.getUsefulInfo(content, pdfName)
        infolist.append(info)

    df = pd.DataFrame(infolist)
    print(df)
    # Pass mode='a' to ExcelWriter to append to an existing workbook instead.
    with pd.ExcelWriter('geovindu.xlsx') as writer:
        df.to_excel(writer, sheet_name='geovindu', index=False)

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
好的,这里是一个Python示例代码,使用百度AI增值税发票识别API,批量识别增值税发票并导出Excel文件。

首先,需要安装百度AI的Python SDK,可以通过以下命令进行安装:

```
pip install baidu-aip
```

接着,在百度AI控制台中创建一个增值税发票识别应用,并获取应用的API Key和Secret Key。

然后,可以使用以下Python代码实现批量识别增值税发票并导出Excel文件:

```python
from aip import AipOcr
import os
import xlwt

# 设置API信息
APP_ID = 'your_app_id'
API_KEY = 'your_api_key'
SECRET_KEY = 'your_secret_key'

# 初始化AipOcr对象
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

# 设置Excel文件名和表格头部
excel_name = 'invoices.xls'
header = ['发票代码', '发票号码', '开票日期', '购方名称', '购方税号', '金额', '税额', '价税合计']

# 创建Excel文件
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('invoices')
for col, value in enumerate(header):
    worksheet.write(0, col, value)

# 设置文件夹路径和文件扩展名
folder_path = 'invoices'
file_ext = '.jpg'

# 遍历文件夹中所有图片文件,并识别增值税发票信息
for i, filename in enumerate(os.listdir(folder_path)):
    if filename.endswith(file_ext):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'rb') as f:
            image = f.read()
        result = client.vatInvoice(image)
        if 'words_result' in result:
            data = result['words_result']
            row = worksheet.row(i + 1)
            for j, key in enumerate(header):
                row.write(j, data[key])
        else:
            print('Error:', result['error_msg'])

# 保存Excel文件
workbook.save(excel_name)
print('Excel file saved:', excel_name)
```

在上述代码中,需要将`your_app_id`、`your_api_key`和`your_secret_key`替换成自己的API信息,`invoices`替换为存放增值税发票图片的文件夹路径,`.jpg`替换为增值税发票图片的文件扩展名。

执行完上述代码后,将会在当前目录下生成一个名为`invoices.xls`的Excel文件,包含了所有增值税发票识别结果。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值