申请百度接口识别发票数据并保存到Excel

import os
import base64
import requests
import pandas as pd


class BaiduOCR(object):
    """Call the Baidu VAT-invoice OCR endpoint and flatten its response.

    The flattened dict returned by :meth:`getUsefulInfo` is meant to be used
    as one row of an Excel sheet.

    Attributes are kept under their original names (``AppID``, ``APIKey``,
    ``SecretKey``) so existing code that assigns them after construction
    keeps working.
    """

    TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
    VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

    # Class-level fallback so the attribute exists even on instances that
    # were created without running __init__.
    timeout = 30

    def __init__(self, AppID="", APIKey="", SecretKey="", timeout=30):
        """Store the credentials and the HTTP timeout.

        :param AppID: application id (stored only; not sent by this class).
        :param APIKey: sent as ``client_id`` when requesting a token.
        :param SecretKey: sent as ``client_secret`` when requesting a token.
        :param timeout: seconds passed to every ``requests.post`` call so a
            stalled connection cannot hang the caller forever.
        """
        self.AppID = AppID
        self.APIKey = APIKey
        self.SecretKey = SecretKey
        self.timeout = timeout

    def getAccessToken(self):
        """Request an access token using ``APIKey`` / ``SecretKey``.

        :return: the ``access_token`` field of the JSON response, as ``str``.
        :raises RuntimeError: if the response carries no ``access_token``.
            Previously the missing value was stringified to ``"None"`` and
            silently used as a token in later requests.
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        response = requests.post(self.TOKEN_URL, params=params,
                                 timeout=self.timeout)
        payload = response.json()
        token = payload.get("access_token")
        if not token:
            raise RuntimeError(
                "token response did not contain an access_token: %r" % (payload,))
        return str(token)

    def _recognize(self, accessToken, filePath, fieldName):
        """POST one base64-encoded file to the VAT-invoice endpoint.

        :param accessToken: token sent as the ``access_token`` query parameter.
        :param filePath: path of the file to upload.
        :param fieldName: form field that carries the encoded file
            (``"pdf_file"`` or ``"image"``).
        :return: the decoded JSON response.
        """
        # The context manager closes the handle even if reading fails.
        with open(filePath, 'rb') as fh:
            encoded = base64.b64encode(fh.read())
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        response = requests.post(
            self.VAT_INVOICE_URL,
            params={"access_token": accessToken},
            data={fieldName: encoded},
            headers=headers,
            timeout=self.timeout,
        )
        return response.json()

    def getContent(self, accessToken, pdfFile):
        """Run OCR on a PDF invoice.

        :param accessToken: token obtained from :meth:`getAccessToken`.
        :param pdfFile: path of the PDF; uploaded in the ``pdf_file`` field.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, pdfFile, "pdf_file")

    def getContentPng(self, accessToken, pngFile):
        """Run OCR on an image invoice.

        :param accessToken: token obtained from :meth:`getAccessToken`.
        :param pngFile: path of the image; uploaded in the ``image`` field.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, pngFile, "image")

    @staticmethod
    def _first_word(entries):
        """Return ``entries[0]['word']``, or ``''`` when ``entries`` is empty."""
        return entries[0]['word'] if entries else ''

    def getUsefulInfo(self, content, fp_path):
        """Flatten an OCR response into a single spreadsheet row.

        Only the first entry of each per-commodity list is kept; an empty
        list yields ``''`` instead of raising ``IndexError``.

        :param content: decoded JSON response containing ``words_result``.
        :param fp_path: invoice file path, recorded in the row as-is.
        :return: dict mapping column title to value.
        :raises KeyError: if ``words_result`` or any expected field is absent.
        """
        try:
            words_result = content['words_result']
        except KeyError:
            # Same exception type as before, but with the payload attached so
            # the reason for the failed recognition is visible to the caller.
            raise KeyError(
                "'words_result' missing from OCR response: %r" % (content,)
            ) from None

        first = self._first_word
        info = {'发票文件路径': fp_path,
                '发票号码': str(words_result['InvoiceNum']),
                '开票日期': words_result['InvoiceDate'],
                '货物名称': first(words_result['CommodityName']),
                '未税金额': first(words_result['CommodityAmount']),
                '货物税率': first(words_result['CommodityTaxRate']),
                '货物税额': first(words_result['CommodityTax']),
                '合计金额': words_result['TotalAmount'],
                '合计税额': words_result['TotalTax'],
                '价税合计(小写)': words_result['AmountInFiguers'],
                '价税合计(大写)': words_result['AmountInWords'],
                '销售方名称': words_result['SellerName'],
                '销售方纳税人识别号': words_result['SellerRegisterNum'],
                '销售方银行及账户': words_result['SellerBank'],
                '销售方地址及电话': words_result['SellerAddress']}
        return info


def main(fp_base=r"C:\代理手续费\发票", save_path=r'C:\代理手续费\发票识别数据'):
    """Run OCR on every PDF in ``fp_base`` and write the rows to one workbook.

    :param fp_base: directory that holds the invoice PDFs.
    :param save_path: directory that receives the resulting ``.xlsx`` file;
        created if it does not exist.
    """
    ocr = BaiduOCR()

    # Compare the real extension case-insensitively so ".PDF" is picked up
    # and a file merely named "pdf" is not.
    pdf_names = [name for name in os.listdir(fp_base)
                 if os.path.splitext(name)[1].lower() == '.pdf']

    infolist = []
    if pdf_names:
        # The token does not depend on the file, so request it once instead
        # of once per invoice; skip the request when there is nothing to do.
        access_token = ocr.getAccessToken()
        for name in pdf_names:
            fp_path = os.path.join(fp_base, name)
            print(fp_path)
            content = ocr.getContent(access_token, fp_path)
            infolist.append(ocr.getUsefulInfo(content, fp_path))

    df = pd.DataFrame(infolist)
    # ExcelWriter does not create missing parent directories.
    os.makedirs(save_path, exist_ok=True)
    with pd.ExcelWriter(os.path.join(save_path, '增值税发票信息统计.xlsx')) as writer:
        df.to_excel(writer, sheet_name='增值税发票信息', index=False)


if __name__ == '__main__':
    main()
  • 4
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值