# 1. 增值税发票识别 (VAT invoice recognition):
import os
import shutil
import sys
import urllib.request
import urllib.parse
import json
import time
import base64
import openpyxl
import xlwt
import pandas as pd
# 增值税发票识别/OCR文字识别
def _vat_find_images(file_dir):
    """Return the full path of every jpg/png/jpeg/bmp file found under *file_dir*."""
    image_paths = []
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Case-insensitive so that e.g. "SCAN.JPG" is picked up as well.
            if os.path.splitext(name)[1].lower() in ('.jpg', '.png', '.jpeg', '.bmp'):
                # os.walk descends into sub-folders, so keep the folder with the name.
                image_paths.append(os.path.join(root, name))
    return image_paths


def _vat_recognize(appcode, image_path):
    """POST one base64-encoded image to the invoice OCR endpoint; return the parsed JSON reply."""
    with open(image_path, 'rb') as image_file:
        encoded = base64.b64encode(image_file.read()).decode('utf-8')
    headers = {
        'Authorization': 'APPCODE %s' % appcode,
        'Content-Type': 'application/json; charset=UTF-8'
    }
    url_request = "https://ocrapi-invoice.taobao.com/ocrservice/invoice"
    params = json.dumps({'img': encoded}).encode(encoding='UTF8')
    req = urllib.request.Request(url_request, params, headers)
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read().decode("utf8"))


def _vat_column_titles(template):
    """Build one ``[title]`` column per string field and per key of nested list/dict fields."""
    columns = []
    for key, value in template.items():
        if isinstance(value, str):
            columns.append([key])
        elif isinstance(value, list):
            # Only the first entry of a list field is exported.
            if value and isinstance(value[0], dict):
                columns.extend([inner_key] for inner_key in value[0])
        elif isinstance(value, dict):
            columns.extend([inner_key] for inner_key in value)
    return columns


def _vat_values(record, title):
    """Return every value *record* holds for the column named *title*."""
    found = []
    for key, value in record.items():
        if isinstance(value, str):
            if key == title:
                found.append(value)
        elif isinstance(value, list):
            if value and isinstance(value[0], dict) and title in value[0]:
                found.append(value[0][title])
        elif isinstance(value, dict):
            # '联次' inside a nested dict is deliberately skipped, as in the
            # original code; presumably the same key also exists at top
            # level -- TODO confirm against a real reply.
            if title != '联次' and title in value:
                found.append(value[title])
    return found


def posturl(appcode, file_dir, move_dir, write_dir, write_filename):
    """Recognise every invoice image under *file_dir* and export the fields to Excel.

    Each image is sent to the invoice OCR endpoint and the ``data`` object of
    the reply becomes one record.  The column titles are taken from the first
    record that has a non-empty list field (falling back to the first record),
    and every column is a list ``[title, value_of_record_1, ...]`` that is
    written top to bottom into the workbook.

    :param appcode: code sent in the ``Authorization: APPCODE ...`` header.
    :param file_dir: directory searched (recursively) for jpg/png/jpeg/bmp files.
    :param move_dir: images whose request failed with an HTTP error, or whose
        reply carried no ``data`` object, are moved here.
    :param write_dir: directory the workbook is saved in.
    :param write_filename: workbook name without extension (".xlsx" is
        appended); also used as the sheet title.
    :return: ``(columns, str(err_file))``.  When nothing was recognised the
        column list is empty and no workbook is written.
    """
    err_file = []
    records = []
    for image_path in _vat_find_images(file_dir):
        try:
            reply = _vat_recognize(appcode, image_path)
        except urllib.error.HTTPError:
            reply = None
        data = reply.get('data') if isinstance(reply, dict) else None
        if isinstance(data, dict):
            records.append(data)
        else:
            err_file.append("%s error" % os.path.basename(image_path))
            shutil.move(image_path, move_dir)
        # Pause between requests, as the original script did.
        time.sleep(1)

    if not records:
        return [], str(err_file)

    # Prefer a record whose list fields are populated so their keys become titles.
    template = next(
        (record for record in records
         if any(isinstance(value, list) and value for value in record.values())),
        records[0],
    )
    fp_title_list = _vat_column_titles(template)
    for column in fp_title_list:
        title = column[0]
        for record in records:
            # Pad with None when a record lacks the field so rows stay aligned.
            column.extend(_vat_values(record, title) or [None])

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Name the sheet that actually receives the data instead of adding an empty one.
    sheet.title = u'%s' % write_filename
    for column_index, column in enumerate(fp_title_list, start=1):
        for row_index, value in enumerate(column, start=1):
            sheet.cell(row_index, column_index, value)
    workbook.save(os.path.join(write_dir, write_filename + '.xlsx'))
    return fp_title_list, str(err_file)
if __name__ == "__main__":
    # Directory containing the invoice images.
    file_dir = 'D:\\fp\\zhfp\\'
    # Directory the workbook is written to.
    write_dir = 'C:\\Users\\best\\Desktop\\'
    # Images that could not be recognised are moved here.
    move_dir = 'C:\\Users\\best\\Desktop\\'
    appcode = ' '
    # posturl() appends ".xlsx" itself, so pass the bare name; the previous
    # value 'new_fp_test.xlsx' produced "new_fp_test.xlsx.xlsx".
    write_filename = 'new_fp_test'
    # When taking the values from the command line, wrap each argument in
    # double quotes ("").
    # appcode=sys.argv[1]
    # file_dir=sys.argv[2]
    # move_dir=sys.argv[3]
    # write_to_excel=sys.argv[4]
    # write_dir=sys.argv[5]
    # write_filename=sys.argv[6]
    posturl(appcode, file_dir, move_dir, write_dir, write_filename)
# 2. 定额发票识别 (Quota invoice recognition):
import base64
import json
import os
import time
import urllib, sys
import urllib.request as urllib2
import ssl
import openpyxl
import pandas as pd
def _quota_find_images(file_dir):
    """Return the full path of every jpg/png/jpeg/bmp file found under *file_dir*."""
    image_paths = []
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Case-insensitive so that e.g. "SCAN.JPG" is picked up as well.
            if os.path.splitext(name)[1].lower() in ('.jpg', '.png', '.jpeg', '.bmp'):
                # os.walk descends into sub-folders, so keep the folder with the name.
                image_paths.append(os.path.join(root, name))
    return image_paths


def _quota_recognize(url, appcode, image_path):
    """POST one base64-encoded image to *url*; return the parsed JSON reply or None if empty."""
    with open(image_path, 'rb') as image_file:
        encoded = base64.b64encode(image_file.read()).decode('utf-8')
    post_data = json.dumps({'img': encoded}).encode(encoding='UTF8')
    request = urllib2.Request(url, post_data)
    request.add_header('Authorization', 'APPCODE ' + appcode)
    request.add_header('Content-Type', 'application/json; charset=UTF-8')
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE')
    # NOTE(review): certificate and host-name verification are switched off
    # here although the request carries the APPCODE credential.  Behaviour is
    # kept as in the original; enable verification once the endpoint's
    # certificate has been confirmed to validate.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    with urllib2.urlopen(request, context=ctx) as response:
        content = response.read().decode("utf8")
    return json.loads(content) if content else None


def _quota_values(record, title):
    """Return every value *record* holds for the column named *title*."""
    found = []
    for key, value in record.items():
        if isinstance(value, str):
            if key == title:
                found.append(value)
        elif isinstance(value, dict):
            if title in value:
                found.append(value[title])
    return found


def Quotalnvoice(appcode, file_dir, write_dir, write_filename):
    """Recognise every quota-invoice image under *file_dir* and export the fields to Excel.

    The ``data`` object of each reply becomes one record.  Column titles are
    taken from the first record that contains a nested dict (falling back to
    the first record): one column per string field and one per key of each
    nested dict.  Every column is a list ``[title, value_of_record_1, ...]``
    written top to bottom into the workbook.

    :param appcode: code sent in the ``Authorization: APPCODE ...`` header.
    :param file_dir: directory searched (recursively) for jpg/png/jpeg/bmp files.
    :param write_dir: directory the workbook is saved in.
    :param write_filename: workbook name without extension (".xlsx" is
        appended); also used as the sheet title.
    :return: None.  When nothing was recognised no workbook is written.
    """
    url = 'https://ocrapi-quota-invoice.taobao.com' + '/ocrservice/quotaInvoice'
    image_paths = _quota_find_images(file_dir)
    print([os.path.basename(path) for path in image_paths])

    err_file = []
    datas_list = []
    for image_path in image_paths:
        try:
            reply = _quota_recognize(url, appcode, image_path)
        except urllib2.HTTPError:
            # One rejected image must not discard the results gathered so far.
            reply = None
        # Pause between requests, as the original script did.
        time.sleep(1)
        data = reply.get('data') if isinstance(reply, dict) else None
        if isinstance(data, dict):
            datas_list.append(data)
        else:
            err_file.append("%s error" % os.path.basename(image_path))
    if err_file:
        print(err_file)
    if not datas_list:
        return

    # Prefer a record that has a nested dict so its keys become titles.
    template = next(
        (record for record in datas_list
         if any(isinstance(value, dict) for value in record.values())),
        datas_list[0],
    )
    fp_title_datas = []
    for key, value in template.items():
        if isinstance(value, str):
            fp_title_datas.append([key])
        elif isinstance(value, dict):
            fp_title_datas.extend([inner_key] for inner_key in value)

    for column in fp_title_datas:
        title = column[0]
        for record in datas_list:
            # Pad with None when a record lacks the field so rows stay aligned.
            column.extend(_quota_values(record, title) or [None])

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Name the sheet that actually receives the data instead of adding an empty one.
    sheet.title = u'%s' % write_filename
    for column_index, column in enumerate(fp_title_datas, start=1):
        for row_index, value in enumerate(column, start=1):
            sheet.cell(row_index, column_index, value)
    workbook.save(os.path.join(write_dir, write_filename + '.xlsx'))
if __name__ == '__main__':
    # Folder with the quota-invoice images; the workbook is written to the same folder.
    file_dir = 'D:\\fp\\_quto\\'
    write_dir = "D:\\fp\\_quto\\"
    # Workbook name without the ".xlsx" extension.
    write_filename = 'QuotaInvoice'
    # Fill in the APPCODE of the OCR service before running.
    appcode = ' '
    Quotalnvoice(
        appcode=appcode,
        file_dir=file_dir,
        write_dir=write_dir,
        write_filename=write_filename,
    )
# 3. 混贴发票识别 (Mixed / multi-invoice recognition):
import base64
import json
import os
import time
import urllib, sys
import urllib.request as urllib2
import ssl
import openpyxl
import pandas as pd
def MixedMultiInvoice(appcode, file_dir, write_dir, write_filename):
    """Send every image under *file_dir* to the mixed-invoice OCR endpoint.

    For each reply the ``subMsgs`` list is read, the ``result.data`` of every
    sub-message is printed, and the list is collected.

    :param appcode: code sent in the ``Authorization: APPCODE ...`` header.
    :param file_dir: directory searched (recursively) for jpg/png/jpeg/bmp files.
    :param write_dir: accepted for signature compatibility; not used here.
    :param write_filename: accepted for signature compatibility; not used here.
    :return: list with one ``subMsgs`` list per image that produced a reply
        (the original discarded this list and returned None).
    """
    url = 'https://ocrapi-mixed-multi-invoice.taobao.com' + '/ocrservice/mixedMultiInvoice'

    image_paths = []
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Case-insensitive so that e.g. "SCAN.JPG" is picked up as well.
            if os.path.splitext(name)[1].lower() in ('.jpg', '.png', '.jpeg', '.bmp'):
                # os.walk descends into sub-folders, so keep the folder with the name.
                image_paths.append(os.path.join(root, name))
    print([os.path.basename(path) for path in image_paths])

    err_file = []
    datas_list = []
    for image_path in image_paths:
        with open(image_path, 'rb') as image_file:
            encoded = base64.b64encode(image_file.read()).decode('utf-8')
        post_data = json.dumps({'img': encoded}).encode(encoding='UTF8')
        request = urllib2.Request(url, post_data)
        request.add_header('Authorization', 'APPCODE ' + appcode)
        request.add_header('Content-Type', 'application/json; charset=UTF-8')
        request.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE')
        # NOTE(review): certificate and host-name verification are switched
        # off here although the request carries the APPCODE credential.
        # Behaviour is kept as in the original; enable verification once the
        # endpoint's certificate has been confirmed to validate.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        try:
            with urllib2.urlopen(request, context=ctx) as response:
                content = response.read().decode("utf8")
        except urllib2.HTTPError:
            # One rejected image must not discard the results gathered so far.
            err_file.append("%s error" % os.path.basename(image_path))
            continue
        if not content:
            continue
        # Reply layout per the original note: subMsgs[] -> result{} -> data{}.
        sub_messages = json.loads(content).get('subMsgs') or []
        for sub_message in sub_messages:
            print((sub_message.get('result') or {}).get('data'))
        datas_list.append(sub_messages)
    if err_file:
        print(err_file)
    return datas_list
if __name__ == '__main__':
    # Folder with the mixed-invoice images.
    file_dir = 'D:\\fp\\mix\\'
    # Output location and name passed through to MixedMultiInvoice().
    write_dir = "D:\\fp\\_quto\\"
    write_filename = 'QuotaInvoice'
    # Fill in the APPCODE of the OCR service before running.
    appcode = ''
    MixedMultiInvoice(
        appcode=appcode,
        file_dir=file_dir,
        write_dir=write_dir,
        write_filename=write_filename,
    )
# 4. 表格识别 (Table recognition):
import sys, os
import base64
import time
import json
import urllib.request as urllib2
import base64
def get_img_base64(img_file):
    """Read *img_file* in binary mode and return its content as a base64 text string.

    :param img_file: path of the file to encode.
    :return: base64 encoding of the file's bytes, as ``str``.
    """
    with open(img_file, 'rb') as infile:
        raw = infile.read()
    return base64.b64encode(raw).decode('utf-8')
def predict(url, appcode, img_base64, kv_config, old_format):
    """POST an image (and optional configuration) to *url* and return the reply.

    :param url: endpoint to call.
    :param appcode: code sent in the ``Authorization: APPCODE ...`` header.
    :param img_base64: base64 text of the image.
    :param kv_config: optional configuration dict; omitted from the request
        when None.
    :param old_format: when true, wrap the request as
        ``{"inputs": [{"image": {"dataType": 50, "dataValue": ...}, ...}]}``;
        otherwise send ``{"image": ..., "configure": ...}`` directly.
    :return: ``(status_code, headers, body_text)``; on an HTTP error the
        error's code, headers and decoded body are returned instead of raising.
    """
    if not old_format:
        param = {'image': img_base64}
        if kv_config is not None:
            param['configure'] = kv_config
        payload = param
    else:
        param = {'image': {'dataType': 50, 'dataValue': img_base64}}
        if kv_config is not None:
            # Must stay a str: bytes here made the outer json.dumps raise TypeError.
            param['configure'] = {'dataType': 50, 'dataValue': json.dumps(kv_config)}
        payload = {"inputs": [param]}
    body = json.dumps(payload).encode(encoding='UTF8')

    headers = {'Authorization': 'APPCODE %s' % appcode}
    request = urllib2.Request(url=url, headers=headers, data=body)
    try:
        with urllib2.urlopen(request, timeout=10) as response:
            return response.code, response.headers, response.read().decode("utf8")
    except urllib2.HTTPError as e:
        # Decode so the body has the same type as on the success path.
        return e.code, e.headers, e.read().decode("utf8", errors="replace")
def demo(appcode, file_dir):
    """Run table recognition on every image under *file_dir* and save each result as .xlsx.

    For an image ``name.ext`` the decoded ``tables`` value of the reply is
    written to ``name_ext.xlsx`` in the folder the image was found in.  The
    first non-200 reply prints the error details and terminates the program.

    :param appcode: code passed to ``predict`` for the Authorization header.
    :param file_dir: directory searched (recursively) for jpg/png/jpeg/bmp files.
    :return: None.
    """
    url = 'https://form.market.alicloudapi.com/api/predict/ocr_table_parse'
    # Set to True if the request has to be wrapped in "inputs", otherwise False.
    is_old_format = False
    # Use config = None to send the request without a configuration.
    config = {'format': 'xlsx', 'finance': False, 'dir_assure': False}

    # Collect first so the .xlsx files written below never affect the walk.
    images = []
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Case-insensitive so that e.g. "SCAN.JPG" is picked up as well.
            if os.path.splitext(name)[1].lower() in ('.jpg', '.png', '.jpeg', '.bmp'):
                images.append((root, name))

    for root, name in images:
        img_base64data = get_img_base64(os.path.join(root, name))
        stat, header, content = predict(url, appcode, img_base64data, config, is_old_format)
        if stat != 200:
            print('Http status code: ', stat)
            print('Error msg in header: ', header['x-ca-error-message'] if 'x-ca-error-message' in header else '')
            print('Error msg in body: ', content)
            # sys.exit instead of the site-provided exit(), which may be
            # missing in a frozen (PyInstaller) build; non-zero marks the failure.
            sys.exit(1)
        if is_old_format:
            result_str = json.loads(content)['outputs'][0]['outputValue']['dataValue']
        else:
            result_str = content
        tables = json.loads(result_str).get("tables")
        if not tables:
            # Check before opening the output so no empty .xlsx is left behind.
            print('No table data returned for: ', name)
            continue
        out_path = os.path.join(root, '%s.xlsx' % name.replace('.', '_'))
        with open(out_path, 'wb') as fout:
            fout.write(base64.b64decode(tables))
if __name__ == '__main__':
    # Folder with the table images; results are written as .xlsx files.
    file_dir = 'D:\\fp\\table\\'
    # Fill in the APPCODE of the OCR service before running.
    appcode = ' '
    demo(appcode=appcode, file_dir=file_dir)
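# 注意: 用 pyinstaller -F 把 .py 文件打包成 exe 后, 遍历文件夹时要调用 os.walk(file_dir),
# 命令行传入 file_dir 时必须用双引号 ("file_dir")。用单引号 ('file_dir') 时, 函数里虽然能读到这个参数,
# 但传不进 os.walk()。
# Note: after packaging with `pyinstaller -F`, pass file_dir on the command line in double quotes.
# With single quotes the function still receives the argument, but os.walk() finds nothing
# (presumably because the Windows shell keeps the single quotes as part of the path -- verify).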