如果自己实现的话,需要先做一次直线检测,然后划分出一个个单元格,再将各单元格图像送入字符识别模块进行识别,最后根据行列位置写入xls文件。不过既然有现成的API,加上又不是公司的项目,就直接调用API了。
申请一个百度开发者账号,新建一个OCR应用,下载SDK。百度的表格识别API分成同步和异步两种:同步接口需要提交申请才能调用,异步接口可以直接使用。每天有50次免费调用额度。
# -*- coding: utf-8 -*-
# -------------------------------------------------------------------------------
# Name: ocr_online.py
# Purpose: ocr表格识别
#
# Author: BQH
#
# Created: 2018-12-06
# Copyright: (c) Administrator 2018
# Licence: <your licence>
# -------------------------------------------------------------------------------
import base64
import os
import time

import cv2
import numpy as np
import requests
from aip import AipOcr
# Working directories (absolute Windows paths; adjust to the local machine).
# Raw input images read by image_process().
data_dir = r'E:\code\ocr_online\data'
# Preprocessed images: written by image_process(), read and then deleted by main().
img_dir = r'E:\code\ocr_online\img'
# Destination for the .xls files downloaded by main().
result_dir = r'E:\code\ocr_online\result'
# OCR application credentials. The literals below are placeholders
# ("your app id" / "your api key" / "your key") and must be replaced
# with real values before the client can be used.
APP_ID = '你的app id'
API_KEY = '你的 api key'
SECRET_KEY = '你的 key'
# Shared OCR client; main() uses it to submit images and poll for results.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def image_process():
    """Preprocess every image in ``data_dir`` and write the result to ``img_dir``.

    Each file is loaded as grayscale, binarized with a fixed threshold of 230
    (pixels above it become 255, the rest 0), smoothed with a 3x3 Gaussian
    blur and finally sharpened with a 3x3 kernel. The output file keeps the
    name of the input file.

    Directory entries that OpenCV cannot decode (``cv2.imread`` returns
    ``None`` for them) are skipped with a message instead of aborting the
    whole run.
    """
    # 3x3 sharpening kernel (weights sum to 1). It does not depend on the
    # image, so it is built once rather than on every iteration.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], np.float32)
    for name in os.listdir(data_dir):
        img = cv2.imread(os.path.join(data_dir, name), 0)  # flag 0 = grayscale
        if img is None:
            # Sub-directory, non-image file or corrupt image.
            print('skipped (not a readable image): {0}'.format(name))
            continue
        _, binary = cv2.threshold(img, 230, 255, cv2.THRESH_BINARY)
        binary = cv2.GaussianBlur(binary, (3, 3), 0)
        dst = cv2.filter2D(binary, -1, kernel=kernel)
        cv2.imwrite(os.path.join(img_dir, name), dst)
def get_file_content(filePath):
    """Return the complete contents of the file at ``filePath`` as bytes."""
    # Binary mode: the data is returned exactly as stored, with no decoding.
    with open(filePath, mode='rb') as image_file:
        payload = image_file.read()
    return payload
def file_download(url, file_path, timeout=30):
    """Download ``url`` with an HTTP GET and save the body to ``file_path``.

    Args:
        url: Address to fetch.
        file_path: Destination file; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the caller indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        OSError: If ``file_path`` cannot be written.
    """
    r = requests.get(url, timeout=timeout)
    # Check the status before opening the file so that an HTTP error page
    # is never stored as if it were the requested document.
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
def _wait_for_table_result(req_id, max_polls=60):
    """Poll the OCR service for job ``req_id`` and return its result URL.

    Args:
        req_id: Request id taken from the submission response.
        max_polls: Upper bound on the number of status queries, so a job
            that never reports completion cannot block the batch forever.

    Returns:
        The ``result_data`` value of the response once ``ret_msg`` equals
        '已完成', or ``None`` if the response lacks the expected fields or
        the job is still unfinished after ``max_polls`` queries.
    """
    for attempt in range(max_polls):
        # 3 s before the first query, 5 s between later ones.
        time.sleep(3 if attempt == 0 else 5)
        res = client.getTableRecognitionResult(req_id)
        try:
            result = res['result']
            if result['ret_msg'] == '已完成':
                return result['result_data']
        except (KeyError, TypeError):
            # Response without the expected fields (presumably an error
            # payload from the service): show it instead of hiding it.
            print(res)
            return None
    print('request {0}: not finished after {1} polls, giving up'.format(
        req_id, max_polls))
    return None


def main():
    """Run table recognition on every file in ``img_dir``.

    For each file: submit it to the asynchronous table-recognition API, wait
    for the job to finish, download the resulting spreadsheet into
    ``result_dir`` as ``<file name without extension>.xls``, print a progress
    line and delete the processed image from ``img_dir``.

    A submission response without a request id is printed and stops the
    whole batch. A job that fails while polling or downloading is reported
    and skipped; its image is left in ``img_dir`` so it can be retried.
    """
    num = 0
    for name in os.listdir(img_dir):
        image = get_file_content(os.path.join(img_dir, name))
        res = client.tableRecognitionAsync(image)
        try:
            req_id = res['result'][0]['request_id']
        except (KeyError, IndexError, TypeError):
            # No request id in the response: print it and stop the batch
            # (presumably a credential or quota problem that would hit every
            # remaining file too; verify against the API error codes).
            print(res)
            break

        url = _wait_for_table_result(req_id)
        if url is None:
            continue

        # splitext drops only the final extension, so 'a.b.png' becomes
        # 'a.b.xls' rather than colliding with other 'a.*' files.
        xls_name = os.path.splitext(name)[0] + '.xls'
        try:
            file_download(url, os.path.join(result_dir, xls_name))
        except (requests.RequestException, OSError) as e:
            print('{0}: download failed: {1}'.format(name, e))
            continue

        num += 1
        print('{0}: {1} 完成!'.format(num, xls_name))
        try:
            os.remove(os.path.join(img_dir, name))
        except OSError as e:
            # The result is already saved; failing to clean up is not fatal.
            print('{0}: could not remove processed image: {1}'.format(name, e))
# Script entry point: runs the recognition batch only. image_process() is not
# called here, so img_dir must already contain the images to recognize.
if __name__ == '__main__':
    main()