基本思路
调用文字识别服务商API,目前市场上成熟的产品一个是百度云,另一个是薪火科技。经过试用发现,百度云的表格识别功能在识别规范的电子文档中表格(比如PDF文件),在识别手机照片中表格时准确度一塌糊涂。相比之下,薪火科技表格文字识别功能更为可靠。
另外,所识别的Excel表格中存在一个格子包含多行文本,需要借助VBA将多行文本拆分成单独的格子。
百度表格文字识别API调用代码
# encoding:utf-8
import requests
import base64
import time
import urllib.request
'''
表格文字识别(异步接口)
'''
'''
请求access token
'''
def getToken(ak, sk):
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id='+ak+'&client_secret='+sk
response = requests.get(host)
if response:
access_token = response.json()['access_token']
print("---1---")
return access_token
else:
return ''
'''
发送识别请求
'''
def postRqeust(access_token, img_path):
request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/request"
# 二进制方式打开图片文件
f = open(img_path, 'rb')
img = base64.b64encode(f.read())
params = {"image":img, "request_type": 'xls'}
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers)
if response:
print ('---2---')
request_id = response.json()['result'][0]['request_id']
return request_id
'''
获取识别结果
'''
def getRequestResult(access_token, request_id, xls_path):
request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result"
params = {"request_id":request_id, "result_type": "excel"}
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
time.sleep(10)
while True:
response = requests.post(request_url, data=params, headers=headers)
print(response.json())
if response:
result = response.json()["result"]
file_url = result["result_data"]
if file_url!="":
urllib.request.urlretrieve(file_url, xls_path)
# res = requests.get(file_url)
# with open(xls_path, "wb") as xls:
# xls.write(res.content)
break
else:
time.sleep(3)
continue
if __name__ == "__main__":
ak = ""
sk = ""
access_token = getToken(ak, sk)
img_path="./test3.jpg"
xls_path = "./test3.xls"
request_id = postRqeust(access_token, img_path)
getRequestResult(access_token, request_id, xls_path)
薪火科技表格文字识别调用代码
# encoding:utf-8
import requests
import base64
import time
import json
import urllib.request
def saveXls(xls_url, xls_path):
urllib.request.urlretrieve(xls_url, xls_path)
def postRequest(token, img_path):
request_url = "https://www.xinhuokj.com/api/table_recog_excel"
headers = {'content-type': 'application/json'}
f = open(img_path, 'rb')
img = base64.b64encode(f.read())
params = {"image":str(img, encoding="utf-8"), "type": '11', "token": token}
response = requests.post(request_url, data = json.dumps(params), headers = headers)
if response:
return response.json()['data']['filepath']
if __name__ == "__main__":
token = ""
img_path = "test2.jpg"
xls_path = "result.xls"
xls_url = postRequest(token, img_path)
saveXls(xls_url, xls_path)
VBA将一个单元格中多行文本分割成一列多个单元格
' 基本思路:
' 1、遍历所有存在多行文本的单元格
' 2、对单元格中的多行文本进行分割,将分割后的字符串一一保存到一个新的数组,
' 3、最后赋值到指定的单元格区域
' 1.原始
Sub 分割单元格()
Dim arr, brr(1 To 1000, 1 To 1), i&, x&, m&
arr = Range("c1:c" & Range("c65536").End(3).Row)
For x = 1 To UBound(arr)
arr_split = VBA.Split(arr(x, 1) & Chr(10), Chr(10))
For j = LBound( arr_split) To UBound( arr_split) - 1
n = n + 1
brr(n, 1) = arr_split(j)
Next
Next
Range("a1").Resize(n, 1) = brr
End Sub
' 2.优化
Sub 分割单元格()
Dim arr, brr(1 To 1000, 1 To 1), i&, x&
arr = Range("c1:c" & Range("c65536").End(3).Row)
For x = 1 To UBound(arr)
For i = 1 To Len(arr(x, 1)) + 1
If Mid(arr(x, 1) & Chr(10), i, 1) = Chr(10) Then
m = m + 1
n = n + 1
brr(n, 1) = VBA.Split(arr(x, 1) & Chr(10), Chr(10))(m - 1)
End If
Next
m = 0
Next
Range("a1").Resize(n, 1) = brr
End Sub
' 3.增加对话框
Sub 分割单元格()
c = Application.InputBox("", "输入列号")
r = Application.InputBox("", "输入行号")
t = Application.InputBox("", "保存单元格")
Dim arr, brr(1 To 1000, 1 To 1), i&, x&
e = Range(c & "65536").End(3).Row
If e - r = 0 Then
e = r + 1
End If
arr = Range(c & r & ":" & c & e)
u = UBound(arr)
For x = 1 To u
For i = 1 To Len(arr(x, 1)) + 1
If Mid(arr(x, 1) & Chr(10), i, 1) = Chr(10) Then
m = m + 1
n = n + 1
brr(n, 1) = VBA.Split(arr(x, 1) & Chr(10), Chr(10))(m - 1)
End If
Next
m = 0
Next
Range(t).Resize(n, 1) = brr
End Sub
' 4.自动识别待分割单元格
Sub 分割单元格()
' 获取待分割范围最大行列号
min_row = Application.InputBox("", "输入起始行号")
max_row = Range("b65536").End(xlUp).Row
max_col = Cells(1, Columns.Count).End(xlToLeft).Column
row_result = max_row + 3
If max_row - min_row = 0 Then
max_row = min_row + 1
End If
Dim arr_ori, arr_rst()
ReDim arr_rst(1 To 1000, 1 To 1)
' 逐列分割
For cl = 1 To max_col
' 将列号转为字母表示
col_alpha = Replace(Cells(1, cl).Address(0, 0), "1", "")
arr_ori = Range(col_alpha & min_row & ":" & col_alpha & max_row)
' 逐行分割
For rw = 1 To (max_row - min_row)
' 不为空的单元格逐字符遍历
If Not IsEmpty(arr_ori(rw, 1)) Then
For i = 1 To Len(arr_ori(rw, 1)) + 1
' 判断字符是否为换行符,是的话保存到arr_rst下一行
If Mid(arr_ori(rw, 1) & Chr(10), i, 1) = Chr(10) Then
m = m + 1
n = n + 1
arr_rst(n, 1) = VBA.Split(arr_ori(rw, 1) & Chr(10), Chr(10))(m - 1)
End If
Next
m = 0
End If
Next
Range(col_alpha & row_result).Resize(n, 1) = arr_rst
ReDim arr_rst_re(1 To 1000, 1 To 1)
arr_rst = arr_rst_re
n = 0
Next
End Sub
' 一些补充知识
' brr(1 To 1000, 1 To 1) 声明一个行数为1000,列数为1的数组
' i&即声明i为Long
' Range("c1:c5")为选中单元格范围,&符号为连接字符串之用
' Range().End(3).Row为单元格范围有数值的最后一行的行号
' LBound为数组开始下标,UBound为结束下标
' For循环中可以使用未事先声明的变量,起始值默认为0
' Chr(10)为换行符