【百度云&薪火科技】表格文字识别

最新推荐文章于 2023-09-24 13:55:39 发布

RealMoYe

最新推荐文章于 2023-09-24 13:55:39 发布

阅读量684

点赞数 1

文章标签：百度

本文链接：https://blog.csdn.net/baidu_26646129/article/details/113884611

版权

理论技术：大数据 | 数据挖掘 | 机器学习专栏收录该内容

4 篇文章 0 订阅

订阅专栏

基本思路

调用文字识别服务商API，目前市场上成熟的产品一个是百度云，另一个是薪火科技。经过试用发现，百度云的表格识别功能在识别规范的电子文档中表格（比如PDF文件），在识别手机照片中表格时准确度一塌糊涂。相比之下，薪火科技表格文字识别功能更为可靠。

另外，所识别的Excel表格中存在一个格子包含多行文本，需要借助VBA将多行文本拆分成单独的格子。

百度表格文字识别API调用代码

# encoding:utf-8

import requests
import base64
import time
import urllib.request

'''
表格文字识别(异步接口)
'''

'''
请求access token
'''
def getToken(ak, sk):
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id='+ak+'&client_secret='+sk
    response = requests.get(host)
    if response:
        access_token = response.json()['access_token']
        print("---1---")
        return access_token
    else:
        return ''

'''
发送识别请求
'''
def postRqeust(access_token, img_path):
    request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/request"
    # 二进制方式打开图片文件
    f = open(img_path, 'rb')
    img = base64.b64encode(f.read())

    params = {"image":img, "request_type": 'xls'}
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers)
    if response:
        print ('---2---')
        request_id = response.json()['result'][0]['request_id']
        return request_id

'''
获取识别结果
'''
def getRequestResult(access_token, request_id, xls_path):
    request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result"
    params = {"request_id":request_id, "result_type": "excel"}
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    time.sleep(10)

    while True:
        response = requests.post(request_url, data=params, headers=headers)
        print(response.json())
        if response:
            result = response.json()["result"]
            file_url = result["result_data"]
            if file_url!="":
                urllib.request.urlretrieve(file_url, xls_path)
                # res = requests.get(file_url)
                # with open(xls_path, "wb") as xls:
                #     xls.write(res.content)
                break
            else:
                time.sleep(3)
                continue
   

if __name__ == "__main__":
    ak = ""
    sk = ""
    access_token = getToken(ak, sk)
    img_path="./test3.jpg"
    xls_path = "./test3.xls"
    request_id = postRqeust(access_token, img_path)
    getRequestResult(access_token, request_id, xls_path)

薪火科技表格文字识别调用代码

# encoding:utf-8

import requests
import base64
import time
import json
import urllib.request


def saveXls(xls_url, xls_path):
    urllib.request.urlretrieve(xls_url, xls_path)

def postRequest(token, img_path):
    request_url = "https://www.xinhuokj.com/api/table_recog_excel"
    headers = {'content-type': 'application/json'}
    f = open(img_path, 'rb')
    img = base64.b64encode(f.read())
    params = {"image":str(img, encoding="utf-8"), "type": '11', "token": token}
    response = requests.post(request_url, data = json.dumps(params), headers = headers)
    if response:
        return response.json()['data']['filepath']


if __name__ == "__main__":
    token = ""
    img_path = "test2.jpg"
    xls_path = "result.xls"
    xls_url = postRequest(token, img_path)
    saveXls(xls_url, xls_path)

VBA将一个单元格中多行文本分割成一列多个单元格

' 基本思路：
' 1、遍历所有存在多行文本的单元格
' 2、对单元格中的多行文本进行分割，将分割后的字符串一一保存到一个新的数组，
' 3、最后赋值到指定的单元格区域

' 1.原始
Sub 分割单元格()
    Dim arr, brr(1 To 1000, 1 To 1), i&, x&, m&
    arr = Range("c1:c" & Range("c65536").End(3).Row)
    For x = 1 To UBound(arr)
        arr_split = VBA.Split(arr(x, 1) & Chr(10), Chr(10))
        For j = LBound( arr_split) To UBound( arr_split) - 1
            n = n + 1
            brr(n, 1) =  arr_split(j)
        Next
    Next
    
    Range("a1").Resize(n, 1) = brr
End Sub


' 2.优化
Sub 分割单元格()
    Dim arr, brr(1 To 1000, 1 To 1), i&, x&
    arr = Range("c1:c" & Range("c65536").End(3).Row)
    For x = 1 To UBound(arr)
        For i = 1 To Len(arr(x, 1)) + 1
            If Mid(arr(x, 1) & Chr(10), i, 1) = Chr(10) Then
                m = m + 1
                n = n + 1
                brr(n, 1) = VBA.Split(arr(x, 1) & Chr(10), Chr(10))(m - 1)
            End If
        Next
        m = 0
    Next
    
    Range("a1").Resize(n, 1) = brr
End Sub


' 3.增加对话框
Sub 分割单元格()
    c = Application.InputBox("", "输入列号")
    r = Application.InputBox("", "输入行号")
    t = Application.InputBox("", "保存单元格")
    
    Dim arr, brr(1 To 1000, 1 To 1), i&, x&
    
    e = Range(c & "65536").End(3).Row
    If e - r = 0 Then
        e = r + 1
    End If
        
    arr = Range(c & r & ":" & c & e)
    
    u = UBound(arr)
    For x = 1 To u
        For i = 1 To Len(arr(x, 1)) + 1
            If Mid(arr(x, 1) & Chr(10), i, 1) = Chr(10) Then
                m = m + 1
                n = n + 1
                brr(n, 1) = VBA.Split(arr(x, 1) & Chr(10), Chr(10))(m - 1)
            End If
        Next
        m = 0
    Next
    
    Range(t).Resize(n, 1) = brr
End Sub

' 4.自动识别待分割单元格
Sub 分割单元格()
    ' 获取待分割范围最大行列号
    min_row = Application.InputBox("", "输入起始行号")
    max_row = Range("b65536").End(xlUp).Row
    max_col = Cells(1, Columns.Count).End(xlToLeft).Column
    row_result = max_row + 3
    
    If max_row - min_row = 0 Then
        max_row = min_row + 1
    End If
    
    Dim arr_ori, arr_rst()
    ReDim arr_rst(1 To 1000, 1 To 1)
    
    ' 逐列分割
    For cl = 1 To max_col
    
        ' 将列号转为字母表示
        col_alpha = Replace(Cells(1, cl).Address(0, 0), "1", "")
        arr_ori = Range(col_alpha & min_row & ":" & col_alpha & max_row)
        
        
        ' 逐行分割
        For rw = 1 To (max_row - min_row)
        
            ' 不为空的单元格逐字符遍历
            If Not IsEmpty(arr_ori(rw, 1)) Then
                For i = 1 To Len(arr_ori(rw, 1)) + 1
                
                    ' 判断字符是否为换行符，是的话保存到arr_rst下一行
                    If Mid(arr_ori(rw, 1) & Chr(10), i, 1) = Chr(10) Then
                        m = m + 1
                        n = n + 1
                        arr_rst(n, 1) = VBA.Split(arr_ori(rw, 1) & Chr(10), Chr(10))(m - 1)
                    End If
                    
                Next
                m = 0
            End If
        Next
        Range(col_alpha & row_result).Resize(n, 1) = arr_rst
        ReDim arr_rst_re(1 To 1000, 1 To 1)
        arr_rst = arr_rst_re
        n = 0
        
    Next
End Sub


' 一些补充知识
' brr(1 To 1000, 1 To 1) 声明一个行数为1000，列数为1的数组
' i&即声明i为Long
' Range("c1:c5")为选中单元格范围，&符号为连接字符串之用
' Range().End(3).Row为单元格范围有数值的最后一行的行号
' LBound为数组开始下标，UBound为结束下标
' For循环中可以使用未事先声明的变量，起始值默认为0
' Chr(10)为换行符

RealMoYe

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
【百度云&薪火科技】表格文字识别

# encoding:utf-8import requestsimport base64import timeimport urllib.request'''表格文字识别(异步接口)''''''请求access token'''def getToken(ak, sk): host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id='+ak+'&am
复制链接

扫一扫