目标
识别图片中的表格,并将其内容保存为 Excel 文件。
代码
# Read the source image as raw bytes and base64-encode it.
# The file is closed when the with-block exits; `f` itself stays bound.
with open(picture, "rb") as f:
    img_data = f.read()
img_base64 = b64encode(img_data)
# Credentials: SecretId and SecretKey are applied for on Tencent Cloud.
cred = credential.Credential(SecretId, SecretKey)

# Client profile whose HTTP profile targets the OCR endpoint.
clientProfile = ClientProfile()
httpProfile = HttpProfile()
httpProfile.endpoint = "ocr.tencentcloudapi.com"
clientProfile.httpProfile = httpProfile
client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

# JSON payload carrying the base64 image (bytes decoded to text as UTF-8).
params = "".join(('{"ImageBase64":"', str(img_base64, 'utf-8'), '"}'))

# Populate the TableOCR request from the payload and send it.
req = models.TableOCRRequest()
req.from_json_string(params)
resp = client.TableOCR(req)
# Convert the OCR response to JSON text and parse it.
result1 = loads(resp.to_json_string())

# Three parallel lists, one entry per detected text item:
# its 'RowTl' value, its 'ColTl' value and its cleaned text.
# NOTE(review): RowTl / ColTl are presumably the cell's top-left row and
# column indices -- confirm against the TableOCR response documentation.
rowIndex = []
colIndex = []
content = []

# Walk the detections, recording row/column and stripping unwanted
# characters (line breaks, spaces, commas, the yuan sign) from the text.
for item in result1['TextDetections']:
    rowIndex.append(item['RowTl'])
    colIndex.append(item['ColTl'])
    for unwanted in ("\n", " ", ",", "¥"):
        item['Text'] = item['Text'].replace(unwanted, '')
    content.append(item['Text'])
# Export to Excel (ExcelWriter approach): first arrange the cells on a grid.
# Wrapping the lists in Series provides unique() for the axis labels.
rowIndex = Series(rowIndex)
colIndex = Series(colIndex)

# Sorted distinct row / column values become the frame's index and columns.
index = rowIndex.unique()
index.sort()
columns = colIndex.unique()
columns.sort()

# Start from an empty frame; positions that no detection fills stay NaN.
data = DataFrame(index=index, columns=columns)
for row, col, text in zip(rowIndex, colIndex, content):
    # NOTE(review): assuming `sub` is re.sub, an empty pattern replaced by an
    # empty string leaves the text unchanged -- the intended pattern may have
    # been lost; confirm or drop this call.
    data.loc[row, col] = sub("", "", text)
# Save the table as an Excel file named after the opened image file.
from os.path import splitext

# splitext strips only a trailing extension of the final path component, so
# a name without an extension, or with a dot only in a directory part, still
# yields a correct "<image name>.xlsx" path.
xlsx_path = splitext(f.name)[0] + ".xlsx"

# The context manager saves and closes the workbook on exit; the separate
# writer.save() method no longer exists in pandas 2.0 and later.
with ExcelWriter(xlsx_path, engine='xlsxwriter') as writer:
    # Write raw cell values only: no index column and no header row.
    data.to_excel(writer, sheet_name='Sheet1', index=False, header=False)
总结
- 需要用到腾讯云的 SDK;文字识别(OCR)服务以及 SecretId、SecretKey 可以在腾讯云官网申请开通。
- 不限于表格:其它票据、文档也可以识别,只需换用对应的 API;返回的 JSON 结构会有所不同,后续处理需要相应调整,但思路类似,并不复杂。
- 保存的 Excel 文件以图片名命名(扩展名改为 .xlsx)。