链接
[link](https://blog.csdn.net/weixin_42343812/article/details/85100044) 补充内容:利用文字定位模块与文字识别模块的接口批量处理图片,生成 OCR 结果,并加上标记结果进行比对。
生成后的结果如下所示。
代码片
下面展示一些内联代码片。
// A code block
var foo = 'bar';
# Export the parsed OCR records next to the labelled data as .xlsx workbooks.
# Relies on `b` (one "all,chinese,english" record per image) and `b_img_name`
# (the matching image names) produced by the OCR-result parsing part of this
# script, and on the script's `os` / `xlsxwriter` imports.

OUTPUT_DIR = './img10000_ocr/'
IMAGE_DIR = '/home/xudong/ocr/img10000/'
LABEL_FILE = '/home/xudong/ocr/dataset.txt'
FIRST_IMAGE_ROW = 3     # 1-based worksheet row of the first image
ROLLOVER_STEP = 1000    # record indices that are multiples of this are boundaries


def load_labels(lines):
    """Read the labelled data (tuples of all parsed label lines).

    Args:
        lines: iterable of comma-separated text lines (e.g. an open file).

    Returns:
        A list with one ``[field_1, field_2]`` pair per line, i.e. the second
        and third comma-separated fields (called ``eng_j`` and ``chi_j`` in
        the original script). The fields are stored untouched, so a trailing
        newline stays in place when field 2 is the last one on the line.

    Raises:
        IndexError: if a line has fewer than three comma-separated fields.
    """
    labels = []
    for line in lines:
        fields = line.split(',')
        labels.append([fields[1], fields[2]])
    return labels


def workbook_path(index):
    """Return the path of output workbook number *index*."""
    return OUTPUT_DIR + 'pict_' + str(index) + '.xlsx'


def open_workbook(index):
    """Create workbook *index* (removing a stale file) and its 'demo' sheet."""
    path = workbook_path(index)
    if os.path.exists(path):
        os.remove(path)
    book = xlsxwriter.Workbook(path)
    return book, book.add_worksheet('demo')


def write_workbooks(records, image_names, labels):
    """Write every record except index 0 into the output workbooks.

    Index 0 is skipped exactly as in the original loop (``range(1, len(b))``).
    For each record the source image and its ``_ctpn`` variant are inserted
    in columns A and B, and the texts are written on the same row:
    column 2 = full OCR text, 4 = Chinese part, 5 = label field 2,
    7 = English part, 8 = label field 1 (0-based column indices).

    Args:
        records: list of "all,chinese,english" strings.
        image_names: list of image names; each must be convertible with
            ``int()`` because it is used as an index into *labels*.
        labels: list of ``[field_1, field_2]`` pairs from ``load_labels``.

    Returns:
        The last progress value printed (1 when nothing was written).
    """
    book, sheet = open_workbook(0)
    row = FIRST_IMAGE_ROW
    first_boundary_seen = False
    progress = 1
    for i in range(1, len(records)):
        image_name = image_names[i]
        fields = records[i].split(',')
        ocr, chi, eng = fields[0], fields[1], fields[2]
        if i % ROLLOVER_STEP == 0:
            # NOTE(review): the first boundary (i == 1000) only sets the flag,
            # so the first workbook receives records 1..1999. Kept as in the
            # original - confirm this is intended.
            if not first_boundary_seen:
                first_boundary_seen = True
            else:
                # XlsxWriter only produces the file on close(), so the
                # finished workbook must be closed before it is replaced.
                book.close()
                # Integer division keeps the stale-file check and the created
                # file on the same name ('pict_2.xlsx', not 'pict_2.0.xlsx').
                book, sheet = open_workbook(i // ROLLOVER_STEP)
                row = FIRST_IMAGE_ROW
        sheet.insert_image('A' + str(row), IMAGE_DIR + image_name + '.jpg')
        sheet.insert_image('B' + str(row), IMAGE_DIR + image_name + '_ctpn.jpg')
        # write() takes 0-based rows, insert_image() 1-based cell names.
        sheet.write(row - 1, 2, ocr)
        sheet.write(row - 1, 4, chi)
        sheet.write(row - 1, 7, eng)
        label = labels[int(image_name)]
        sheet.write(row - 1, 8, label[0])
        sheet.write(row - 1, 5, label[1])
        progress = i + 1
        print(progress)
        row += 1
    book.close()
    return progress


if __name__ == '__main__':
    with open(LABEL_FILE, 'r') as file_json:
        json_engchi = load_labels(file_json)
    print(write_workbooks(b, b_img_name, json_engchi))
全部代码如下:
import os
import shutil
import xlsxwriter
import sys
import re
import cv2
# Parse the OCR result file into `b_img_name` (image names) and `b`
# (one "all,chinese,english" record per image).

# re.findall returns the captured character of every match, so joining the
# result drops every character outside the class.
# Letters, digits and characters in U+0800..U+9FA5.
_KEEP_TEXT = re.compile(".*?([A-Za-z0-9\u0800-\u4e00\u4E00-\u9FA5])")
# Digits and characters in U+4E00..U+9FA5 (the "Chinese" column).
_KEEP_CHINESE = re.compile(".*?([0-9\u4E00-\u9FA5])")
# ASCII letters and digits (the "English" column).
_KEEP_ENGLISH = re.compile(".*?([A-Za-z0-9])")

# Marker placed between an image name and the OCR text that belongs to it.
_NAME_MARKER = 'jpggpj'

# Placeholder entry in use until the first image header is read. It is
# emitted as record 0 (name '0', empty texts).
_SEED_ENTRY = '0.jpggpj &'


def _keep(pattern, text):
    """Return *text* reduced to the characters captured by *pattern*."""
    return ''.join(pattern.findall(text))


def _is_image_header(filtered):
    """Return True when a filtered line starts a new image entry.

    A header looks like ``<name>jpg<text>`` with a non-empty ``<name>``.
    Any other line is OCR text belonging to the current image.
    """
    parts = filtered.split('jpg')
    if len(parts) == 1:
        return False
    name = parts[0]
    if name == '':
        return False
    # NOTE(review): as in the original, the first and last characters of the
    # name are not checked - only the ones in between must be ASCII digits.
    return all('0' <= ch <= '9' for ch in name[1:-1])


def _build_record(entry):
    """Turn an accumulated entry into ``(image_name, record)``.

    The record is "<all text>,<chinese only>,<english only>".
    """
    pieces = (_keep(_KEEP_TEXT, entry) + ',').split(_NAME_MARKER)
    image_name = pieces[0]
    all_text = pieces[1]  # already carries the separating comma
    chinese = _keep(_KEEP_CHINESE, entry.split(_NAME_MARKER)[1])
    english = _keep(_KEEP_ENGLISH, entry).split(_NAME_MARKER)[1]
    return image_name, all_text + chinese + ',' + english


def parse_ocr_results(lines):
    """Group OCR output lines by image and build one record per image.

    Args:
        lines: iterable of raw text lines (e.g. an open file).

    Returns:
        ``(names, records)`` - two parallel lists. When at least one image
        header is present, index 0 holds the placeholder entry (name '0',
        record ',,'), followed by one item per image in input order.
    """
    names = []
    records = []
    entry = _SEED_ENTRY
    for raw_line in lines:
        filtered = _keep(_KEEP_TEXT, raw_line)
        if _is_image_header(filtered):
            # A new image starts: emit what was collected for the previous one.
            image_name, record = _build_record(entry)
            names.append(image_name)
            records.append(record)
            parts = filtered.split('jpg')
            entry = parts[0] + _NAME_MARKER + parts[1]
        else:
            entry += filtered
    if records:
        # The original only emitted an entry when the *next* header arrived,
        # so the last image of the file was silently dropped. Emit it here.
        image_name, record = _build_record(entry)
        names.append(image_name)
        records.append(record)
    return names, records


if __name__ == '__main__':
    with open('/home/xudong/ocr/text_detection-master/ocr_result.txt', 'r') as f:
        b_img_name, b = parse_ocr_results(f)
    for image_name, record in zip(b_img_name, b):
        print(image_name + '.jpg ' + record)
    print(str(len(b)))
# Export the parsed OCR records next to the labelled data as .xlsx workbooks.
# Relies on `b` (one "all,chinese,english" record per image) and `b_img_name`
# (the matching image names) produced by the OCR-result parsing part of this
# script, and on the script's `os` / `xlsxwriter` imports.

OUTPUT_DIR = './img10000_ocr/'
IMAGE_DIR = '/home/xudong/ocr/img10000/'
LABEL_FILE = '/home/xudong/ocr/dataset.txt'
FIRST_IMAGE_ROW = 3     # 1-based worksheet row of the first image
ROLLOVER_STEP = 1000    # record indices that are multiples of this are boundaries


def load_labels(lines):
    """Read the labelled data, one pair per line.

    Args:
        lines: iterable of comma-separated text lines (e.g. an open file).

    Returns:
        A list with one ``[field_1, field_2]`` pair per line, i.e. the second
        and third comma-separated fields (called ``eng_j`` and ``chi_j`` in
        the original script). The fields are stored untouched, so a trailing
        newline stays in place when field 2 is the last one on the line.

    Raises:
        IndexError: if a line has fewer than three comma-separated fields.
    """
    labels = []
    for line in lines:
        fields = line.split(',')
        labels.append([fields[1], fields[2]])
    return labels


def workbook_path(index):
    """Return the path of output workbook number *index*."""
    return OUTPUT_DIR + 'pict_' + str(index) + '.xlsx'


def open_workbook(index):
    """Create workbook *index* (removing a stale file) and its 'demo' sheet."""
    path = workbook_path(index)
    if os.path.exists(path):
        os.remove(path)
    book = xlsxwriter.Workbook(path)
    return book, book.add_worksheet('demo')


def write_workbooks(records, image_names, labels):
    """Write every record except index 0 into the output workbooks.

    Index 0 is skipped exactly as in the original loop (``range(1, len(b))``).
    For each record the source image and its ``_ctpn`` variant are inserted
    in columns A and B, and the texts are written on the same row:
    column 2 = full OCR text, 4 = Chinese part, 5 = label field 2,
    7 = English part, 8 = label field 1 (0-based column indices).

    Args:
        records: list of "all,chinese,english" strings.
        image_names: list of image names; each must be convertible with
            ``int()`` because it is used as an index into *labels*.
        labels: list of ``[field_1, field_2]`` pairs from ``load_labels``.

    Returns:
        The last progress value printed (1 when nothing was written).
    """
    book, sheet = open_workbook(0)
    row = FIRST_IMAGE_ROW
    first_boundary_seen = False
    progress = 1
    for i in range(1, len(records)):
        image_name = image_names[i]
        fields = records[i].split(',')
        ocr, chi, eng = fields[0], fields[1], fields[2]
        if i % ROLLOVER_STEP == 0:
            # NOTE(review): the first boundary (i == 1000) only sets the flag,
            # so the first workbook receives records 1..1999. Kept as in the
            # original - confirm this is intended.
            if not first_boundary_seen:
                first_boundary_seen = True
            else:
                # XlsxWriter only produces the file on close(), so the
                # finished workbook must be closed before it is replaced.
                book.close()
                # Integer division keeps the stale-file check and the created
                # file on the same name ('pict_2.xlsx', not 'pict_2.0.xlsx').
                book, sheet = open_workbook(i // ROLLOVER_STEP)
                row = FIRST_IMAGE_ROW
        sheet.insert_image('A' + str(row), IMAGE_DIR + image_name + '.jpg')
        sheet.insert_image('B' + str(row), IMAGE_DIR + image_name + '_ctpn.jpg')
        # write() takes 0-based rows, insert_image() 1-based cell names.
        sheet.write(row - 1, 2, ocr)
        sheet.write(row - 1, 4, chi)
        sheet.write(row - 1, 7, eng)
        label = labels[int(image_name)]
        sheet.write(row - 1, 8, label[0])
        sheet.write(row - 1, 5, label[1])
        progress = i + 1
        print(progress)
        row += 1
    book.close()
    return progress


if __name__ == '__main__':
    with open(LABEL_FILE, 'r') as file_json:
        json_engchi = load_labels(file_json)
    write_workbooks(b, b_img_name, json_engchi)