paddleocr 和 pytesseract 都是常见的 OCR 模块。不同的是,paddleocr 是百度开源的 OCR 识别库,而 pytesseract 是基于 Tesseract OCR 引擎封装的 Python OCR 库。在基础使用过程中,发现 paddleocr 比 pytesseract 慢,而且内存占用更大:识别过程中内存占用会持续增长,直到达到某个大小后,后续识别的图片才不再额外占用内存。
paddleocr
import concurrent.futures
import re, os
from paddleocr import PaddleOCR
import logging, time
import asyncio
# Wall-clock timestamp taken at import time, before the PaddleOCR engine is constructed.
start_time = time.time()
# Set the 'ppocr' logger to WARNING so that lower-severity records are dropped.
paddleocr_logger = logging.getLogger('ppocr')
paddleocr_logger.setLevel(logging.WARNING)
# Initialize the OCR engine (shared by every call to ocr_tt_quota)
ocr = PaddleOCR(use_angle_cls=True, lang='ch') # use the Chinese model
def _group_rows(result, y_threshold=10):
    """Group OCR nodes into visual rows using the y coordinate of ``bbox[1]``.

    :param result: value returned by ``ocr.ocr``: an iterable of pages, each
        page an iterable of ``(bbox, text)`` nodes, or ``None``/empty when
        nothing was recognized on that page
    :param y_threshold: a node starts a new row when its y coordinate exceeds
        the y of the current row's first node by more than this many pixels
    :return: list of rows, each row a list of ``(bbox, text)`` nodes
    """
    rows, current_row = [], []
    last_y = None
    for page in result or []:
        # A page with no detected text comes back as None; iterating it
        # would raise TypeError, so skip it.
        if not page:
            continue
        for bbox, text in page:
            # NOTE(review): bbox[1] is presumably the top-right corner of the
            # text box -- confirm against the PaddleOCR result layout.
            y1 = bbox[1][1]
            if last_y is None or y1 - last_y > y_threshold:
                # First node, or far enough below the current row: start a new row.
                if current_row:
                    rows.append(current_row)
                current_row = [(bbox, text)]
                last_y = y1
            else:
                # Otherwise the node belongs to the current row.
                current_row.append((bbox, text))
    if current_row:
        rows.append(current_row)
    return rows


def _find_quota(rows):
    """Return ``(text_0, text_1)`` for the first row mentioning the quota label.

    Only the first two nodes of a row are inspected. ``(None, None)`` is
    returned when no row matches.
    """
    label = '单日累计购买上限'
    for row in rows:
        if len(row) > 1:
            text_0, text_1 = row[0][1][0], row[1][1][0]
            if label in text_0 or label in text_1:
                # NOTE(review): the swap is triggered by inequality, not by
                # "label not in text_0", so a first text that merely contains
                # the label is also swapped. Kept as in the original -- confirm.
                if text_0 != label:
                    text_0, text_1 = text_1, text_0
                return text_0, text_1
        elif row[0][1][0] == label:
            # The label stands alone on its row: report the quota as unlimited.
            return label, '不限额'
    return None, None


def ocr_tt_quota(img_path, filename) -> tuple:
    """Run OCR on one image and extract the "单日累计购买上限" entry.

    :param img_path: path of the image, passed to ``ocr.ocr``
    :param filename: unused; kept so existing callers keep working
    :return: ``(img_path, text_0, text_1)``. ``text_0``/``text_1`` are the
        texts of the first two nodes of the matching row (swapped when the
        first one is not exactly the label), or ``(label, '不限额')`` when the
        label is alone on its row, or ``(None, None)`` when the label is not
        found. Previously a miss returned the texts of the last multi-node
        row that happened to be scanned.
    """
    rows = _group_rows(ocr.ocr(img_path, cls=True))
    text_0, text_1 = _find_quota(rows)
    return img_path, text_0, text_1
async def main():
    """Run ``ocr_tt_quota`` on every file under ``./img`` and print each result.

    The OCR calls are submitted to a thread pool with a single worker, so they
    run one at a time off the event-loop thread. Results are printed in
    submission order once all of them have finished.
    """
    loop = asyncio.get_running_loop()
    task_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        for root, _dirs, files in os.walk('./img'):
            # Iterate the files of each directory; the original referenced an
            # undefined name ``i`` here and never looked at ``files``.
            for filename in files:
                task_list.append(
                    loop.run_in_executor(
                        executor, ocr_tt_quota, os.path.join(root, filename), filename
                    )
                )
        # Await inside the ``with`` so the pool is still open while tasks run.
        data_list = await asyncio.gather(*task_list)
    for data in data_list:
        print(data)
# Start the asynchronous OCR batch only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
pytesseract
import cv2
import numpy as np
import pytesseract
# Tell pytesseract which Tesseract executable to run; this is a hard-coded Windows path.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def preprocess_image(image_path, crop=(800, 2388, 0, 1080)):
    """
    Prepare an image for OCR: grayscale, adaptive binarization, crop, denoise.

    :param image_path: path of the image file; the bytes are read with
        ``np.fromfile`` and decoded with ``cv2.imdecode``
        (NOTE(review): presumably so non-ASCII paths work on Windows -- confirm)
    :param crop: ``(top, bottom, left, right)`` pixel bounds of the region to
        keep, applied after binarization; the default is the window that was
        previously hard-coded. Pass ``None`` to keep the whole image.
    :return: the processed single-channel image array
    :raises ValueError: if the file is empty or cannot be decoded as an image
    """
    raw = np.fromfile(image_path, dtype=np.uint8)
    if raw.size == 0:
        raise ValueError(f'image file is empty: {image_path}')
    # IMREAD_COLOR always decodes to an 8-bit 3-channel BGR image, which is
    # what BGR2GRAY and adaptiveThreshold below require. The previous flag -1
    # (IMREAD_UNCHANGED) could return a grayscale or 16-bit image and make
    # the following calls fail.
    image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f'cannot decode image: {image_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    if crop is not None:
        top, bottom, left, right = crop
        image = image[top:bottom, left:right]
    # Denoise
    image = cv2.medianBlur(image, 3)
    return image
def extract_text(image_path, lang='chi_sim'):
    """
    Extract the text contained in an image.

    The image is first run through ``preprocess_image`` and the result is
    handed to Tesseract.

    :param image_path: path of the image
    :param lang: Tesseract language model(s) (``chi_sim`` Simplified Chinese, ``eng`` English)
    :return: the extracted text
    """
    return pytesseract.image_to_string(preprocess_image(image_path), lang=lang)
# Scan every file under ./img and print the text that follows the
# "单日累计购买上限" label on each OCR line that contains it.
for root, dirs, files in os.walk('./img'):
    for filename in files:
        # Build the path from ``root`` so files in sub-directories are found too.
        # Recognize both Chinese and English (digits).
        text = extract_text(os.path.join(root, filename), lang='chi_sim+eng')
        for line in text.split('\n'):
            if '单日累计购买上限' in line:
                # Drop spaces and the label itself, leaving only the value.
                value = line.replace(' ', '').replace('单日累计购买上限', '')
                # Report the file being processed (the original used an undefined ``i``).
                print(f"{filename} 单日累计购买上限 ", value)