paddleocr 和 pytesseract 使用记录

paddleocr 和 pytesseract 都是常见的 OCR 模块。不同的是,paddleocr 是百度开源的 OCR 识别库,而 pytesseract 是基于 Tesseract OCR 引擎封装的 Python OCR 库。在基础使用过程中,我发现 paddleocr 比 pytesseract 要慢,而且内存占用更大:识别过程中内存占用会持续增长,直到达到某个大小之后,后续识别图片才不再额外占用内存。

paddleocr
import concurrent.futures
import re, os

from paddleocr import PaddleOCR
import logging, time
import asyncio

# Record the wall-clock time at which the script started.
start_time = time.time()

# Raise the 'ppocr' logger threshold to WARNING so lower-severity messages are hidden.
paddleocr_logger = logging.getLogger('ppocr')
paddleocr_logger.setLevel(logging.WARNING)
# Initialize the OCR engine
ocr = PaddleOCR(use_angle_cls=True, lang='ch')  # use the Chinese model


def ocr_tt_quota(img_path, filename) -> tuple:
    """Find the '单日累计购买上限' (daily cumulative purchase limit) row in an image.

    Runs the module-level ``ocr`` engine on ``img_path``, groups the detected
    text boxes into visual rows by their y coordinate, and returns the first
    row that mentions the limit label.

    Args:
        img_path: Path of the image handed to ``ocr.ocr``.
        filename: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(img_path, label_text, value_text)``. Both texts are ``None``
        when no row contains the label. ``value_text`` is ``'不限额'`` when the
        label sits alone on its row.
    """
    label = '单日累计购买上限'
    y_threshold = 10  # y-coordinate difference that starts a new row

    result = ocr.ocr(img_path, cls=True)

    # Group text boxes into rows.
    rows, current_row = [], []
    last_y = None
    for line in result or []:
        # NOTE(review): PaddleOCR is understood to yield None for a page on
        # which no text was detected; skip such entries instead of crashing.
        if not line:
            continue
        for bbox, text in line:
            y1 = bbox[1][1]
            if last_y is None or y1 - last_y > y_threshold:
                # First box, or far enough below the row start: begin a new row.
                if current_row:
                    rows.append(current_row)
                current_row = [(bbox, text)]
                last_y = y1
            else:
                # Otherwise the box belongs to the current row.
                current_row.append((bbox, text))
    if current_row:
        rows.append(current_row)

    # Look for the first row that carries the label.
    for row in rows:
        first = row[0][1][0]
        if len(row) > 1:
            second = row[1][1][0]
            if label in first or label in second:
                # Put the label first. Only swap when the label really is in
                # the second cell, so an inexact label in the first cell
                # does not push the value in front of it.
                if first != label and label in second:
                    first, second = second, first
                return img_path, first, second
        elif first == label:
            # Label alone on its row: no limit value is shown.
            return img_path, first, '不限额'

    # No row mentioned the label; do not leak text from unrelated rows.
    return img_path, None, None


async def main():
    """Run ``ocr_tt_quota`` on every ``.png`` file under ``./img`` and print the results.

    The OCR calls are submitted to a thread pool with ``max_workers=1``, so
    they run one at a time off the event-loop thread.
    """
    loop = asyncio.get_running_loop()
    task_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        for root, _dirs, files in os.walk('./img'):
            for filename in files:
                # The original code only ever built '.png' paths; keep that scope.
                if not filename.lower().endswith('.png'):
                    continue
                img_path = os.path.join(root, filename)
                task_list.append(
                    loop.run_in_executor(executor, ocr_tt_quota, img_path, filename)
                )

        data_list = await asyncio.gather(*task_list)

    for data in data_list:
        print(data)


if __name__ == '__main__':
    asyncio.run(main())

pytesseract
import os

import cv2
import numpy as np
import pytesseract

# Tell pytesseract which Tesseract executable to run (a Windows install path here).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def preprocess_image(image_path, *, crop_box=(800, 2388, 0, 1080)):
    """
    Image preprocessing: grayscale, adaptive binarization, crop, denoise.

    :param image_path: path of the image file to load
    :param crop_box: ``(top, bottom, left, right)`` slice bounds applied after
        binarization; pass ``None`` to keep the whole image. The default
        reproduces the previously hard-coded region.
    :return: the processed single-channel image array
    :raises ValueError: if the file is empty or cannot be decoded as an image
    """
    # Read the raw bytes and decode them in memory.
    # NOTE(review): presumably done instead of cv2.imread to cope with
    # non-ASCII paths - confirm.
    raw = np.fromfile(image_path, dtype=np.uint8)
    if raw.size == 0:
        raise ValueError(f'image file is empty: {image_path!r}')

    # Force a 3-channel 8-bit BGR result so the grayscale conversion below
    # also works for grayscale, alpha-channel and 16-bit inputs. EXIF
    # orientation stays ignored, as it was with the former "unchanged" flag.
    image = cv2.imdecode(raw, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if image is None:
        raise ValueError(f'cannot decode image: {image_path!r}')

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    if crop_box is not None:
        top, bottom, left, right = crop_box
        image = image[top:bottom, left:right]

    # Denoise
    image = cv2.medianBlur(image, 3)

    return image


def extract_text(image_path, lang='chi_sim'):
    """
    Run Tesseract on a preprocessed image and return the recognized text.

    :param image_path: path of the image file
    :param lang: Tesseract language model(s), e.g. ``chi_sim`` for Simplified
        Chinese or ``eng`` for English
    :return: the text returned by ``pytesseract.image_to_string``
    """
    # Preprocess first, then hand the result to Tesseract.
    prepared = preprocess_image(image_path)
    recognized = pytesseract.image_to_string(prepared, lang=lang)
    return recognized


# Walk ./img, OCR every file and print the text that follows the
# '单日累计购买上限' (daily cumulative purchase limit) label on each matching line.
for root, dirs, files in os.walk('./img'):
    for filename in files:
        # Recognize Chinese and English/digits; join with root so files in
        # sub-directories resolve to the right path.
        text = extract_text(os.path.join(root, filename), lang='chi_sim+eng')
        for line in text.split('\n'):
            if '单日累计购买上限' in line:
                # Drop spaces and the label itself, leaving only the value.
                value = line.replace(' ', '').replace('单日累计购买上限', '')
                print(f"{filename} 单日累计购买上限 ", value)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值