paddleocr 和 pytesseract 使用记录

tnan2522

于 2025-02-14 15:50:40 发布

阅读量266

点赞数 3

分类专栏： python 基础高级其他文章标签： python

本文链接：https://blog.csdn.net/qq_42031243/article/details/145635242

版权

python 基础高级同时被 2 个专栏收录

26 篇文章

订阅专栏

其他

16 篇文章

订阅专栏

paddleocr 和 pytesseract 都是常见的 OCR 模块。不同的是，paddleocr 是百度开源的 OCR 识别库，而 pytesseract 是基于 Tesseract OCR 引擎封装的 Python OCR 库。在基础使用过程中，发现 paddleocr 比 pytesseract 要慢，而且内存占用更大：识别过程中内存占用会持续增长（缓存），直到达到某个大小后，后续识别的图片才不再额外占用内存。

paddleocr

import concurrent.futures
import re, os

from paddleocr import PaddleOCR
import logging, time
import asyncio

# Timestamp taken at import time (not read anywhere in the visible code).
start_time = time.time()

# Raise the 'ppocr' logger to WARNING so lower-severity records are dropped.
paddleocr_logger = logging.getLogger('ppocr')
paddleocr_logger.setLevel(logging.WARNING)
# Initialize the OCR engine once at module level; ocr_tt_quota reuses it.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')  # use the Chinese model


def ocr_tt_quota(img_path, filename) -> tuple:
    """Run OCR on one image and extract the '单日累计购买上限' entry.

    Detected text boxes are grouped into visual rows: a box starts a new row
    when its y coordinate lies more than ``y_threshold`` pixels below the y
    coordinate at which the current row started. The rows are then scanned
    in order for the label.

    Args:
        img_path: Path of the image handed to the module-level ``ocr`` engine.
        filename: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(img_path, text_0, text_1)``. For a row with at least two
        boxes where either of the first two texts contains the label, the two
        texts are returned (swapped when the first one is not exactly the
        label). For a single-box row that is exactly the label, ``text_1`` is
        ``'不限额'``. When no row matches, both texts are ``None``.
    """
    label_text = '单日累计购买上限'
    y_threshold = 10  # max y difference (pixels) for boxes to share a row

    result = ocr.ocr(img_path, cls=True)

    rows, current_row = [], []
    last_y = None
    # NOTE(review): PaddleOCR appears to report a page without any detected
    # text as None, so empty entries are skipped instead of iterated.
    for line in result or []:
        if not line:
            continue
        for bbox, text in line:
            # y of the box's second corner point (presumably the top-right
            # corner — confirm against the PaddleOCR result format).
            y1 = bbox[1][1]
            if last_y is None or y1 - last_y > y_threshold:
                # First box, or far enough below the current row: new row.
                if current_row:
                    rows.append(current_row)
                current_row = [(bbox, text)]
                last_y = y1
            else:
                # Otherwise the box belongs to the current row.
                current_row.append((bbox, text))
    if current_row:
        rows.append(current_row)
    # Drop the raw OCR output before the search; only ``rows`` is needed now.
    del result

    for row in rows:
        # Each entry is (bbox, text); text[0] is the recognized string.
        first = row[0][1][0]
        if len(row) > 1:
            second = row[1][1][0]
            if label_text in first or label_text in second:
                # NOTE(review): detection is a substring test but the swap is
                # decided by exact equality — confirm this is intended.
                if first != label_text:
                    first, second = second, first
                return img_path, first, second
        elif first == label_text:
            # The label stands alone on its row: no value was printed.
            return img_path, first, '不限额'

    # No matching row: report None instead of leftovers from unrelated rows.
    return img_path, None, None


async def main():
    """Run ``ocr_tt_quota`` on every ``.png`` file under ``./img`` and print the results.

    The OCR calls are blocking, so they are submitted to a thread pool and
    awaited together. Results are printed in submission order.
    """
    loop = asyncio.get_running_loop()
    task_list = []
    # One worker: the calls run one at a time on a single background thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        for root, _dirs, files in os.walk('./img'):
            for filename in files:
                # The original code only ever targeted '<name>.png' files.
                if not filename.lower().endswith('.png'):
                    continue
                img_path = os.path.join(root, filename)
                task_list.append(
                    loop.run_in_executor(executor, ocr_tt_quota, img_path, filename)
                )

        data_list = await asyncio.gather(*task_list)

    for data in data_list:
        print(data)
# Start the async entry point only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

pytesseract

import os

import cv2
import numpy as np
import pytesseract

# Tell pytesseract where the Tesseract executable lives (a Windows path here;
# adjust it for the local installation).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def preprocess_image(image_path, crop=(800, 2388, 0, 1080)):
    """
    Preprocess an image for OCR: grayscale, adaptive threshold, crop, denoise.

    :param image_path: path of the image file
    :param crop: ``(top, bottom, left, right)`` pixel bounds kept after
        thresholding, or None to keep the whole image; the default is the
        region that was previously hard-coded
    :return: the preprocessed single-channel image array
    :raises ValueError: if the file content cannot be decoded as an image
    """
    # Read the raw bytes with numpy and decode them with OpenCV
    # (presumably to support paths cv2.imread cannot open — confirm).
    image = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), -1)
    if image is None:
        # imdecode signals failure with None; fail here with a clear message.
        raise ValueError(f'cannot decode image: {image_path}')

    # Flag -1 keeps the file's own channel layout, so an already-grayscale
    # image arrives as a 2-D array and must not go through BGR2GRAY.
    if image.ndim == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    if crop is not None:
        top, bottom, left, right = crop
        image = image[top:bottom, left:right]
    # Denoise
    image = cv2.medianBlur(image, 3)

    return image


def extract_text(image_path, lang='chi_sim'):
    """
    Recognize the text contained in an image file.

    :param image_path: path of the image file
    :param lang: Tesseract language model(s) (chi_sim = Simplified Chinese, eng = English)
    :return: the text returned by ``pytesseract.image_to_string``
    """
    # Preprocess first, then hand the cleaned-up image straight to Tesseract.
    return pytesseract.image_to_string(preprocess_image(image_path), lang=lang)


# Walk ./img, OCR every file, and print the text that follows the
# '单日累计购买上限' label on any recognized line.
for root, dirs, files in os.walk('./img'):
    for filename in files:
        # Build the path from ``root`` so files in subdirectories resolve too.
        # Recognize Chinese together with digits / Latin characters.
        text = extract_text(os.path.join(root, filename), lang='chi_sim+eng')
        text_list = text.split('\n')
        for j in text_list:
            if '单日累计购买上限' in j:
                # Strip the spaces in the OCR output, then remove the label
                # itself so only the value remains.
                j = j.replace(' ', '')
                j = j.replace('单日累计购买上限', '')
                print(f"{filename} 单日累计购买上限 ", j)