使用PaddleOCR批量提取图片里面的文字
import csv
import os

from paddleocr import PaddleOCR
from tqdm import tqdm
# Input image folder and the files that receive the results.
image_folder = 'image/other'
output_file = 'output.csv'
failed_file = 'failed_images.txt'
# File extensions (compared case-insensitively) that are treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def list_images(folder):
    """Return the names of the files in *folder* that have an image extension.

    The names are returned in the order ``os.listdir`` yields them.
    """
    return [
        name for name in os.listdir(folder)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    ]


def extract_text(result):
    """Join every recognized string of an OCR result into one line of text.

    *result* is expected to be a list of pages, each page a list of
    ``[box, (text, confidence)]`` detections. ``None`` or empty pages (how
    "no text detected" is reported) are skipped, so an image without text
    yields ``''`` instead of raising.

    Runs of whitespace are collapsed to single spaces and the ends stripped.
    """
    pieces = []
    for page in result or []:
        if not page:
            continue
        for detection in page:
            pieces.append(detection[1][0])
    return ' '.join(' '.join(pieces).split())


def collect_texts(ocr, folder, image_names):
    """Run *ocr* over every image and sort the outcomes into rows/failures.

    Args:
        ocr: object exposing ``ocr(path, cls=True)``.
        folder: directory that contains the images.
        image_names: iterable of file names inside *folder*.

    Returns:
        ``(rows, failed)`` where *rows* is a list of ``[name, text]`` pairs
        and *failed* lists the names for which OCR raised or found no text.
    """
    rows = []
    failed = []
    for image_name in image_names:
        image_path = os.path.join(folder, image_name)
        try:
            # The OCR call is inside the try so that one unreadable image
            # cannot abort the whole batch.
            extracted_text = extract_text(ocr.ocr(image_path, cls=True))
        except Exception as e:  # per-image boundary: report and keep going
            print(f"Error processing {image_name}: {e}")
            failed.append(image_name)
            continue
        if not extracted_text:
            print(f"No text found in {image_name}")
            failed.append(image_name)
            continue
        rows.append([image_name, extracted_text])
    return rows, failed


def write_csv(rows, path):
    """Write the header plus *rows* to *path* as a UTF-8 (with BOM) CSV file.

    Only the parent directory of *path* is created; *path* itself must stay
    a regular file.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, mode='w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(['Original Filename', 'Extracted Text'])  # header row
        writer.writerows(rows)


def report_failures(failed, path):
    """Print the failed image names and save them to *path*, one per line.

    When *failed* is empty, only a success message is printed and no file
    is written.
    """
    if not failed:
        print("所有图片均成功提取到文字。")
        return
    print("以下图片未能提取到文字:")
    with open(path, mode='w', encoding='utf-8') as failed_log:
        for image in failed:
            print(image)
            failed_log.write(f"{image}\n")


def main():
    """OCR every image in ``image_folder`` and save the text to ``output_file``."""
    # use_gpu: run inference on the GPU; use_angle_cls: enable the text
    # direction classifier; show_log: whether PaddleOCR prints its own logs.
    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=True, show_log=False)
    image_list = list_images(image_folder)
    progress = tqdm(image_list, desc="Processing Images", unit="image")
    all_texts, failed_images = collect_texts(ocr, image_folder, progress)
    write_csv(all_texts, output_file)
    print(f"所有图片的文字已提取并保存到CSV文件:{output_file}")
    report_failures(failed_images, failed_file)


if __name__ == "__main__":
    main()
安装paddleocr 可能会遇到的问题:
- libGL.so.1: cannot open shared object file: No such file or directory
解决方法: pip install opencv-python-headless 参考:【解决方法】libGL.so.1: cannot open shared object file: No such file or directory
如果安装之后仍然报同样的错误,则运行下面的命令:
sudo apt update
sudo apt install libgl1-mesa-glx
参考: Python ImportError libGL.so.1 cannot open shared object file No such file or directory 解决方案
参考博客:【Python】paddleocr快速使用及参数详解
【解决方法】libGL.so.1: cannot open shared object file: No such file or directory
Python ImportError libGL.so.1 cannot open shared object file No such file or directory 解决方案