前言
最近在爬取完视频数据后,需要提取视频中的文本内容,搜寻一番没有找到满意的方案。最后自己总结出利用PaddleOCR提取视频文字的方案,简洁易懂、可实操。
1.PaddleOCR环境搭建
可参考其它环境搭建文档,这里推荐官方文档
个人心得:环境搭建过程坑较多,在anaconda虚拟环境下搭建更易成功。
2.提取视频帧,保存为图片集
根据视频情况调整截取视频帧的频率,这里是一秒一张图,保存图片到pics文件夹
传入视频所在文件夹的路径
def extract_frames_from_folder(folder_path, video_name="video.mp4"):
    """Save one frame per second of a video as numbered JPEG images.

    The video is read from ``folder_path/video_name`` and the sampled frames
    are written to ``folder_path/pics`` as ``0.jpg``, ``1.jpg``, ... where the
    number is the second at which the frame was taken.

    Args:
        folder_path: Folder that contains the video file.
        video_name: File name of the video inside ``folder_path``; change it
            to match your own video.

    Returns:
        None. Problems (video cannot be opened, unusable frame rate, image
        cannot be written) are reported with ``print``.
    """
    video_path = os.path.join(folder_path, video_name)
    output_folder = os.path.join(folder_path, "pics")
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Cannot open video: {video_path}")
            return

        fps = cap.get(cv2.CAP_PROP_FPS)
        # With fps == 0 the sampling test below is true for every frame, and
        # with NaN it is never true, so refuse frame rates that are not usable.
        if not 0 < fps < float("inf"):
            print(f"Cannot determine frame rate of {video_path} (got {fps})")
            return

        frame_count = 0
        second_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Keep the first frame at or after each whole second.
            if frame_count >= fps * second_count:
                output_path = os.path.join(output_folder, f"{second_count}.jpg")
                if cv2.imwrite(output_path, frame):
                    print(f"Saved frame {frame_count} at second {second_count}")
                else:
                    print(f"Failed to write {output_path}")
                second_count += 1
            frame_count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()
3.识别图片文字
依次识别每张图片的文字,内容相同的识别结果只保留一次,写入text_ocr.txt
def recognize_and_save_text(folder_path):
    """Run OCR over the extracted frames and append new text to a file.

    Images named ``<number>.jpg`` / ``<number>.png`` in ``folder_path/pics``
    are processed in numeric order. The text recognized in each image is
    joined into one line and appended to ``folder_path/text_ocr.txt`` unless
    exactly the same line was already written during this run.

    Args:
        folder_path: Folder that contains the ``pics`` sub-folder.

    Returns:
        None. Per-image failures are printed and the image is skipped.
    """
    pics_path = os.path.join(folder_path, "pics")
    text_file_path = os.path.join(folder_path, "text_ocr.txt")

    # Collect (frame number, file name) pairs; a file whose stem is not an
    # integer is skipped instead of aborting the whole run inside int().
    numbered_files = []
    for filename in os.listdir(pics_path):
        if not filename.endswith((".jpg", ".png")):
            continue
        try:
            frame_number = int(os.path.splitext(filename)[0])
        except ValueError:
            print(f"Skipping {filename}: file name is not a frame number")
            continue
        numbered_files.append((frame_number, filename))
    numbered_files.sort(key=lambda item: item[0])

    # Opening in append mode creates the file and keeps earlier contents.
    with open(text_file_path, 'a', encoding='utf-8') as f:
        ocr = PaddleOCR()
        # Lines already written during this run, used to drop repeats from
        # consecutive frames that show the same text.
        recognized_text = set()

        for _, filename in numbered_files:
            image_path = os.path.join(pics_path, filename)
            try:
                result = ocr.ocr(image_path, cls=True)
                # An image without text can come back as None (or as an
                # empty list), so skip falsy entries before iterating them.
                text = ''.join(
                    word[1][0]
                    for line in (result or [])
                    if line
                    for word in line
                )
                if not text:
                    print(f"No text detected in {image_path}")
                elif text not in recognized_text:
                    recognized_text.add(text)
                    f.write(text + '\n')
            except Exception as e:
                # Best effort: one unreadable image must not stop the batch.
                print(f"Error processing {image_path}: {e}")

    print("文字识别并保存完成。")
4.完整代码
import shutil
import cv2
import os
from paddleocr import PaddleOCR
def extract_frames_from_folder(folder_path, video_name="video.mp4"):
    """Save one frame per second of a video as numbered JPEG images.

    The video is read from ``folder_path/video_name`` and the sampled frames
    are written to ``folder_path/pics`` as ``0.jpg``, ``1.jpg``, ... where the
    number is the second at which the frame was taken.

    Args:
        folder_path: Folder that contains the video file.
        video_name: File name of the video inside ``folder_path``; change it
            to match your own video.

    Returns:
        None. Problems (video cannot be opened, unusable frame rate, image
        cannot be written) are reported with ``print``.
    """
    video_path = os.path.join(folder_path, video_name)
    output_folder = os.path.join(folder_path, "pics")
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Cannot open video: {video_path}")
            return

        fps = cap.get(cv2.CAP_PROP_FPS)
        # With fps == 0 the sampling test below is true for every frame, and
        # with NaN it is never true, so refuse frame rates that are not usable.
        if not 0 < fps < float("inf"):
            print(f"Cannot determine frame rate of {video_path} (got {fps})")
            return

        frame_count = 0
        second_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Keep the first frame at or after each whole second.
            if frame_count >= fps * second_count:
                output_path = os.path.join(output_folder, f"{second_count}.jpg")
                if cv2.imwrite(output_path, frame):
                    print(f"Saved frame {frame_count} at second {second_count}")
                else:
                    print(f"Failed to write {output_path}")
                second_count += 1
            frame_count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()
def recognize_and_save_text(folder_path):
    """Run OCR over the extracted frames and append new text to a file.

    Images named ``<number>.jpg`` / ``<number>.png`` in ``folder_path/pics``
    are processed in numeric order. The text recognized in each image is
    joined into one line and appended to ``folder_path/text_ocr.txt`` unless
    exactly the same line was already written during this run.

    Args:
        folder_path: Folder that contains the ``pics`` sub-folder.

    Returns:
        None. Per-image failures are printed and the image is skipped.
    """
    pics_path = os.path.join(folder_path, "pics")
    text_file_path = os.path.join(folder_path, "text_ocr.txt")

    # Collect (frame number, file name) pairs; a file whose stem is not an
    # integer is skipped instead of aborting the whole run inside int().
    numbered_files = []
    for filename in os.listdir(pics_path):
        if not filename.endswith((".jpg", ".png")):
            continue
        try:
            frame_number = int(os.path.splitext(filename)[0])
        except ValueError:
            print(f"Skipping {filename}: file name is not a frame number")
            continue
        numbered_files.append((frame_number, filename))
    numbered_files.sort(key=lambda item: item[0])

    # Opening in append mode creates the file and keeps earlier contents.
    with open(text_file_path, 'a', encoding='utf-8') as f:
        ocr = PaddleOCR()
        # Lines already written during this run, used to drop repeats from
        # consecutive frames that show the same text.
        recognized_text = set()

        for _, filename in numbered_files:
            image_path = os.path.join(pics_path, filename)
            try:
                result = ocr.ocr(image_path, cls=True)
                # An image without text can come back as None (or as an
                # empty list), so skip falsy entries before iterating them.
                text = ''.join(
                    word[1][0]
                    for line in (result or [])
                    if line
                    for word in line
                )
                if not text:
                    print(f"No text detected in {image_path}")
                elif text not in recognized_text:
                    recognized_text.add(text)
                    f.write(text + '\n')
            except Exception as e:
                # Best effort: one unreadable image must not stop the batch.
                print(f"Error processing {image_path}: {e}")

    print("文字识别并保存完成。")
def check_pics_folder(folder_path):
    """Report whether ``folder_path`` contains a sub-folder named ``pics``.

    Args:
        folder_path (str): Path of the folder to inspect.

    Returns:
        bool: True if ``folder_path`` is a directory that has a ``pics``
        sub-directory, otherwise False. A message is printed when
        ``folder_path`` itself is not an existing directory.
    """
    # isdir (not exists) so that a path pointing at a regular file is
    # rejected here instead of failing later while listing sub-folders.
    if not os.path.isdir(folder_path):
        print("指定的文件夹路径不存在。")
        return False
    return os.path.isdir(os.path.join(folder_path, "pics"))
# Folder that holds the video; the frame images and text_ocr.txt are written inside it.
main_folder = "data"
# Run the pipeline in order: dump the frames first, then OCR them.
for _step in (extract_frames_from_folder, recognize_and_save_text):
    _step(main_folder)
# Optional cleanup once the text has been extracted: remove the frame images.
# shutil.rmtree(os.path.join(main_folder, 'pics'))