类似于聊天工具中的截图功能,只不过是对选定图片中的区域进行截取。这个功能可用于对结构复杂的书籍目录做文字识别:程序会按照选框绘制的先后顺序生成子图。
1.成果展示:
我们只是希望文字识别目录提取到我们想要的部分,比如这个“第4篇 流行框架与XML技术”对于我们用处不大,我们只需要按照顺序拉选框框住目标区域即可。
选完框后按 ‘s’ 键保存所有选框区域并退出;按 ‘r’ 键撤销最近一次画的选框(不是全部)。
保存的图片按照顺序存储在对应的文件夹里。
这个程序只实现了选框并保存子图的功能,没有加入文字识别。文字识别部分是另外使用百度飞桨配置的。
import os
import time

import cv2
# from PIL import ImageGrab
import numpy as np
# Mouse callback
def draw_rectangle(event, x, y, flags, param):
    """Record a rectangle dragged with the left mouse button.

    Intended to be registered with ``cv2.setMouseCallback``. When the left
    button is released, the dragged area is normalised to a
    (top-left, bottom-right) pair, appended to the module-level
    ``sum_pt_list`` and outlined in black on the module-level ``img``.

    Args:
        event: OpenCV mouse event code (``cv2.EVENT_*``).
        x: Horizontal pointer position reported by OpenCV.
        y: Vertical pointer position reported by OpenCV.
        flags: Unused; required by the callback signature.
        param: Unused; required by the callback signature.
    """
    global start_point, end_point, drawing, top_left_pt, bottom_right_pt, sum_pt_list

    # The pointer may be reported outside the image while dragging. Clamp it,
    # because a negative coordinate would wrap around when used as a slice
    # bound on the image array.
    height, width = img.shape[:2]
    x = min(max(x, 0), width)
    y = min(max(y, 0), height)

    if event == cv2.EVENT_LBUTTONDOWN:
        drawing = True
        start_point = (x, y)
    elif event == cv2.EVENT_MOUSEMOVE:
        if drawing:
            end_point = (x, y)
    elif event == cv2.EVENT_LBUTTONUP:
        if not drawing:
            # Release without a matching press: start_point would be stale.
            return
        drawing = False
        end_point = (x, y)

        left, right = sorted((start_point[0], end_point[0]))
        top, bottom = sorted((start_point[1], end_point[1]))
        if left == right or top == bottom:
            # A plain click (no drag) has zero area and would yield an empty crop.
            return

        top_left_pt = (left, top)
        bottom_right_pt = (right, bottom)
        sum_pt_list.append([top_left_pt, bottom_right_pt])
        cv2.rectangle(img, top_left_pt, bottom_right_pt, (0, 0, 0), thickness=3)
# Load the image. np.fromfile + cv2.imdecode is used instead of cv2.imread,
# presumably because the path contains non-ASCII characters.
image_path = 'D:/jupyter/百度飞浆的OCR识别/PDF/JavaScript3.png'
img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)
if img is None:
    raise SystemExit('Could not decode image: {}'.format(image_path))

# Untouched copy of the page. Selection outlines are drawn on `img` for
# display only; crops are always cut from this copy so they contain no outlines.
pristine_img = img.copy()

# Initialise the globals shared with the mouse callback
start_point = (0, 0)
end_point = (0, 0)
drawing = False
top_left_pt = (0, 0)
bottom_right_pt = (0, 0)
sum_pt_list = []  # selections in the order they were drawn

# Create the window and bind the mouse callback
cv2.namedWindow("image", cv2.WINDOW_NORMAL)
cv2.setMouseCallback('image', draw_rectangle)

output_img = []  # crops that were written to disk, in selection order

try:
    while True:
        cv2.imshow('image', img)
        key = cv2.waitKey(1) & 0xFF

        # 'r': undo the most recent selection (not all of them)
        if key in (ord('r'), ord('R')):
            if sum_pt_list:
                sum_pt_list.pop()
                # Rebuild the display from the clean copy instead of painting
                # over the outline, so page content and overlapping outlines
                # of the remaining selections stay intact.
                img[:] = pristine_img
                for kept_top_left, kept_bottom_right in sum_pt_list:
                    cv2.rectangle(img, kept_top_left, kept_bottom_right,
                                  (0, 0, 0), thickness=3)
            top_left_pt, bottom_right_pt = (-1, -1), (-1, -1)

        # 's': save every selection as its own image, then exit
        elif key in (ord('s'), ord('S')):
            if sum_pt_list:
                base_stamp = int(time.time())
                for i, ((x0, y0), (x1, y1)) in enumerate(sum_pt_list):
                    # Negative bounds would wrap around in a NumPy slice.
                    sub_img = pristine_img[max(y0, 0):max(y1, 0),
                                           max(x0, 0):max(x1, 0)]
                    if sub_img.size == 0:
                        continue  # nothing to encode for a zero-area selection
                    output_path = 'D:/jupyter/百度飞浆的OCR识别/PDF/output_{}.jpg'.format(base_stamp + i)
                    # Encode in memory and let NumPy write the file, so the
                    # non-ASCII output path is handled by Python rather than
                    # by cv2.imwrite.
                    encoded_ok, jpg_bytes = cv2.imencode('.jpg', sub_img)
                    if not encoded_ok:
                        print('Failed to encode selection {}'.format(i))
                        continue
                    jpg_bytes.tofile(output_path)
                    output_img.append(sub_img)
                break

        # Esc or 'q': leave without saving
        elif key in (27, ord('q')):
            break
finally:
    # Close the window even if saving raised.
    cv2.destroyAllWindows()