对图片加入PAD
在训练文字识别模型时,真实场景下训练集图片的长宽是可变的,字数也不固定,从 1 到 n(一般为 25 左右)不等。但图片在送入模型训练之前,会被统一 resize 到固定尺寸(例如 100×32);如果直接把不同宽度的图片 resize 到同一尺寸,文字就会被拉伸或压缩而变形。此时可以先按高度等比例缩放图片,对缩放后宽度小于 100 的图片进行 pad,再送入模型,这样就能解决上述问题。实现代码如下:
"coding = utf-8"
import os
import sys
import torch
import cv2
from PIL import Image
import numpy as np
import math
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
class ResizeNormalize(object):
    """Resize a PIL image to a fixed size and return it as a normalized tensor.

    The image is resized to ``size`` (passed straight to ``PIL.Image.resize``),
    converted with ``torchvision.transforms.ToTensor`` and then shifted and
    scaled in place with ``(x - 0.5) / 0.5``.
    """

    def __init__(self, size, interpolation=Image.BICUBIC):
        """Store the target size, the resampling filter and the tensor converter.

        Args:
            size: Target size forwarded to ``img.resize``.
            interpolation: PIL resampling filter used when resizing.
        """
        self.toTensor = transforms.ToTensor()
        self.interpolation = interpolation
        self.size = size

    def __call__(self, img):
        """Resize ``img`` and return the normalized tensor."""
        resized = img.resize(self.size, self.interpolation)
        # sub_/div_ work in place and return the same tensor, so the chain
        # can be returned directly.
        return self.toTensor(resized).sub_(0.5).div_(0.5)
# Images narrower than the target width are padded on the right by repeating
# their right-most pixel column.
class NormalizePAD(object):
    """Normalize an image and right-pad it to a fixed ``(C, H, W)`` tensor.

    The input is converted with ``torchvision.transforms.ToTensor``, shifted and
    scaled in place with ``(x - 0.5) / 0.5``, copied into the left part of a
    zero-initialized canvas of shape ``max_size`` and, when it is narrower than
    the canvas, the remaining columns are filled with copies of the image's
    last column.

    The converted image must not be wider than ``max_size[2]`` and its height
    must match ``max_size[1]``; otherwise the slice assignment fails.
    """

    def __init__(self, max_size, PAD_type='right'):
        """Store the canvas shape and build the tensor converter.

        Args:
            max_size: Shape of the output tensor as ``(channels, height, width)``.
            PAD_type: Stored on the instance; ``__call__`` always pads on the
                right regardless of this value.
        """
        self.toTensor = transforms.ToTensor()
        self.max_size = max_size
        # Kept for backward compatibility; not read by __call__.
        self.max_width_half = math.floor(max_size[2] / 2)
        self.PAD_type = PAD_type

    def __call__(self, img):
        """Return the normalized image padded to ``self.max_size``.

        Args:
            img: Input accepted by ``self.toTensor`` (typically a PIL image).

        Returns:
            A float32 tensor of shape ``self.max_size``.
        """
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        c, h, w = img.size()
        max_w = self.max_size[2]

        Pad_img = torch.zeros(*self.max_size, dtype=torch.float32)
        Pad_img[:, :, :w] = img  # image occupies the left part of the canvas
        if max_w != w:
            # Border padding: replicate the last image column across the
            # remaining width instead of leaving it at zero.
            last_column = img[:, :, w - 1].unsqueeze(2)
            Pad_img[:, :, w:] = last_column.expand(c, h, max_w - w)
        return Pad_img
# Target height and width (in pixels) of every image produced by this script.
imgH = 32
imgW = 100
input_path = "/home/zhou/PAD_img/input_images"
save_path = "/home/zhou/PAD_img/pad_images"
save_path_nopad = "/home/zhou/PAD_img/no_pad_images"
keep_ratio_with_pad = True

resized_max_w = imgW
if keep_ratio_with_pad:  # same concept with 'Rosetta' paper
    # Keep the aspect ratio and pad the remaining width on the right.
    transform = NormalizePAD((3, imgH, resized_max_w))
    output_dir = save_path
else:
    # Stretch every image to imgW x imgH without padding.
    transform = ResizeNormalize((imgW, imgH))
    output_dir = save_path_nopad

# cv2.imwrite cannot create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

filelist = os.listdir(input_path)
for item in filelist:
    img_path = os.path.join(input_path, item)
    try:
        with Image.open(img_path) as opened:
            # The padding canvas has 3 channels, so force RGB; this also
            # handles grayscale, palette and RGBA inputs.
            image = opened.convert('RGB')
    except OSError as err:
        # Sub-directories and files PIL cannot decode are skipped.
        print('skip', img_path, err)
        continue

    if keep_ratio_with_pad:
        w, h = image.size
        ratio = w / float(h)
        # Scale to height imgH keeping the aspect ratio; cap the width at
        # resized_max_w when the image is wider than imgW / imgH.
        resized_w = min(math.ceil(imgH * ratio), resized_max_w)
        image = image.resize((resized_w, imgH), Image.BICUBIC)

    tensor = transform(image)

    # Both transforms return values normalized with (x - 0.5) / 0.5. Undo that
    # and rescale to [0, 255] before casting, otherwise the uint8 cast produces
    # a meaningless image.
    pixels = tensor.mul(0.5).add(0.5).clamp(0.0, 1.0).mul(255.0).round()
    rgb = pixels.permute(1, 2, 0).numpy().astype('uint8')
    # PIL delivers RGB while cv2.imwrite expects BGR channel order.
    bgr = np.ascontiguousarray(rgb[:, :, ::-1])

    img_name = os.path.join(output_dir, item)
    if cv2.imwrite(img_name, bgr):
        print(item, image.size, '->', (bgr.shape[1], bgr.shape[0]))
    else:
        print('failed to write', img_name)
效果展示
输入图片的尺寸分别为 118×19 和 206×54
|
pad 之后的图片如下。注:在 pad 的过程中需要对图片做一些预处理(减均值、归一化等),如果保存前没有做反归一化,保存下来的图片就会失去原来的基本面貌:
因此,pad 之后图片的尺寸(宽×高)都变成了 100×32。