【目标检测】Faster RCNN代码实现 ——（1）

最新推荐文章于 2024-03-25 15:07:07 发布

不断进步的咸鱼

最新推荐文章于 2024-03-25 15:07:07 发布

阅读量1.5k

点赞数 3

分类专栏：目标检测文章标签：深度学习计算机视觉 python

本文链接：https://blog.csdn.net/qq_36926037/article/details/108508551

版权

目标检测专栏收录该内容

32 篇文章 19 订阅

订阅专栏

文章目录

一、项目总览
二、data文件夹

一、项目总览

在这里插入图片描述

二、data文件夹

在这里插入图片描述

2.1 dataset.py

在这里插入图片描述
（1）inverse_normalize()：将图片数组的值（范围为-1~1，有固定的均值和标准差）反规范化，还原到原始图像的像素值。
（2）pytorch_normalze()、caffe_normalize：将图片数组的值规范化，使得图像数组的值均值为mean，标准差为std。
（3）preprocess()：调整图片的尺寸、图片值的取值范围（调用pytorch_normalze或caffe_normalize）。最终返回调整后的图像数组。
（4）Transform类：输入为原始的ori_img, bbox, label，返回调整后的img(调用preprocess函数)、调整后的bbox（调用util.resize_bbox函数，返回适应于调整后图像的真值框坐标：（y,x,y,x）格式）、label(边界框的类别标签)、scale(图片或边界框的调整尺度)。
（5） Dataset类：训练数据集类。读取原始图像的ori_img, bbox, label（通过 VOCBboxDataset类），返回经过调整后的img, bbox, label, scale（通过调用Transform类）
（6） TestDataset类：评估数据集类。读取原始图像的ori_img, bbox, label, difficult（通过 VOCBboxDataset类），返回经过调整后的img（通过调用preprocess函数，不经过Transform类）、原始ori_img的尺寸（H, W）、原始的bbox、label、difficult

from __future__ import  absolute_import
from __future__ import  division
import torch as t
from data.voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from data import util
import numpy as np
from utils.config import opt
'''本py文件：用于数据集的加载（包含图片和标注框的处理）'''

'''inverse_normalize()函数：输入图像img的值为规范化后的值（取值范围为-1~1，用均值方差标准化），本函数用来去除这种规范化，得到原始图像的像素值'''
def inverse_normalize(img):
    """Approximately undo the input normalization so the image can be shown.

    Args:
        img (~numpy.ndarray): A normalized image array. The Caffe branch
            adds a ``(3, 1, 1)`` mean, so a CHW layout is expected.

    Returns:
        ~numpy.ndarray: When ``opt.caffe_pretrain`` is set, the image with
        the per-channel Caffe mean added back and its first axis reversed
        (undoing the channel swap done in ``caffe_normalize``). Otherwise
        ``img * 0.225 + 0.45`` clipped to ``[0, 1]`` and scaled by 255.
    """
    if not opt.caffe_pretrain:
        # Approximate un-normalize for visualization: one mean/std pair is
        # used for every channel instead of the exact per-channel values.
        restored = img * 0.225 + 0.45
        return restored.clip(min=0, max=1) * 255

    channel_mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    return (img + channel_mean)[::-1, :, :]

'''pytorch_normalze（）函数：将图片数组的数值(0-1范围)进行归一化，返回img数组值的范围为-1~1 '''
def pytorch_normalze(img):
    """Normalize an image with torchvision's ``Normalize`` transform.

    Args:
        img (~numpy.ndarray): Image array; it is converted to a tensor with
            ``torch.from_numpy`` before normalization.

    Returns:
        ~numpy.ndarray: The normalized image (approximately -1~1, RGB),
        converted back to a numpy array.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    tensor = t.from_numpy(img)
    return tvtsf.Normalize(mean=channel_mean, std=channel_std)(tensor).numpy()
def caffe_normalize(img):
    """Convert a 0-1 image to Caffe-style input (approx. -125~125).

    The first axis is reordered with ``[2, 1, 0]`` (RGB -> BGR), values are
    multiplied by 255 and a fixed per-channel mean is subtracted.

    Args:
        img (~numpy.ndarray): Image array with three channels on axis 0.

    Returns:
        ~numpy.ndarray: A new ``float32`` array of the same shape.
    """
    bgr_mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    bgr = img[[2, 1, 0], :, :] * 255  # swap channel order, then rescale
    return (bgr - bgr_mean).astype(np.float32, copy=True)

'''preprocess()函数：调整图像的大小、值的范围。返回调整大小后的img数组，归一化值的范围为-1~1'''
def preprocess(img, min_size=600, max_size=1000):
    """Rescale an image and normalize its values for the network.

    The scale factor is the smaller of ``min_size / min(H, W)`` and
    ``max_size / max(H, W)``, so after resizing the shorter side is at most
    ``min_size`` and the longer side is at most ``max_size``.

    Args:
        img (~numpy.ndarray): Image in CHW layout. It is documented as RGB
            with values in ``[0, 255]``; the values are divided by 255
            before resizing.
        min_size (int): Target upper bound for the shorter side.
        max_size (int): Target upper bound for the longer side.

    Returns:
        ~numpy.ndarray: The resized image after ``caffe_normalize`` (when
        ``opt.caffe_pretrain`` is set) or ``pytorch_normalze`` (otherwise).
    """
    channels, height, width = img.shape
    scale = min(min_size / min(height, width), max_size / max(height, width))

    # Resize the 0-1 scaled image; the channel axis keeps its length.
    resized = sktsf.resize(
        img / 255.,
        (channels, height * scale, width * scale),
        mode='reflect',
        anti_aliasing=False,
    )

    # Pick the normalization that matches the pretrained backbone in use.
    normalize = caffe_normalize if opt.caffe_pretrain else pytorch_normalze
    return normalize(resized)

'''Transform类：调整原始图像的ori_img, bbox, label，得到变换（调整大小、水平翻转）后的img（值范围为-1~1）, bbox, label，scale'''
class Transform(object):
    """Callable that prepares one training sample.

    It resizes/normalizes the image with ``preprocess``, rescales the
    bounding boxes to the new image size and applies a random horizontal
    flip to both the image and the boxes.
    """

    def __init__(self, min_size=600, max_size=1000):
        """Store the size limits that are forwarded to ``preprocess``."""
        self.min_size, self.max_size = min_size, max_size

    def __call__(self, in_data):
        """Transform one ``(img, bbox, label)`` sample.

        Args:
            in_data (tuple): ``(img, bbox, label)`` of the original image;
                ``img`` must be a 3-dimensional array whose last two axes
                are height and width.

        Returns:
            tuple: ``(img, bbox, label, scale)`` where ``scale`` is the
            ratio of the resized height to the original height and
            ``label`` is passed through unchanged.
        """
        img, bbox, label = in_data
        _, in_h, in_w = img.shape

        # Resize and normalize the image.
        img = preprocess(img, self.min_size, self.max_size)
        _, out_h, out_w = img.shape
        scale = out_h / in_h

        # Rescale the boxes with util.resize_bbox so they match the
        # resized image.
        bbox = util.resize_bbox(bbox, (in_h, in_w), (out_h, out_w))

        # Randomly flip the image horizontally and mirror the boxes with
        # the same decision.
        img, flip = util.random_flip(img, x_random=True, return_param=True)
        bbox = util.flip_bbox(bbox, (out_h, out_w), x_flip=flip['x_flip'])
        return img, bbox, label, scale

'''Dataset类：训练集对象，返回经过处理后（填充。调整大小等）的img, bbox, label, scale'''
class Dataset:
    """Training dataset yielding ``(img, bbox, label, scale)`` samples."""

    def __init__(self, opt):
        """Build the underlying VOC dataset and the sample transform.

        Args:
            opt: Configuration object; ``voc_data_dir``, ``min_size`` and
                ``max_size`` are read from it.
        """
        self.opt = opt
        # Raw samples: (img, bbox, label, difficult) of the original images.
        self.db = VOCBboxDataset(opt.voc_data_dir)
        # Resize / normalize / random-flip transform.
        self.tsf = Transform(opt.min_size, opt.max_size)

    def __getitem__(self, idx):
        """Return the transformed sample at ``idx``.

        Returns:
            tuple: ``(img, bbox, label, scale)``; the ``difficult`` flags of
            the raw sample are dropped.
        """
        raw_img, raw_bbox, raw_label, _ = self.db.get_example(idx)
        img, bbox, label, scale = self.tsf((raw_img, raw_bbox, raw_label))
        # Copies are returned because some of the strides of a given numpy
        # array may be negative (e.g. after a flip).
        return img.copy(), bbox.copy(), label.copy(), scale

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.db)

'''TestDataset类：评估数据集对象，返回处理后的图像、原始图像的高和宽、原始边界框、标签、difficult'''
class TestDataset:
    """Evaluation dataset.

    Each item is the preprocessed image together with the original image
    size and the untouched annotations of the original image.
    """

    def __init__(self, opt, split='test', use_difficult=True):
        """Build the underlying VOC dataset.

        Args:
            opt: Configuration object; ``voc_data_dir`` is read from it.
            split (str): Name of the id-list split to load.
            use_difficult (bool): Forwarded to ``VOCBboxDataset``.
        """
        self.opt = opt
        self.db = VOCBboxDataset(
            opt.voc_data_dir, split=split, use_difficult=use_difficult)

    def __getitem__(self, idx):
        """Return the evaluation sample at ``idx``.

        Returns:
            tuple: ``(img, size, bbox, label, difficult)`` where ``img`` is
            the output of ``preprocess`` with its default size limits,
            ``size`` is ``ori_img.shape[1:]`` and the remaining items are
            returned exactly as read from the dataset.
        """
        raw_img, boxes, labels, difficult = self.db.get_example(idx)
        return preprocess(raw_img), raw_img.shape[1:], boxes, labels, difficult

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.db)

2.2 voc_dataset.py

在这里插入图片描述

'''本py文件：用来解析voc数据集，得到原始图片的img, bbox, label, difficult'''
import os
import xml.etree.ElementTree as ET
import numpy as np
from .util import read_image
'''voc数据集的类名称'''
# Class names of the VOC detection task; the position of a name in this
# tuple is the integer label assigned to it.
VOC_BBOX_LABEL_NAMES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
)

'''VOCBoxDataset类：解析训练集 得到原始图片的：img, bbox（ymin, xmin, ymax, xmax]）, label, difficult'''
class VOCBboxDataset:
    """Bounding-box dataset read from a PASCAL VOC style directory.

    Args:
        data_dir (string): Root directory of the data, e.g.
            ``.../VOCdevkit/VOC2007``. It is expected to contain
            ``ImageSets/Main/<split>.txt``, ``JPEGImages/`` and
            ``Annotations/``.
        split (string): Name of the id-list file to load (e.g. ``'train'``,
            ``'val'``, ``'trainval'``, ``'test'``).
        use_difficult (bool): If ``True``, objects annotated as difficult
            are kept; otherwise they are skipped.
        return_difficult (bool): Stored on the instance but not consulted;
            ``get_example`` always returns the ``difficult`` array.
    """

    def __init__(self, data_dir, split='trainval',use_difficult=False, return_difficult=False,):
        """Read the list of image ids for ``split`` and store the options."""
        id_list_file = os.path.join(data_dir, 'ImageSets/Main/{0}.txt'.format(split))
        # Each line of the list file is one image id, e.g. '000005'. The
        # file is closed as soon as the ids are read.
        with open(id_list_file) as id_file:
            self.ids = [id_.strip() for id_ in id_file]

        self.data_dir = data_dir
        self.use_difficult = use_difficult
        self.return_difficult = return_difficult

        # Tuple of class names; a label is an index into this tuple.
        self.label_names = VOC_BBOX_LABEL_NAMES

    def __len__(self):
        """Return the number of image ids in the split."""
        return len(self.ids)

    def get_example(self, i):
        """Return the ``i``-th example of the original (unresized) data.

        Args:
            i (int): Index of the example.

        Returns:
            tuple: ``(img, bbox, label, difficult)`` where

            * ``img`` is whatever ``read_image(..., color=True)`` returns,
            * ``bbox`` is a ``float32`` array of shape ``(R, 4)`` holding
              ``(ymin, xmin, ymax, xmax)`` per object,
            * ``label`` is an ``int32`` array of shape ``(R,)`` with indices
              into ``VOC_BBOX_LABEL_NAMES``,
            * ``difficult`` is a ``uint8`` array of shape ``(R,)`` with the
              difficult flag of every kept object.

            When every object is skipped, ``R`` is 0 and the arrays are
            empty instead of raising an error.
        """
        id_ = self.ids[i]
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
        img = read_image(img_file, color=True)

        anno = ET.parse(os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
        bbox = []
        label = []
        difficult = []
        for obj in anno.findall('object'):
            is_difficult = int(obj.find('difficult').text)
            # When difficult objects are not wanted, skip them entirely.
            if not self.use_difficult and is_difficult == 1:
                continue
            difficult.append(is_difficult)

            bndbox_anno = obj.find('bndbox')
            # 1 is subtracted from every coordinate (presumably converting
            # the annotation's 1-based pixel indices to 0-based ones).
            bbox.append([int(bndbox_anno.find(tag).text) - 1
                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])

            name = obj.find('name').text.lower().strip()
            label.append(VOC_BBOX_LABEL_NAMES.index(name))

        # reshape(-1, 4) keeps the (R, 4) layout even when no object is left.
        bbox = np.array(bbox, dtype=np.float32).reshape(-1, 4)
        label = np.array(label, dtype=np.int32)
        # The builtin ``bool`` is used because the ``np.bool`` alias no
        # longer exists in current NumPy; the flags are then stored as uint8
        # (the original note says PyTorch does not support numpy bools).
        difficult = np.array(difficult, dtype=bool).astype(np.uint8)
        return img, bbox, label, difficult

    __getitem__ = get_example

2.3 util.py

在这里插入图片描述


'''本py文件用于。读取图像、调整边界框，图片及边界框的水平翻转'''
import numpy as np
from PIL import Image
import random

''' read_image()函数：读取图像（根据图片路径）返回图片的ndarray数组——(C, H, W)格式'''
def read_image(path, dtype=np.float32, color=True):
    """Read an image file and return it as a CHW array.

    Args:
        path (str): Path of the image file.
        dtype: Type of the returned array. Defaults to ``numpy.float32``.
        color (bool): If ``True`` (default) the image is converted to RGB
            and the result has three channels. If ``False`` the image is
            converted to single-channel grayscale.

    Returns:
        ~numpy.ndarray: The image with shape ``(C, H, W)``; a grayscale
        image is returned as ``(1, H, W)``.
    """
    # 'L' is PIL's 8-bit grayscale mode. Palette mode ('P') would yield
    # palette indices rather than intensities.
    mode = 'RGB' if color else 'L'
    # The context manager closes the file even if conversion fails.
    with Image.open(path) as f:
        img = np.asarray(f.convert(mode), dtype=dtype)

    if img.ndim == 2:
        # (H, W) -> (1, H, W)
        return img[np.newaxis]
    # (H, W, C) -> (C, H, W)
    return img.transpose((2, 0, 1))

'''resize_bbox()函数：返回调整大小后的边界框。根据原始图像h、w的调整尺度分别调整边界框的坐标，返回调整后的边界框（y,x,y,x）'''
def resize_bbox(bbox, in_size, out_size):
    """Rescale bounding boxes to follow a resize of their image.

    Args:
        bbox (~numpy.ndarray): Array of shape ``(R, 4)`` whose columns are
            ``(y_min, x_min, y_max, x_max)``.
        in_size (tuple): ``(height, width)`` of the image before resizing.
        out_size (tuple): ``(height, width)`` of the image after resizing.

    Returns:
        ~numpy.ndarray: A rescaled copy of ``bbox``; the input is not
        modified.
    """
    y_scale = float(out_size[0]) / in_size[0]
    x_scale = float(out_size[1]) / in_size[1]

    resized = bbox.copy()
    # Columns 0 and 2 are y coordinates, columns 1 and 3 are x coordinates.
    resized[:, [0, 2]] = y_scale * resized[:, [0, 2]]
    resized[:, [1, 3]] = x_scale * resized[:, [1, 3]]
    return resized

'''random_flip()函数：图片的水平翻转,返回翻转后的图片数组'''
def random_flip(img, y_random=False, x_random=False,return_param=False, copy=False):
    """Randomly flip an image vertically and/or horizontally.

    Args:
        img (~numpy.ndarray): Array to flip; axis 1 is treated as the
            vertical axis and axis 2 as the horizontal axis (CHW layout).
        y_random (bool): Flip vertically with probability one half.
        x_random (bool): Flip horizontally with probability one half.
        return_param (bool): Also return which flips were applied.
        copy (bool): If ``False``, the result may be a view of ``img``.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict): The (possibly) flipped
        image, and when ``return_param`` is ``True`` a dict with the boolean
        keys ``'y_flip'`` and ``'x_flip'``.
    """
    # A direction is only drawn when it was requested.
    y_flip = random.choice([True, False]) if y_random else False
    x_flip = random.choice([True, False]) if x_random else False

    if y_flip:
        img = img[:, ::-1, :]
    if x_flip:
        img = img[:, :, ::-1]

    result = img.copy() if copy else img
    if not return_param:
        return result
    return result, {'y_flip': y_flip, 'x_flip': x_flip}

'''flip_bbox()函数：边界框的水平翻转。返回翻转后的边界框（y,x,y,x）'''
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
    """Mirror bounding boxes to follow a flip of their image.

    Args:
        bbox (~numpy.ndarray): Array of shape ``(R, 4)`` whose columns are
            ``(y_min, x_min, y_max, x_max)``.
        size (tuple): ``(height, width)`` of the image the boxes belong to.
        y_flip (bool): Mirror the boxes for a vertical flip of the image.
        x_flip (bool): Mirror the boxes for a horizontal flip of the image.

    Returns:
        ~numpy.ndarray: A mirrored copy of ``bbox``; the input is not
        modified.
    """
    height, width = size
    flipped = bbox.copy()
    if y_flip:
        # The new y_min comes from the old y_max and vice versa.
        flipped[:, [0, 2]] = height - flipped[:, [2, 0]]
    if x_flip:
        # The new x_min comes from the old x_max and vice versa.
        flipped[:, [1, 3]] = width - flipped[:, [3, 1]]
    return flipped

'''crop_bbox()函数：用于裁剪边界框（y,x,y,x）。 此函数会截断边界框以适合裁剪区域。 如果边框与裁切区域不重叠，则将删除此边框。'''
def crop_bbox(bbox, y_slice=None, x_slice=None,allow_outside_center=True, return_param=False):
    """Clip bounding boxes to a cropped region and shift them into it.

    Boxes are truncated to the crop area given by ``y_slice`` and
    ``x_slice`` and translated so that the crop's top-left corner becomes
    the origin. Boxes that no longer have a positive height and width after
    truncation are removed.

    Boxes are rows of ``(y_min, x_min, y_max, x_max)``.

    Args:
        bbox (~numpy.ndarray): Boxes to crop, shape ``(R, 4)``.
        y_slice (slice): Crop range along the y axis; ``None`` or a missing
            bound means unbounded on that side.
        x_slice (slice): Crop range along the x axis, same convention.
        allow_outside_center (bool): If ``False``, boxes whose center lies
            outside the crop area are removed as well. Defaults to ``True``.
        return_param (bool): If ``True``, also return the indices of the
            boxes that were kept.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict): The cropped boxes, and
        when ``return_param`` is ``True`` a dict whose ``'index'`` entry is
        an array with the indices of the kept boxes.
    """
    # Bounds of the crop area.
    top, bottom = _slice_to_bounds(y_slice)
    left, right = _slice_to_bounds(x_slice)
    crop_bb = np.array((top, left, bottom, right))

    # Optionally drop boxes whose center is not inside the crop area.
    if allow_outside_center:
        mask = np.ones(bbox.shape[0], dtype=bool)
    else:
        center = (bbox[:, :2] + bbox[:, 2:]) / 2.0
        mask = np.logical_and(
            crop_bb[:2] <= center, center < crop_bb[2:]).all(axis=1)

    # Truncate to the crop area, then move the origin to its top-left corner.
    bbox = bbox.copy()
    bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
    bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
    bbox[:, :2] -= crop_bb[:2]
    bbox[:, 2:] -= crop_bb[:2]

    # Keep only boxes that still have a positive extent on both axes.
    mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
    bbox = bbox[mask]

    if return_param:
        return bbox, {'index': np.flatnonzero(mask)}
    return bbox


def _slice_to_bounds(slice_):
    """Return ``(lower, upper)`` bounds of ``slice_``.

    A missing slice or a missing bound maps to ``0`` for the lower side and
    ``numpy.inf`` for the upper side. The slice step is not consulted.
    """
    if slice_ is None:
        return 0, np.inf

    lower = 0 if slice_.start is None else slice_.start
    upper = np.inf if slice_.stop is None else slice_.stop
    return lower, upper

'''translate_bbox（）函数：根据指定的偏移，调整边界框，返回调整后的边界框（y,x,y,x）'''
def translate_bbox(bbox, y_offset=0, x_offset=0):
    """Shift bounding boxes by a fixed offset.

    Mainly used together with image transforms such as padding and
    cropping, which move the image's top-left corner from ``(0, 0)`` to
    ``(y_offset, x_offset)``.

    Args:
        bbox (~numpy.ndarray): Boxes to shift, shape ``(R, 4)`` with rows of
            ``(y_min, x_min, y_max, x_max)``.
        y_offset (int or float): Offset along the y axis.
        x_offset (int or float): Offset along the x axis.

    Returns:
        ~numpy.ndarray: A shifted copy of ``bbox``; the input is not
        modified.
    """
    shifted = bbox.copy()
    # Both corners receive the same (y, x) shift, so the offset pair is
    # repeated once for the min corner and once for the max corner.
    shifted += (y_offset, x_offset) * 2
    return shifted

不断进步的咸鱼

关注

3
点赞
踩
25

收藏

觉得还不错? 一键收藏
1
评论
【目标检测】Faster RCNN代码实现 ——（1）

文章目录一、项目总览二、data文件夹代码解析2.1 dataset.py2.2 voc_dataset.py2.3 utils.py一、项目总览二、data文件夹代码解析2.1 dataset.py （1）inverse_normalize()：将图片数组的值（范围为-1~1，有固定的均值和标准差）反规范化，还原到原始图像的像素值。（2）pytorch_normalze()、caffe_normalize：将图片数组的值规范化，使得图像数组的值均值为mean，标准差为std。（
复制链接

扫一扫

专栏目录