FCN Source Code Walkthrough: voc_layers.py

voc_layers.py is FCN's data layer written in Python (i.e., a data input layer implemented with Caffe's Python API). Its structure is largely fixed: it must provide the four methods setup(), reshape(), forward(), and backward().
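
As a reminder of what that fixed format looks like, a Caffe Python layer is simply a subclass of caffe.Layer implementing those four methods. A minimal sketch (illustrative only, not the FCN code itself):

import caffe

class MinimalDataLayer(caffe.Layer):
    """Bare-bones shape of a Python data layer; the real voc_layers.py follows below."""

    def setup(self, bottom, top):
        # parse self.param_str, read file lists, check the number of tops/bottoms
        pass

    def reshape(self, bottom, top):
        # load the next sample and resize the top blobs to match its shape
        pass

    def forward(self, bottom, top):
        # copy the loaded sample into top[i].data and advance to the next index
        pass

    def backward(self, top, propagate_down, bottom):
        # a data layer has nothing to back-propagate
        pass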

The full source is as follows:

import caffe  

import numpy as np  
from PIL import Image  

import random  

class VOCSegDataLayer(caffe.Layer):  
    """ 
    Load (input image, label image) pairs from PASCAL VOC 
    one-at-a-time while reshaping the net to preserve dimensions. 

    Use this to feed data to a fully convolutional network. 
    """  

    def setup(self, bottom, top):  
        """ 
        Setup data layer according to parameters: 

        - voc_dir: path to PASCAL VOC year dir 
        - split: train / val / test 
        - mean: tuple of mean values to subtract 
        - randomize: load in random order (default: True) 
        - seed: seed for randomization (default: None / current time) 

        for PASCAL VOC semantic segmentation. 

        example 

        params = dict(voc_dir="/path/to/PASCAL/VOC2011", 
            mean=(104.00698793, 116.66876762, 122.67891434), 
            split="val") 
        """  
        # config  
        params = eval(self.param_str)  
        self.voc_dir = params['voc_dir']  
        self.split = params['split']  
        self.mean = np.array(params['mean'])  
        self.random = params.get('randomize', True)  
        self.seed = params.get('seed', None)  

        # two tops: data and label  
        if len(top) != 2:  
            raise Exception("Need to define two tops: data and label.")  
        # data layers have no bottoms  
        if len(bottom) != 0:  
            raise Exception("Do not define a bottom.")  

        # load indices for images and labels  
        split_f  = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,  
                self.split)  
        self.indices = open(split_f, 'r').read().splitlines()  
        self.idx = 0  

        # make eval deterministic  
        if 'train' not in self.split:  
            self.random = False  

        # randomization: seed and pick  
        if self.random:  
            random.seed(self.seed)  
            self.idx = random.randint(0, len(self.indices)-1)  


    def reshape(self, bottom, top):  
        # load image + label image pair  
        self.data = self.load_image(self.indices[self.idx])  
        self.label = self.load_label(self.indices[self.idx])  
        # reshape tops to fit (leading 1 is for batch dimension)  
        top[0].reshape(1, *self.data.shape)  
        top[1].reshape(1, *self.label.shape)  


    def forward(self, bottom, top):  
        # assign output  
        top[0].data[...] = self.data  
        top[1].data[...] = self.label  

        # pick next input  
        if self.random:  
            self.idx = random.randint(0, len(self.indices)-1)  
        else:  
            self.idx += 1  
            if self.idx == len(self.indices):  
                self.idx = 0  


    def backward(self, top, propagate_down, bottom):  
        pass  


    def load_image(self, idx):  
        """ 
        Load input image and preprocess for Caffe: 
        - cast to float 
        - switch channels RGB -> BGR 
        - subtract mean 
        - transpose to channel x height x width order 
        """  
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))  
        in_ = np.array(im, dtype=np.float32)  
        in_ = in_[:,:,::-1]  
        in_ -= self.mean  
        in_ = in_.transpose((2,0,1))  
        return in_  


    def load_label(self, idx):  
        """ 
        Load label image as 1 x height x width integer array of label indices. 
        The leading singleton dimension is required by the loss. 
        """  
        im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))  
        label = np.array(im, dtype=np.uint8)  
        label = label[np.newaxis, ...]  
        return label  


class SBDDSegDataLayer(caffe.Layer):  
    """ 
    Load (input image, label image) pairs from the SBDD extended labeling 
    of PASCAL VOC for semantic segmentation 
    one-at-a-time while reshaping the net to preserve dimensions. 

    Use this to feed data to a fully convolutional network. 
    """  

    def setup(self, bottom, top):  
        """ 
        Setup data layer according to parameters: 

        - sbdd_dir: path to SBDD `dataset` dir 
        - split: train / seg11valid 
        - mean: tuple of mean values to subtract 
        - randomize: load in random order (default: True) 
        - seed: seed for randomization (default: None / current time) 

        for SBDD semantic segmentation. 

        N.B.segv11alid is the set of segval11 that does not intersect with SBDD. 
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e. 

        example 

        params = dict(sbdd_dir="/path/to/SBDD/dataset", 
            mean=(104.00698793, 116.66876762, 122.67891434), 
            split="valid") 
        """  
        # config  
        params = eval(self.param_str)  
        self.sbdd_dir = params['sbdd_dir']  
        self.split = params['split']  
        self.mean = np.array(params['mean'])  
        self.random = params.get('randomize', True)  
        self.seed = params.get('seed', None)  

        # two tops: data and label  
        if len(top) != 2:  
            raise Exception("Need to define two tops: data and label.")  
        # data layers have no bottoms  
        if len(bottom) != 0:  
            raise Exception("Do not define a bottom.")  

        # load indices for images and labels  
        split_f  = '{}/{}.txt'.format(self.sbdd_dir,  
                self.split)  
        self.indices = open(split_f, 'r').read().splitlines()  
        self.idx = 0  

        # make eval deterministic  
        if 'train' not in self.split:  
            self.random = False  

        # randomization: seed and pick  
        if self.random:  
            random.seed(self.seed)  
            self.idx = random.randint(0, len(self.indices)-1)  


    def reshape(self, bottom, top):  
        # load image + label image pair  
        self.data = self.load_image(self.indices[self.idx])  
        self.label = self.load_label(self.indices[self.idx])  
        # reshape tops to fit (leading 1 is for batch dimension)  
        top[0].reshape(1, *self.data.shape)  
        top[1].reshape(1, *self.label.shape)  


    def forward(self, bottom, top):  
        # assign output  
        top[0].data[...] = self.data  
        top[1].data[...] = self.label  

        # pick next input  
        if self.random:  
            self.idx = random.randint(0, len(self.indices)-1)  
        else:  
            self.idx += 1  
            if self.idx == len(self.indices):  
                self.idx = 0  


    def backward(self, top, propagate_down, bottom):  
        pass  


    def load_image(self, idx):  
        """ 
        Load input image and preprocess for Caffe: 
        - cast to float 
        - switch channels RGB -> BGR 
        - subtract mean 
        - transpose to channel x height x width order 
        """  
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))  
        in_ = np.array(im, dtype=np.float32)  
        in_ = in_[:,:,::-1]  
        in_ -= self.mean  
        in_ = in_.transpose((2,0,1))  
        return in_  


    def load_label(self, idx):  
        """ 
        Load label image as 1 x height x width integer array of label indices. 
        The leading singleton dimension is required by the loss. 
        """  
        import scipy.io  
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))  
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)  
        label = label[np.newaxis, ...]  
        return label  

A detailed walkthrough of the code follows.

1. The VOCSegDataLayer class (i.e., the test-time input layer pylayer declared in net.py)

This class corresponds to the input layer in val.prototxt:

layer {  
  name: "data"  
  type: "Python"  
  top: "data"  
  top: "label"  
  python_param {  
    module: "voc_layers"  
    layer: "VOCSegDataLayer"  
    param_str: "{\'voc_dir\': \'../data/VOC2012\', \'seed\': 1337, \'split\': \'seg11valid\', \'mean\': (104.00699, 116.66877, 122.67892)}"  
  }  
}  
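
For reference, net.py in the FCN repository declares this layer programmatically through Caffe's NetSpec API rather than by hand-writing the prototxt. A sketch of that declaration (the exact parameter dict used in net.py may differ from this one):

import caffe
from caffe import layers as L

n = caffe.NetSpec()
pydata_params = dict(voc_dir='../data/VOC2012', seed=1337, split='seg11valid',
                     mean=(104.00699, 116.66877, 122.67892))
# two tops (ntop=2): data and label, produced by VOCSegDataLayer from module voc_layers
n.data, n.label = L.Python(module='voc_layers', layer='VOCSegDataLayer',
                           ntop=2, param_str=str(pydata_params))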

The annotated source is explained below:

# VOCSegDataLayer defines the data layer used at test time (i.e., for the validation/test set).
# Its methods follow Caffe's Python layer interface; see https://chrischoy.github.io/research/caffe-python-layer/ for details.
class VOCSegDataLayer(caffe.Layer):  
    """ 
    Load (input image, label image) pairs from PASCAL VOC 
    one-at-a-time while reshaping the net to preserve dimensions. 

    Use this to feed data to a fully convolutional network. 
    """  
    # setup(): configure the data layer from its parameters
    def setup(self, bottom, top):  
        """ 
        Setup data layer according to parameters: 

        - voc_dir: path to the PASCAL VOC year dir (where the validation/test images live)
        - split: train / val / test (any of the three, so the training set can be evaluated as well)
        - mean: tuple of per-channel mean values to subtract (mean subtraction speeds up convergence)
        - randomize: load images in random order when True (default: True)
        - seed: seed for the random ordering (default: None, i.e., the current time)

        for PASCAL VOC semantic segmentation. 

        example 

        params = dict(voc_dir="/path/to/PASCAL/VOC2011", 
            mean=(104.00698793, 116.66876762, 122.67891434), 
            split="val") 
        """  
        # config  
        params = eval(self.param_str)  # parse the param_str string from the prototxt into a dict
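        # Note: eval() executes param_str as arbitrary Python code. When adapting this
        # layer, ast.literal_eval(self.param_str) is a safer drop-in for a plain dict
        # literal (a suggested alternative, not part of the original FCN code):
        #   import ast
        #   params = ast.literal_eval(self.param_str)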
        self.voc_dir = params['voc_dir']  
        self.split = params['split']  
        self.mean = np.array(params['mean'])  
        self.random = params.get('randomize', True)  
        self.seed = params.get('seed', None)  

        # two tops: data and label  
        # the layer must expose exactly two tops: data and label
        if len(top) != 2:  
            raise Exception("Need to define two tops: data and label.")  
        # data layers have no bottoms  
        # a data layer defines no bottoms (it takes no inputs)
        if len(bottom) != 0:  
            raise Exception("Do not define a bottom.")  

        # load indices for images and labels  
        # read the image IDs to load from <split>.txt (each line is an image name without extension)
        split_f  = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,  
                self.split)  # the first {} is self.voc_dir, the second {} is self.split
        # splitlines() splits the file contents on line boundaries ('\r', '\r\n', '\n') and
        # returns a list, so self.indices holds all image IDs, one per element
        self.indices = open(split_f, 'r').read().splitlines()  
        self.idx = 0  # index into self.indices, initialized to 0

        # make eval deterministic  
        # when the split is not a training split (e.g., val or test), disable random loading
        if 'train' not in self.split:  
            self.random = False  

        # randomization: seed and pick  
        # check whether random image loading is enabled
        if self.random:  
            random.seed(self.seed)  
            # pick a random index in [0, len(self.indices) - 1] (random.randint is inclusive at both ends)
            self.idx = random.randint(0, len(self.indices)-1)

    # reshape(): load the image/label pair selected by idx and resize the top blobs to fit
    def reshape(self, bottom, top):  
        # load image + label image pair  
        # load_image() and load_label() are defined further below
        self.data = self.load_image(self.indices[self.idx])  
        self.label = self.load_label(self.indices[self.idx])  
        # reshape tops to fit (leading 1 is for batch dimension)  
        '''
        Resize the top blobs to fit the current pair. Caffe allows the data blobs to
        change shape between iterations, because their size does not affect the size of
        any layer's parameters. Data is stored as N x C x H x W (N = batch size,
        C = channels, H/W = height/width); the leading 1 here is the batch size,
        matching the FCN paper's use of SGD with a single image per iteration.
        '''
        top[0].reshape(1, *self.data.shape)  #data  
        top[1].reshape(1, *self.label.shape) #label  
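        # For example, a 500 x 375 (width x height) VOC image yields self.data of shape
        # (3, 375, 500) and self.label of shape (1, 375, 500), so top[0] is reshaped to
        # (1, 3, 375, 500) and top[1] to (1, 1, 375, 500).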

    # forward(): the data layer performs no computation; it simply emits the loaded data
    def forward(self, bottom, top):  
        # assign output  
        top[0].data[...] = self.data  
        top[1].data[...] = self.label  
        # while emitting the data, also choose the image for the next iteration (i.e., the next idx)
        # pick next input  
        if self.random:  
            self.idx = random.randint(0, len(self.indices)-1)  
        else:  
            self.idx += 1  
            if self.idx == len(self.indices):  
                self.idx = 0  

    # a data layer has nothing to back-propagate, so backward() is a no-op
    def backward(self, top, propagate_down, bottom):  
        pass  

    # load and preprocess the input image selected by index idx
    def load_image(self, idx):  
        """ 
        Load input image and preprocess it into Caffe's input format:
        - cast to float
        - switch channels RGB -> BGR (Caffe's reference models expect BGR, the channel
          order produced by OpenCV image loading)
        - subtract the per-channel mean
        - transpose to channel x height x width order (the layout Caffe stores data in)
        """  
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))  
        in_ = np.array(im, dtype=np.float32)  
        in_ = in_[:,:,::-1]  # step -1 reverses the last (channel) axis: RGB -> BGR
        in_ -= self.mean  # subtract the per-channel mean
        in_ = in_.transpose((2,0,1))  # move channels to the front: H x W x C -> C x H x W
        return in_  

A three-channel color image read with PIL in Python is stored as H x W x C, with the channels in the standard RGB order, so it has to be converted before being fed into Caffe.

Caffe stores data as N x C x H x W, with the three channels in BGR order.

Therefore the image must first be converted from RGB to BGR, which the single statement in_ = in_[:,:,::-1] accomplishes. The following example shows how it works (think of the third axis of a as the channel axis C in RGB order, and the first two axes as H and W):

import numpy as np  
a = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]],[[13,14,15],[16,17,18]]])  
print(str(a.shape))  
print(str(a))  
a = a[:,:,::-1]   
#a = a.transpose((2,0,1))  
print(str(a.shape))  
print(str(a))  

In this example, the pixel at the first row and first column has channel values R=1, G=2, B=3.

The output is as follows (the same pixel now reads B=3, G=2, R=1):

(3L, 2L, 3L)  
[[[ 1  2  3]  
  [ 4  5  6]]  

 [[ 7  8  9]  
  [10 11 12]]  

 [[13 14 15]  
  [16 17 18]]]  
(3L, 2L, 3L)  
[[[ 3  2  1]  
  [ 6  5  4]]  

 [[ 9  8  7]  
  [12 11 10]]  

 [[15 14 13]  
  [18 17 16]]]  

On top of that, the per-channel means are subtracted so that the input is roughly zero-centered, which helps the optimization converge faster.
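
Mean subtraction relies on NumPy broadcasting: the (3,)-shaped mean is subtracted from every pixel of the H x W x 3 image. A small self-contained illustration (the values are arbitrary):

import numpy as np

img = np.zeros((2, 2, 3), dtype=np.float32)            # H x W x C, already in BGR order
img[..., 0], img[..., 1], img[..., 2] = 110., 120., 130.
mean = np.array((104.00699, 116.66877, 122.67892))
out = img - mean                                        # broadcasts over the channel axis
print(out[0, 0])                                        # -> approximately [5.993 3.331 7.321]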

Finally, to match Caffe's data layout, the channel axis is moved to the front using NumPy's transpose(); the following example shows the effect:

import numpy as np  
a = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]],[[13,14,15],[16,17,18]]])  
print(str(a.shape))  
print(str(a))  
a = a[:,:,::-1]   
a = a.transpose((2,0,1))  
print(str(a.shape))  
print(str(a))  

The output is as follows (the original third axis has become the first):

(3L, 2L, 3L)  
[[[ 1  2  3]  
  [ 4  5  6]]  

 [[ 7  8  9]  
  [10 11 12]]  

 [[13 14 15]  
  [16 17 18]]]  
(3L, 3L, 2L)  
[[[ 3  6]  
  [ 9 12]  
  [15 18]]  

 [[ 2  5]  
  [ 8 11]  
  [14 17]]  

 [[ 1  4]  
  [ 7 10]  
  [13 16]]]  

For example, the block

[[ 3  6]  
  [ 9 12]  
  [15 18]]  

in the result holds the B-channel values of all pixels, i.e., it is the B channel of the original image.
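
For completeness, the whole preprocessing chain can be inverted when you want to visualize what the network actually receives. A minimal sketch (this deprocess helper is not part of voc_layers.py; it just reverses the three steps above):

import numpy as np
from PIL import Image

def deprocess(in_, mean):
    """Invert load_image(): C x H x W, BGR, mean-subtracted -> displayable RGB image."""
    im = in_.transpose((1, 2, 0)) + mean   # back to H x W x C and add the mean back
    im = im[:, :, ::-1]                    # BGR -> RGB
    return Image.fromarray(np.clip(im, 0, 255).astype(np.uint8))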

# load the label image selected by index idx
def load_label(self, idx):  
    """ 
    Load label image as 1 x height x width integer array of label indices. 
    The leading singleton dimension is required by the loss. 
    """  
    im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))  
    label = np.array(im, dtype=np.uint8)  # the label image is single-channel
    # np.newaxis inserts a new leading axis, turning the H x W array into 1 x H x W
    label = label[np.newaxis, ...]  
    return label  
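
The effect of np.newaxis is easy to check in isolation (the shape below is just an example, not tied to any particular VOC image):

import numpy as np

label = np.zeros((375, 500), dtype=np.uint8)   # a dummy H x W label map
label = label[np.newaxis, ...]
print(label.shape)                             # (1, 375, 500): the leading 1 the loss expects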

2. The SBDDSegDataLayer class (i.e., the training-time input layer pylayer declared in net.py)

This class corresponds to the input layer in train.prototxt:

layer {  
  name: "data"  
  type: "Python"  
  top: "data"  
  top: "label"  
  python_param {  
    module: "voc_layers"  
    layer: "SBDDSegDataLayer"  
    param_str: "{\'sbdd_dir\': \'../data/VOC2012\', \'seed\': 1337, \'split\': \'train\', \'mean\': (104.00699, 116.66877, 122.67892)}"  
  }  
}  

The SBDDSegDataLayer class closely mirrors VOCSegDataLayer, so it is not re-annotated in full here; only one small point deserves a note.

# SBDDSegDataLayer defines the data layer used at training time (i.e., for the training set);
# its methods follow the same Caffe Python layer interface as above.
class SBDDSegDataLayer(caffe.Layer):  
    """ 
    Load (input image, label image) pairs from the SBDD extended labeling 
    of PASCAL VOC for semantic segmentation 
    one-at-a-time while reshaping the net to preserve dimensions. 

    Use this to feed data to a fully convolutional network. 
    """  

    def setup(self, bottom, top):  
        """ 
        Setup data layer according to parameters: 

        - sbdd_dir: path to SBDD `dataset` dir 
        - split: train / seg11valid 
        - mean: tuple of mean values to subtract 
        - randomize: load in random order (default: True) 
        - seed: seed for randomization (default: None / current time) 

        for SBDD semantic segmentation. 

        N.B.segv11alid is the set of segval11 that does not intersect with SBDD. 
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e. 

        example 

        params = dict(sbdd_dir="/path/to/SBDD/dataset", 
            mean=(104.00698793, 116.66876762, 122.67891434), 
            split="valid") 
        """  
        # config  
        params = eval(self.param_str)  
        self.sbdd_dir = params['sbdd_dir']  
        self.split = params['split']  
        self.mean = np.array(params['mean'])  
        self.random = params.get('randomize', True)  
        self.seed = params.get('seed', None)  

        # two tops: data and label  
        if len(top) != 2:  
            raise Exception("Need to define two tops: data and label.")  
        # data layers have no bottoms  
        if len(bottom) != 0:  
            raise Exception("Do not define a bottom.")  

        # load indices for images and labels  
        split_f  = '{}/{}.txt'.format(self.sbdd_dir,  
                self.split)  
        self.indices = open(split_f, 'r').read().splitlines()  
        self.idx = 0  

        # make eval deterministic  
        if 'train' not in self.split:  
            self.random = False  

        # randomization: seed and pick  
        if self.random:  
            random.seed(self.seed)  
            self.idx = random.randint(0, len(self.indices)-1)  


    def reshape(self, bottom, top):  
        # load image + label image pair  
        self.data = self.load_image(self.indices[self.idx])  
        self.label = self.load_label(self.indices[self.idx])  
        # reshape tops to fit (leading 1 is for batch dimension)  
        top[0].reshape(1, *self.data.shape)  
        top[1].reshape(1, *self.label.shape)  


    def forward(self, bottom, top):  
        # assign output  
        top[0].data[...] = self.data  
        top[1].data[...] = self.label  

        # pick next input  
        if self.random:  
            self.idx = random.randint(0, len(self.indices)-1)  
        else:  
            self.idx += 1  
            if self.idx == len(self.indices):  
                self.idx = 0  


    def backward(self, top, propagate_down, bottom):  
        pass  


    def load_image(self, idx):  
        """ 
        Load input image and preprocess for Caffe: 
        - cast to float 
        - switch channels RGB -> BGR 
        - subtract mean 
        - transpose to channel x height x width order 
        """  
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))  
        in_ = np.array(im, dtype=np.float32)  
        in_ = in_[:,:,::-1]  
        in_ -= self.mean  
        in_ = in_.transpose((2,0,1))  
        return in_  


    def load_label(self, idx):  
        """ 
        Load label image as 1 x height x width integer array of label indices. 
        The leading singleton dimension is required by the loss. 
        """  
        import scipy.io  
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))  # training labels are stored as .mat files
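        # loadmat returns the MATLAB struct GTcls wrapped in nested 1x1 object arrays,
        # which is why the next line indexes [0] twice; its 'Segmentation' field is the
        # H x W array of class indices.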
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)  
        label = label[np.newaxis, ...]  
        return label  

The label images for the training samples loaded by SBDDSegDataLayer are stored as Matlab .mat files. In practice there is no need to keep that format: following VOCSegDataLayer, the labels can be read directly from .png (or .jpg) files by rewriting load_label() as:

    def load_label(self, idx):
        """ 
        Load label image as 1 x height x width integer array of label indices. 
        The leading singleton dimension is required by the loss. 
        """  
        im = Image.open('{}/SegmentationClass/{}.png'.format(self.sbdd_dir, idx))  
        label = np.array(im, dtype=np.uint8)  
        label = label[np.newaxis, ...]  
        return label  
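
If you switch to .png labels this way, it is worth checking once that they match the original .mat annotations. A hedged sanity check (the directory layout and the example index are assumptions; adjust them to your copy of the dataset):

import numpy as np
import scipy.io
from PIL import Image

sbdd_dir = '../data/sbdd/dataset'          # assumed SBDD layout
idx = '2008_000002'                        # any id taken from train.txt
mat = scipy.io.loadmat('{}/cls/{}.mat'.format(sbdd_dir, idx))
from_mat = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
from_png = np.array(Image.open('{}/SegmentationClass/{}.png'.format(sbdd_dir, idx)), dtype=np.uint8)

valid = from_png != 255                    # VOC-style .png labels may mark borders as 255 (void)
print(np.array_equal(from_mat[valid], from_png[valid]))   # True if the .png labels are consistent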

Reposted from: https://blog.csdn.net/qq_21368481/article/details/80246028
