朱师兄论文复现

  • 训练数据集的位置是FCN-data-nyud-data-images-ISO_rot_UZ{}.mat共29个
  • 生成网络结构文件:使用Python脚本(net.py)生成 trainval.prototxt 和 test.prototxt
  • 准备Solver文件:手动编写一个 solver.prototxt 文件。
  • 启动训练:使用命令 caffe train -solver solver.prototxt 来启动训练过程。
  • 其中nyud_dir为 FCN/data/nyud

fcn8s-color

其中 deploy.prototxt 是测试(推理)时使用的常见配置文件:它配合训练好的 caffemodel 文件使用,用于定义一个训练好的模型在推理(预测)阶段的结构和设置。与训练用的 trainval.prototxt 文件不同,deploy.prototxt 文件通常用于在生产环境中进行模型推理(输入维度变了,另外还有一个参数 num_output,即类别数?);现在的数据层是一个可以接受任意输入的 Input 层。

layer {
  name: "input"
  type: "Input"
  top: "data"
  input_param {
    # These dimensions are purely for sake of example;
    # see infer.py for how to reshape the net to the given input size.
    shape { dim: 1 dim: 1 dim: 401 dim: 401 }
# batch=1 channel=1(灰度值) nx=401 nz=401,一张一张地测试

#输入层定义了网络的输入数据,形状为(1, 1, 401, 401),即批大小为1,1个通道,图像高度为401,宽度为401
  }
}
#卷积层conv1_1使用64个3x3卷积核,填充100,步幅为1。ReLU层relu1_1应用非线性激活
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 100
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_1"
  type: "ReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
#卷积层conv1_2使用64个3x3卷积核,填充1,步幅为1。ReLU层relu1_2应用非线性激活
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_2"
  type: "ReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
#池化层pool1使用2x2的最大池化,步幅为2
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1_2"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
# 网络继续使用更多的卷积、ReLU和池化层,结构类似于上述第一和第二组卷积层和池化层。
# 每一组层将逐渐增加卷积核的数量(如128, 256, 512),并应用相应的激活和池化操作
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_1"
  type: "ReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_2"
  type: "ReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2_2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_1"
  type: "ReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_2"
  type: "ReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_3"
  type: "ReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3_3"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_1"
  type: "ReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_2"
  type: "ReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_3"
  type: "ReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
layer {
  name: "pool4"
  type: "Pooling"
  bottom: "conv4_3"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5_1"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_1"
  type: "ReLU"
  bottom: "conv5_1"
  top: "conv5_1"
}
layer {
  name: "conv5_2"
  type: "Convolution"
  bottom: "conv5_1"
  top: "conv5_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_2"
  type: "ReLU"
  bottom: "conv5_2"
  top: "conv5_2"
}
layer {
  name: "conv5_3"
  type: "Convolution"
  bottom: "conv5_2"
  top: "conv5_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_3"
  type: "ReLU"
  bottom: "conv5_3"
  top: "conv5_3"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5_3"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
#fc6层是一个使用7x7卷积核的卷积层,相当于全连接层,有4096个输出。relu6应用ReLU激活,drop6应用Dropout防止过拟合。

layer {
  name: "fc6"
  type: "Convolution"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 7
    stride: 1
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "Convolution"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 1
    stride: 1
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
#score_fr层使用1x1卷积核,有75个输出,用于分类任务。
layer {
  name: "score_fr"
  type: "Convolution"
  bottom: "fc7"
  top: "score_fr"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 75
    pad: 0
    kernel_size: 1
  }
}
#upscore2是一个反卷积层,将分类分数上采样一倍。

#类似的反卷积和融合操作在后续层中重复,逐步将特征图上采样至与输入图像相同的空间分辨率
layer {
  name: "upscore2"
  type: "Deconvolution"
  bottom: "score_fr"
  top: "upscore2"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 75
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool4"
  type: "Convolution"
  bottom: "pool4"
  top: "score_pool4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 75
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool4c"
  type: "Crop"
  bottom: "score_pool4"
  bottom: "upscore2"
  top: "score_pool4c"
  crop_param {
    axis: 2
    offset: 5
  }
}
layer {
  name: "fuse_pool4"
  type: "Eltwise"
  bottom: "upscore2"
  bottom: "score_pool4c"
  top: "fuse_pool4"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore_pool4"
  type: "Deconvolution"
  bottom: "fuse_pool4"
  top: "upscore_pool4"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 75
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool3"
  type: "Convolution"
  bottom: "pool3"
  top: "score_pool3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 75
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool3c"
  type: "Crop"
  bottom: "score_pool3"
  bottom: "upscore_pool4"
  top: "score_pool3c"
  crop_param {
    axis: 2
    offset: 9
  }
}
layer {
  name: "fuse_pool3"
  type: "Eltwise"
  bottom: "upscore_pool4"
  bottom: "score_pool3c"
  top: "fuse_pool3"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore8"
  type: "Deconvolution"
  bottom: "fuse_pool3"
  top: "upscore8"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 75
    bias_term: false
    kernel_size: 16
    stride: 8
  }
}
#最后,score层通过裁剪操作将上采样后的特征图对齐到输入图像的尺寸,生成最终的分割结果
layer {
  name: "score"
  type: "Crop"
  bottom: "upscore8"
  bottom: "data"
  top: "score"
  crop_param {
    axis: 2
    offset: 31
  }
}

net.py:这段代码使用了Caffe框架,定义了一个基于全卷积网络(Fully Convolutional Network, FCN)的深度学习模型,并将其保存为Caffe的网络配置文件(trainval.prototxt 和 test.prototxt)。模型的目的是进行语义分割任务,即对图像的每个像素进行分类。以下是代码的详细解释

(全连接层的num_output=5/75)

import caffe
from caffe import layers as L, params as P
from caffe.coord_map import crop

def conv_relu(bottom, nout, ks=3, stride=1, pad=1):
    """Create a Convolution layer followed by an in-place ReLU.

    Args:
        bottom: layer (top) that feeds the convolution.
        nout: number of convolution outputs (``num_output``).
        ks: kernel size.
        stride: convolution stride.
        pad: padding added around the input.

    Returns:
        Tuple ``(conv, relu)`` of the two NetSpec layer tops.
    """
    # Two per-blob multiplier entries; in Caffe's convention the first applies
    # to the weights and the second to the bias.
    first_blob_spec = dict(lr_mult=1, decay_mult=1)
    second_blob_spec = dict(lr_mult=2, decay_mult=0)
    conv_layer = L.Convolution(
        bottom,
        kernel_size=ks,
        stride=stride,
        num_output=nout,
        pad=pad,
        param=[first_blob_spec, second_blob_spec])
    relu_layer = L.ReLU(conv_layer, in_place=True)
    return conv_layer, relu_layer

def max_pool(bottom, ks=2, stride=2):
    """Return a MAX Pooling layer over ``bottom`` with the given kernel size and stride."""
    pooled = L.Pooling(
        bottom,
        pool=P.Pooling.MAX,
        kernel_size=ks,
        stride=stride)
    return pooled

def fcn(split, tops, num_classes=75):
    """Build the FCN-8s style network for one data split.

    Args:
        split: name of the split handed to the ``NYUDSegDataLayer`` Python
            data layer (``make_net`` passes 'trainval' and 'test').
        tops: list of outputs requested from the data layer. The layer is
            declared with ``ntop=2`` and bound to ``n.data`` / ``n.label``,
            so two entries are expected.
        num_classes: ``num_output`` shared by every scoring and upsampling
            layer. Defaults to 75, the value used by the rest of this
            network definition.

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled network.
    """
    n = caffe.NetSpec()
    n.data, n.label = L.Python(module='nyud_layers',
            layer='NYUDSegDataLayer', ntop=2,
            param_str=str(dict(nyud_dir='../data/nyud', split=split,
                tops=tops, seed=1337)))

    # the base net (VGG-style conv/ReLU/pool stacks); the first conv pads by
    # 100 so the later crops have room to align the upsampled maps
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)

    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)

    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)

    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)

    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)

    # fully conv
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)

    # coarse per-class scores, upsampled 2x
    n.score_fr = L.Convolution(n.drop7, num_output=num_classes, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.upscore2 = L.Deconvolution(n.score_fr,
        convolution_param=dict(num_output=num_classes, kernel_size=4, stride=2,
            bias_term=False),
        param=[dict(lr_mult=0)])

    # skip connection from pool4
    n.score_pool4 = L.Convolution(n.pool4, num_output=num_classes, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.score_pool4c = crop(n.score_pool4, n.upscore2)
    n.fuse_pool4 = L.Eltwise(n.upscore2, n.score_pool4c,
            operation=P.Eltwise.SUM)
    # This layer must emit the same channel count as score_pool3 below,
    # because fuse_pool3 sums the two blobs element-wise.
    n.upscore_pool4 = L.Deconvolution(n.fuse_pool4,
        convolution_param=dict(num_output=num_classes, kernel_size=4, stride=2,
            bias_term=False),
        param=[dict(lr_mult=0)])

    # skip connection from pool3
    n.score_pool3 = L.Convolution(n.pool3, num_output=num_classes, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.score_pool3c = crop(n.score_pool3, n.upscore_pool4)
    n.fuse_pool3 = L.Eltwise(n.upscore_pool4, n.score_pool3c,
            operation=P.Eltwise.SUM)
    n.upscore8 = L.Deconvolution(n.fuse_pool3,
        convolution_param=dict(num_output=num_classes, kernel_size=16, stride=8,
            bias_term=False),
        param=[dict(lr_mult=0)])

    # crop back to the input's spatial size and attach the loss
    n.score = crop(n.upscore8, n.data)
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
            loss_param=dict(normalize=False, ignore_label=255))

    return n.to_proto()

def make_net():
    """Generate trainval.prototxt and test.prototxt in the current directory.

    Each file holds the text form of the network returned by ``fcn`` for the
    split of the same name, using the 'color' and 'label' data-layer outputs.
    """
    tops = ['color', 'label']
    for split_name in ('trainval', 'test'):
        with open('{}.prototxt'.format(split_name), 'w') as proto_file:
            proto_file.write(str(fcn(split_name, tops)))

# Script entry point: write both prototxt files when this module is run directly.
if __name__ == '__main__':
    make_net()
'''
make_net() 函数会生成两个配置文件:
trainval.prototxt:用于训练和验证数据集。
test.prototxt:用于测试数据集
'''

训练和测试网络

  • 训练网络
    • 一旦生成了 trainval.prototxt,你可以使用Caffe命令行工具来启动训练。训练时还需要提供一个Solver文件(.prototxt格式),其中定义了优化器参数、学习率、训练时长等。Caffe的训练入口通常是类似于 caffe train -solver solver.prototxt 的命令。
  • 测试网络
    • test.prototxt 文件则用于测试阶段,Caffe可以加载预训练的模型权重,并对测试集进行评估。测试时使用类似于 caffe test -model test.prototxt -weights model.caffemodel 的命令。
  • nyud_layers.py:这个Python代码定义了一个名为 NYUDSegDataLayer 的Caffe数据层,它从NYUDv2数据集加载数据,用于RGB-D图像的语义分割任务。这种数据层用于在训练和测试过程中向神经网络提供数据。以下是代码的详细解释
    • import caffe
      
      import numpy as np
      from PIL import Image
      import scipy.io
      
      import random
      
      import pdb#这行师兄代码没有
      class NYUDSegDataLayer(caffe.Layer):
          """
          Load (input image, label image) pairs from NYUDv2
          one-at-a-time while reshaping the net to preserve dimensions.
      
          The labels follow the 40 class task defined by
      
              S. Gupta, R. Girshick, p. Arbelaez, and J. Malik. Learning rich features
              from RGB-D images for object detection and segmentation. ECCV 2014.
      
          with 0 as the void label and 1-40 the classes.
      
          Use this to feed data to a fully convolutional network.
          """
      
          def setup(self, bottom, top):
              """
              Setup data layer according to parameters:

              - nyud_dir: path to NYUDv2 dir
              - split: train / val / test
              - tops: list of tops to output from {color, depth, hha, label}
              - randomize: load in random order (default: True)
              - seed: seed for randomization (default: None / current time)

              for NYUDv2 semantic segmentation.

              example: params = dict(nyud_dir="/path/to/NYUDVOC2011", split="val",
                                     tops=['color', 'hha', 'label'])

              Raises Exception when the number of tops does not match the
              configured outputs or when a bottom is defined.
              """
              # config
              # NOTE(review): eval() executes whatever expression param_str
              # contains; it is only acceptable because param_str is written by
              # our own net-generation script. Do not feed it untrusted text.
              params = eval(self.param_str)
              self.nyud_dir = params['nyud_dir']
              self.split = params['split']
              self.tops = params['tops']
              self.random = params.get('randomize', True)
              self.seed = params.get('seed', None)

              # store top data for reshape + forward
              self.data = {}

              # means
              # load_image reshapes every record to (400, 400, 1) and subtracts
              # this mean in place, so it must hold exactly one element; a
              # two-element mean cannot broadcast into a single-channel array.
              self.mean_bgr = np.array((0,), dtype=np.float32)
              self.mean_hha = np.array((132.431, 94.076, 118.477), dtype=np.float32)
              self.mean_logd = np.array((7.844,), dtype=np.float32)
              self.class_map = scipy.io.loadmat('{}/data/benchmarkData/metadata/classMapping40.mat'.format(self.nyud_dir))['mapClass'].astype(np.uint8)

              # tops: check configuration
              if len(top) != len(self.tops):
                  raise Exception("Need to define {} tops for all outputs.".format(len(self.tops)))
              # data layers have no bottoms
              if len(bottom) != 0:
                  raise Exception("Do not define a bottom.")

              # load indices for images and labels
              # (one index per line of <nyud_dir>/<split>.txt, e.g. trainval.txt / test.txt)
              split_f  = '{}/{}.txt'.format(self.nyud_dir, self.split)
              with open(split_f, 'r') as index_file:
                  self.indices = index_file.read().splitlines()
              self.idx = 0

              # make eval deterministic
              if 'train' not in self.split:
                  self.random = False

              # randomization: seed and pick
              if self.random:
                  random.seed(self.seed)
                  self.idx = random.randint(0, len(self.indices)-1)
      
          def reshape(self, bottom, top):
              """Load the current sample for every configured top and size the tops to match.

              The loaded arrays are cached in ``self.data`` so that ``forward``
              can copy them out; each top is reshaped to (1, *array.shape),
              where the leading 1 is the batch dimension.
              """
              for position, top_name in enumerate(self.tops):
                  loaded = self.load(top_name, self.indices[self.idx])
                  self.data[top_name] = loaded
                  top[position].reshape(1, *loaded.shape)
      
          def forward(self, bottom, top):
              """Copy the cached sample into the tops, then choose the next sample index."""
              # assign output
              for position in range(len(self.tops)):
                  top[position].data[...] = self.data[self.tops[position]]

              # pick next input: random draw, or sequential with wrap-around
              if self.random:
                  self.idx = random.randint(0, len(self.indices)-1)
                  return
              self.idx += 1
              if self.idx == len(self.indices):
                  self.idx = 0
      
          def backward(self, top, propagate_down, bottom):
              """No-op: this data layer has no bottoms to propagate gradients to."""
              return None
      
          def load(self, top, idx):
              """Dispatch to the loader for the requested output type.

              ``top`` is one of 'color', 'label', 'depth' or 'hha'; any other
              value raises Exception. ``idx`` is the sample index string.
              """
              if top == 'color':
                  return self.load_image(idx)
              if top == 'label':
                  return self.load_label(idx)
              if top == 'depth':
                  return self.load_depth(idx)
              if top == 'hha':
                  return self.load_hha(idx)
              raise Exception("Unknown output type: {}".format(top))
      #输入数据预处理
          def load_image(self, idx):
              """
              Load one input record from a MATLAB .mat file and preprocess it for Caffe:
              (author's note on the data: 29*nx*nt)
              - read variable 'pao' from data/images/ISO_rot_UZ<idx>.mat, cast to float
              - scale by 1e+15
              - reshape to 400 x 400 x 1
              - reverse the channel axis (inherited from the RGB -> BGR step;
                a no-op for a single channel)
              - subtract mean
              - transpose to channel x height x width order
              """
              # Input source per experiment variant (author's notes). Only the
              # "8s" variant is active below; switch the loadmat line and the
              # scale factor to use another one.
              #   8s:        data/images/ISO_rot_UZ{idx}.mat, key 'pao', scaled by 1e+15
              #              (the data set does contain 29 shot records)
              #   8s-color:  data/images/UX{idx}.mat, key 'iso2', no scaling
              #   16s-color: data/images/UZ{idx}.mat, key 'pao', scaled by 1e+10
              #   32s-color: data/images/UX{idx}.mat, key 'iso2', no scaling
              # In every variant the path is built from self.nyud_dir and idx, and
              # the named key selects the array stored inside the .mat file.
              im = scipy.io.loadmat('{}/data/images/ISO_rot_UZ{}.mat'.format(self.nyud_dir,idx))['pao'].astype(np.float32)
              im = im*1e+15
              im = np.reshape(im,(400,400,1))
              in_ = np.array(im, dtype=np.float32)
              in_ = in_[:,:,::-1]
              # NOTE(review): in-place subtraction, so self.mean_bgr must broadcast
              # against a single trailing channel (one element or a scalar).
              in_ -= self.mean_bgr
              in_ = in_.transpose((2,0,1))
              return in_
      
          def load_label(self, idx):
              """
              Load label image as 1 x height x width integer array of label indices.

              Reads variable 'iso2' from segmentation/LUX<idx>.mat as uint8, remaps
              every pixel through self.class_map, then subtracts 1. The subtraction
              is done in uint8, so a remapped value of 0 wraps to 255, which the
              loss layer in net.py is configured to ignore (ignore_label=255).
              The leading singleton dimension is required by the loss.
              """
              # Label source per experiment variant (author's notes). Only the
              # "8s-color" loadmat line is active below.
              #   8s:        segmentation/zrc.mat, key 'Yita', as np.float32, then
              #              label = label*1e+4. This is the two-layer medium
              #              characteristic-scale parameter model (upper layer
              #              0.0005202, lower layer 0.002032). The 8s variant does
              #              not run the class_map remapping loop.
              #   8s-color:  segmentation/LUX{idx}.mat, key 'iso2', as np.uint8,
              #              followed by label = label.astype(np.float32)
              #   16s-color: segmentation/label.mat, key 'double_yita', as np.uint8
              #   32s-color: same as 8s-color
              label = scipy.io.loadmat('{}/segmentation/LUX{}.mat'.format(self.nyud_dir, idx))['iso2'].astype(np.uint8)
              # remap raw values through the class mapping table (not used by the 8s variant)
              for (x,y), value in np.ndenumerate(label):
                  label[x,y] = self.class_map[0][value-1]
              label = label.astype(np.uint8)
              label -= 1  # rotate labels
              label = label[np.newaxis, ...]
              # pdb.set_trace()
              return label
            
      # 16 32 把下面两段代码都注释了
          def load_depth(self, idx):
              """
              Load the depth PNG for sample ``idx`` and return it as a
              1 x height x width float32 array of log-depth minus self.mean_logd.
              """
              depth_png = Image.open('{}/data/depth/img_{}.png'.format(self.nyud_dir, idx))
              log_depth = np.log(np.array(depth_png, dtype=np.float32))
              log_depth -= self.mean_logd
              return log_depth[np.newaxis, ...]
      
          def load_hha(self, idx):
              """
              Load the HHA feature image (Gupta et al. ECCV14) for sample ``idx``,
              subtract self.mean_hha and return it in channel x height x width order.
              See https://github.com/s-gupta/rcnn-depth/blob/master/rcnn/saveHHA.m
              """
              hha_png = Image.open('{}/data/hha/img_{}.png'.format(self.nyud_dir, idx))
              features = np.array(hha_png, dtype=np.float32)
              features -= self.mean_hha
              return features.transpose((2,0,1))

      score.py:这段代码用于在语义分割任务中评估模型的性能。它包含了计算混淆矩阵(Confusion Matrix)、评估模型在测试集上的性能指标(如总体精度、平均精度、平均交并比等),并在指定的目录中保存预测的分割结果图像,主要是调用脚本中的 seg_tests 函数来评估模型:

    • seg_tests 函数中,接收 solver 对象意味着这个函数将利用 solver 来获取测试网络,并在指定的迭代次数上执行测试

    • 在Caffe框架中,solver 对象是核心组件之一,用于控制模型的训练和测试过程。solver 对象包含网络结构、优化算法、学习率等信息,并负责执行前向传播、反向传播以及参数更新等任务。

    • from __future__ import division
      import caffe
      import numpy as np
      import os
      import sys
      from datetime import datetime
      from PIL import Image
      #计算混淆矩阵(Confusion Matrix),用来衡量预测结果与真实标签之间的匹配情况。
      def fast_hist(a, b, n):
          """Return the n x n confusion matrix of labels ``a`` against predictions ``b``.

          Entries of ``a`` outside [0, n) are skipped. Cell (i, j) counts the
          positions where ``a`` equals i and ``b`` equals j.
          """
          in_range = (a >= 0) & (a < n)
          # Encode each (label, prediction) pair as one flat index, then count.
          flat_cells = n * a[in_range].astype(int) + b[in_range]
          counts = np.bincount(flat_cells, minlength=n**2)
          return counts.reshape(n, n)
      
      def compute_hist(net, save_dir, dataset, layer='score', gt='label'):
          """Run the net once per dataset entry and accumulate a confusion matrix.

          Args:
              net: Caffe net exposing the ``layer``, ``gt`` and 'loss' blobs.
              save_dir: directory for the predicted label images; a falsy value
                  disables saving. The directory is created when missing.
              dataset: sequence of sample names; one forward pass is run per
                  entry and the name is used as the PNG file stem.
              layer: blob holding the per-class scores.
              gt: blob holding the ground-truth labels.

          Returns:
              Tuple ``(hist, mean_loss)``: the summed confusion matrix and the
              'loss' blob value averaged over ``len(dataset)``.

          Raises:
              ZeroDivisionError: if ``dataset`` is empty.
          """
          n_cl = net.blobs[layer].channels
          # os.mkdir would fail on an existing directory or a missing parent.
          if save_dir and not os.path.isdir(save_dir):
              os.makedirs(save_dir)
          hist = np.zeros((n_cl, n_cl))
          loss = 0
          for idx in dataset:
              net.forward()
              # per-pixel predicted class, shared by the histogram and the saved image
              prediction = net.blobs[layer].data[0].argmax(0)
              hist += fast_hist(net.blobs[gt].data[0, 0].flatten(),
                                      prediction.flatten(),
                                      n_cl)

              if save_dir:
                  im = Image.fromarray(prediction.astype(np.uint8), mode='P')
                  im.save(os.path.join(save_dir, idx + '.png'))
              # compute the loss as well
              loss += net.blobs['loss'].data.flat[0]
          return hist, loss / len(dataset)
      
      def seg_tests(solver, save_format, dataset, layer='score', gt='label'):
          print '>>>', datetime.now(), 'Begin seg tests'
          solver.test_nets[0].share_with(solver.net)
          do_seg_tests(solver.test_nets[0], solver.iter, save_format, dataset, layer, gt)
      
      def do_seg_tests(net, iter, save_format, dataset, layer='score', gt='label'):
          n_cl = net.blobs[layer].channels
          if save_format:
              save_format = save_format.format(iter)
          hist, loss = compute_hist(net, save_format, dataset, layer, gt)
          # mean loss
          print '>>>', datetime.now(), 'Iteration', iter, 'loss', loss
          # overall accuracy
          acc = np.diag(hist).sum() / hist.sum()
          print '>>>', datetime.now(), 'Iteration', iter, 'overall accuracy', acc
          # per-class accuracy
          acc = np.diag(hist) / hist.sum(1)
          print '>>>', datetime.now(), 'Iteration', iter, 'mean accuracy', np.nanmean(acc)
          # per-class IU
          iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
          print '>>>', datetime.now(), 'Iteration', iter, 'mean IU', np.nanmean(iu)
          freq = hist.sum(1) / hist.sum()
          print '>>>', datetime.now(), 'Iteration', iter, 'fwavacc', \
                  (freq[freq > 0] * iu[freq > 0]).sum()
          return hist

    • solver.prototxt 是一个配置文件,定义了solver对象的参数和行为。
    • solver 对象是在Caffe运行时由solver.prototxt文件生成的Python对象,它管理和控制训练、测试等过程

solve.py 是进行网络训练的入口文件

用于训练和评估深度学习模型的

  • caffe.SGDSolver:

    • SGDSolver 是 Caffe 中的一个类,用于实现基于随机梯度下降法(Stochastic Gradient Descent, SGD)的求解器。
    • 它继承自 Caffe 的 Solver 基类,专门用于通过 SGD 方式进行网络权重的优化。
  • solver.prototxt:

    • 这是一个配置文件,定义了训练过程的各种参数,比如学习率(learning rate)、优化器的参数、模型保存路径、训练迭代次数等。
    • solver.prototxt 还指定了要使用的网络模型(通常通过 net: "train_val.prototxt" 指定)。
  • 代码功能:

    • solver = caffe.SGDSolver('solver.prototxt') 这行代码通过读取 solver.prototxt 配置文件,创建了一个 solver 对象。
    • 这个对象可以用来执行训练过程,例如通过 solver.step(n) 来运行 n 次迭代,或者直接调用 solver.solve() 来运行整个训练过程
      import caffe
      import surgery, score
      
      import numpy as np
      import os
      import sys
      #caffe: Caffe 框架的核心模块。
      #surgery: 自定义模块,用于处理模型的插值等。
      #score: 自定义模块,用于评估模型性能。
      #numpy: 用于处理数组数据。
      #os, sys: 用于处理文件和系统相关操作。
      
      # Optional: name the process after the current working directory so it is
      # easy to spot in process listings. Any failure here is ignored.
      try:
          import setproctitle
          setproctitle.setproctitle(os.path.basename(os.getcwd()))
      except:
          pass
      # Weights file loaded into the training net before training starts.
      weights = '../nyud-fcn16s-color/snapshot/train_iter_170000.caffemodel'
      #vgg_weights = '../ilsvrc-nets/VGG_ILSVRC_16_layers.caffemodel'
      #vgg_proto = '../ilsvrc-nets/VGG_ILSVRC_16_layers_deploy.prototxt'

      # init: run on GPU 0 (the commented lines select the device from argv / CPU mode)
      #caffe.set_device(int(sys.argv[1]))
      caffe.set_device(0)
      caffe.set_mode_gpu()
      #caffe.set_mode_cpu()

      # Build the SGD solver from solver.prototxt and initialise its net from `weights`.
      solver = caffe.SGDSolver('solver.prototxt')
      solver.net.copy_from(weights)
      # Alternative initialisation (disabled): transplant weights from a VGG-16 net.
      #solver = caffe.SGDSolver('solver.prototxt')
      #vgg_net = caffe.Net(vgg_proto, vgg_weights, caffe.TRAIN)
      #surgery.transplant(solver.net, vgg_net)
      #del vgg_net
      '''
      solver = caffe.SGDSolver('solver.prototxt'): 创建一个 SGDSolver 对象,用于训练模型。solver.prototxt 定义了训练的参数和网络结构。
      solver.net.copy_from(weights): 从指定的预训练模型权重文件加载权重到网络中。
      注释掉的部分显示了如果需要迁移学习,可以加载 VGG 网络并将其权重转移到当前网络
      '''
      # surgeries
      # Collect every parameterised layer whose name contains 'up' and hand them
      # to surgery.interp. NOTE(review): surgery.py is not shown here; presumably
      # it fills these upsampling layers with interpolation weights - confirm.
      interp_layers = [k for k in solver.net.params.keys() if 'up' in k]
      surgery.interp(solver.net, interp_layers)

      # scoring
      # Read the test-set sample names/indices from test.txt.
      test = np.loadtxt('../data/nyud/test.txt', dtype=str)
      # Alternate 50 times between 5000 solver steps and one evaluation on the
      # test set. save_format is False, so no prediction images are written.
      for _ in range(50):
          solver.step(5000)
          score.seg_tests(solver, False, test, layer='score')

    • 模型权重:训练完成后,模型的权重会被更新,新的权重会保存在 solver.snapshot_prefix 指定的位置。

    • 总的来说,执行这个 Python 文件将启动训练过程,并定期对模型进行评估,生成相关的性能指标输出和预测结果图像

    • solver.prototxt 配置文件设置了一个固定学习率的训练过程,主要是针对一个已经确定好的模型进行长时间的训练,同时保存模型快照以便在未来使用。显示和测试设置表明它主要侧重于训练过程,而不是频繁的测试和验证

      train_net: "trainval.prototxt"
      #指定用于训练的网络结构文件。trainval.prototxt 包含了模型的架构、数据层以及其他层的定义。这个文件被用来初始化训练过程
      test_net: "test.prototxt"
      #指定用于测试的网络结构文件。test.prototxt 通常包含与训练网络相同的架构,但可能会移除一些与训练无关的层,如 dropout 层。
      test_iter: 200
      #每次测试时使用的迭代次数。假设每个 test_iter 处理一个 batch(批次)的测试样本,那么这里会用 200 个 batch 进行测试。也就是说,测试集的总样本数应该是 200 * batch_size。
      # make test net, but don't invoke it from the solver itself
      test_interval: 999999999
      #每隔多少次迭代进行一次测试 过大表示不进行测试
      display: 20
      #隔 20 次迭代打印一次训练状态,比如当前的损失值
      average_loss: 20
      #损失值是基于前 20 次迭代的损失进行平均的。
      lr_policy: "fixed"
      #学习率固定
      # lr for unnormalized softmax
      base_lr: 1e-10#14变成10
      #基础学习率,不能太大。指定训练过程中使用的学习率。在此例中,学习率被设置为一个非常小的值 1e-10(由原来的 1e-14 调整而来)。学习率太低可能会导致模型训练非常缓慢,甚至停滞
      #最终学习率=基础学习率*每层学习率
      # high momentum
      momentum: 0.99
      #用于加速 SGD 优化算法中的梯度下降过程。动量值接近 1 表示历史梯度的权重较大,可以平滑训练过程中的梯度更新。
      # no gradient accumulation
      iter_size: 1
      #iter_size=1 表示没有累积,梯度将在每个 batch 后更新
      max_iter: 150000
      #最大迭代次数。训练将在 150,000 次迭代后停止
      weight_decay: 0.0005
      # 权重衰减因子,常用于 L2 正则化,以防止模型过拟合。它通过在损失函数中添加权重平方和的惩罚项来实现
      snapshot: 2000
      #每隔 2000 次迭代保存一次模型快照(即保存模型的当前状态),以便在之后可以恢复训练或用于推理。
      snapshot_prefix: "snapshot/train"
      #保存快照文件的路径
      test_initialization: false
      #设为 false,则在训练开始时不会测试模型

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值