caffe下在已有的网络上进行微调

最新推荐文章于 2019-04-01 16:46:47 发布

Ychan_cc

最新推荐文章于 2019-04-01 16:46:47 发布

阅读量1k

点赞数

分类专栏： caffe

本文链接：https://blog.csdn.net/Ychan_cc/article/details/69569390

版权

caffe 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

本文根据Caffe官方教程，详细介绍了如何在预训练模型上进行微调。首先尝试仅对最后一层进行局部微调，然后通过设置learn_all参数进行全局微调，以提升模型性能。主要内容涉及模块导入、预训练网络配置、数据输入和输出的修改以及微调过程。

摘要由CSDN通过智能技术生成

本文是对caffe官网文章 Fine-tuning for Style Recognition 的学习整理。

内容包括两个部分：

１．　采用预训练好的网络，替换最后一层，并且只对最后一层的参数进行微调（局部微调），效果不太好

２．　开启learn_all参数，对整个网络进行全局微调（以第一步局部微调得到的权值作为初始权值）

１）　模块导入

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:cc
@file: Fine_tuning a pretrained network for style recognition.py
@time:2017/04/06
"""
# Root of the Caffe checkout; every data/model path below is built from it.
caffe_root = '/home/cc/caffe-master/'  # this file should be run from {caffe_root}/examples (otherwise change this line)

import sys
# Put the pycaffe package of this checkout first on the import path.
sys.path.insert(0, caffe_root + 'python')
import caffe

# GPU mode is left disabled; everything below runs on the CPU.
# caffe.set_device(0)
# caffe.set_mode_gpu()
caffe.set_mode_cpu()

import numpy as np
# The star import supplies plt, plot, xlabel and ylabel used further down.
from pylab import *
# %matplotlib inline
import tempfile

2) 定义图片格式转换函数

# Helper that turns a preprocessed network input back into a displayable image.
def deprocess_net_image(image):
    """Convert a Caffe-format image (CHW, BGR, mean-subtracted) for display.

    Returns a new HWC, RGB array of dtype uint8; the input array is not
    modified.
    """
    # Reverse the channel axis (BGR -> RGB), move it last (CHW -> HWC) and
    # take a private copy so the caller's array is left untouched.
    pixels = image[::-1].transpose(1, 2, 0).copy()

    # (Approximately) undo the mean subtraction, in place to keep the dtype.
    pixels += [123, 117, 104]

    # Clamp values to the displayable range [0, 255].
    np.clip(pixels, 0, 255, out=pixels)

    # Round to the nearest integer and cast (e.g. float32 -> uint8).
    return np.require(np.round(pixels), dtype=np.uint8)

３）　下载相关数据集

# 1) setup and dataset download
# Download just a small subset of the data for this exercise.
# (2000 of 80K images, 5 of 20 labels.)
# To download the entire dataset, set `full_dataset = True`.
full_dataset = False
if full_dataset:
    # -1 for both: the `NUM_STYLE_LABELS > 0` check further down is then
    # false, so the style label list is not truncated.
    NUM_STYLE_IMAGES = NUM_STYLE_LABELS = -1
else:
    NUM_STYLE_IMAGES = 2000
    NUM_STYLE_LABELS = 5

# This downloads the ilsvrc auxiliary data (mean file, etc),
# and a subset of 2000 images for the style recognition task.
# NOTE: the download commands below are commented out, so the files they
# would fetch must already exist on disk (the assert on `weights` checks one).
import os
os.chdir(caffe_root)  # run scripts from caffe root
# !data/ilsvrc12/get_ilsvrc_aux.sh
# !scripts/download_model_binary.py models/bvlc_reference_caffenet
# os.system("python /home/cc/caffe-master/examples/finetune_flickr_style/assemble_data.py \
#    --workers=-1  --seed=1701 \
#  --images=$NUM_STYLE_IMAGES  --label=$NUM_STYLE_LABELS")

# os.chdir('examples')

# 2) define weights
# Path of the pretrained CaffeNet weights; fail early if the file is missing.
import os
weights = os.path.join(caffe_root, 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
assert os.path.exists(weights)


# load the imageNet labels and 5 style labels
# Load ImageNet labels to imagenet_labels (one tab-delimited entry per line;
# exactly 1000 entries are expected).
imagenet_label_file = caffe_root + 'data/ilsvrc12/synset_words.txt'
imagenet_labels = list(np.loadtxt(imagenet_label_file, str, delimiter='\t'))
assert len(imagenet_labels) == 1000
print 'Loaded ImageNet labels:\n', '\n'.join(imagenet_labels[:10] + ['...'])

# Load style labels to style_labels (one label per line), keeping only the
# first NUM_STYLE_LABELS entries when a positive limit is set.
style_label_file = caffe_root + 'examples/finetune_flickr_style/style_names.txt'
style_labels = list(np.loadtxt(style_label_file, str, delimiter='\n'))
if NUM_STYLE_LABELS > 0:
    style_labels = style_labels[:NUM_STYLE_LABELS]
print '\nLoaded style labels:\n', ',　'.join(style_labels)

４）产生预训练网络的.prototxt配置文件

#define and running the nets
from caffe import layers as L
from caffe import params as P

# Learning settings for a layer's two parameter entries, in the order
# [weight_param, bias_param].
weight_param = {'lr_mult': 1, 'decay_mult': 1}
bias_param = {'lr_mult': 2, 'decay_mult': 0}
learned_param = [weight_param, bias_param]

# Learning-rate multiplier set to 0 for both entries; the list holds the same
# dict object twice.
frozen_param = [{'lr_mult': 0}] * 2

# bottom: the input (bottom) of the convolution
# nout: number of output kernels
# param: per-parameter learning settings (decides whether weights are updated)
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
              param=learned_param,
              weight_filler=dict(type='gaussian', std=0.01),
              bias_filler=dict(type='constant', value=0.1)):
    """Build a Convolution layer followed by an in-place ReLU.

    Bundles default fillers and learning settings so callers only pass what
    differs. Returns the tuple (convolution, relu).
    """
    conv_kwargs = dict(kernel_size=ks, stride=stride,
                       num_output=nout, pad=pad, group=group,
                       param=param, weight_filler=weight_filler,
                       bias_filler=bias_filler)
    conv_layer = L.Convolution(bottom, **conv_kwargs)
    relu_layer = L.ReLU(conv_layer, in_place=True)
    return conv_layer, relu_layer

def fc_relu(bottom, nout, param=learned_param,
            weight_filler=dict(type='gaussian', std=0.005),
            bias_filler=dict(type='constant', value=0.1)):
    """Build an InnerProduct layer with `nout` outputs followed by an in-place ReLU.

    Returns the tuple (inner_product, relu).
    """
    fc_kwargs = dict(num_output=nout, param=param,
                     weight_filler=weight_filler,
                     bias_filler=bias_filler)
    fc_layer = L.InnerProduct(bottom, **fc_kwargs)
    relu_layer = L.ReLU(fc_layer, in_place=True)
    return fc_layer, relu_layer

def max_pool(bottom, ks, stride=1):
    """Return a MAX Pooling layer over `bottom` with kernel size `ks` and the given stride."""
    pooling_kwargs = dict(pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
    return L.Pooling(bottom, **pooling_kwargs)


# data: input top, defined by the caller
# train: True inserts the dropout layers; False adds a softmax `probs` output
# label: optional label top; when given, loss and accuracy layers are added
# learn_all: True uses learned_param for every layer, False uses frozen_param
#            (the final classifier always uses learned_param)
# classifier_name: name given to the final InnerProduct (output) layer
def caffenet(data, label=None, train=True, num_classes=1000,
             classifier_name='fc8', learn_all=False):
    """Write a CaffeNet prototxt to a temporary file and return its filename.

    The data input is supplied by the caller, so the same architecture can be
    reused with different inputs and parameter choices. The temporary file is
    created with delete=False, so it stays on disk after this call.
    """
    n = caffe.NetSpec()
    n.data = data  # the data layer/top is defined outside this function
    param = learned_param if learn_all else frozen_param
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, param=param)
    n.pool1 = max_pool(n.relu1, 3, stride=2)
    n.norm1 = L.LRN(n.pool1, local_size=5, alpha=1e-4, beta=0.75)
    n.conv2, n.relu2 = conv_relu(n.norm1, 5, 256, pad=2, group=2, param=param)
    n.pool2 = max_pool(n.relu2, 3, stride=2)
    n.norm2 = L.LRN(n.pool2, local_size=5, alpha=1e-4, beta=0.75)
    n.conv3, n.relu3 = conv_relu(n.norm2, 3, 384, pad=1, param=param)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2, param=param)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2, param=param)
    n.pool5 = max_pool(n.relu5, 3, stride=2)
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096, param=param)
    if train:  # dropout is only inserted when building the training net
        n.drop6 = fc7input = L.Dropout(n.relu6, in_place=True)
    else:
        fc7input = n.relu6
    n.fc7, n.relu7 = fc_relu(fc7input, 4096, param=param)
    if train:
        n.drop7 = fc8input = L.Dropout(n.relu7, in_place=True)
    else:
        fc8input = n.relu7
    # always learn fc8 (param=learned_param)
    fc8 = L.InnerProduct(fc8input, num_output=num_classes, param=learned_param)
    # give fc8 the name specified by argument `classifier_name`
    setattr(n, classifier_name, fc8)
    if not train:  # outside training, add a softmax output layer
        n.probs = L.Softmax(fc8)
    if label is not None:
        n.label = label
        n.loss = L.SoftmaxWithLoss(fc8, n.label)
        n.acc = L.Accuracy(fc8, n.label)
    # Write the net to a temporary file and return its filename. Text mode is
    # requested explicitly: the default binary mode rejects `str` on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(str(n.to_proto()))
    return f.name


#  create a CaffeNet that takes unlabeled "dummy data" as input
dummy_data = L.DummyData(shape=dict(dim=[1, 3, 227, 227]))
# A DummyData layer supplies placeholder input of shape (num, channel, height, width).
# NOTE(review): the author states its output is all zeros here and that it is
# used for debugging — confirm against the Caffe DummyData documentation.

imagenet_net_filename = caffenet(data=dummy_data, train=False)
# Path of the generated ImageNet-classifier prototxt (a temporary file).

# Instantiate the network from the generated prototxt and the pretrained
# weights file, in TEST phase.
imagenet_net = caffe.Net(imagenet_net_filename, weights, caffe.TEST)

５）　在原先.prototxt基础上修改数据输入和最后输出，定义自己网络

# train: also enables mirroring in the input transform
# subset: selects which image list ('train' or 'test') is read
def style_net(train=True, learn_all=False, subset=None):
    """Build the Flickr-style network prototxt and return its filename.

    Creates an ImageData input layer for the chosen subset and passes its
    data/label tops to caffenet with a classifier named 'fc8_flickr'.
    """
    if subset is None:
        if train:
            subset = 'train'
        else:
            subset = 'test'
    # The source list file gives, per line, an image filename and its label.
    source = caffe_root + 'data/flickr_style/%s.txt' % subset
    mean_file = caffe_root + 'data/ilsvrc12/imagenet_mean.binaryproto'
    transform_param = dict(mirror=train, crop_size=227, mean_file=mean_file)
    tops = L.ImageData(
        transform_param=transform_param, source=source,
        batch_size=50, new_height=256, new_width=256, ntop=2)
    style_data, style_label = tops
    return caffenet(data=style_data, label=style_label, train=train,
                    num_classes=NUM_STYLE_LABELS,
                    classifier_name='fc8_flickr',
                    learn_all=learn_all)


# Instantiate the style network (TEST phase) from its generated prototxt and
# the pretrained weights file.
# NOTE(review): the classifier layer is named 'fc8_flickr', so presumably it
# receives no pretrained weights — confirm against Caffe's weight-loading rules.
print 'Caffe_Net_untrained_style_net'
untrained_style_net = caffe.Net(style_net(train=False, subset='train'),
                                weights, caffe.TEST)

# Run one forward pass.
print 'untrained_style_net.forward()'
untrained_style_net.forward()

# Copy out one batch of data and labels (the ImageData layer built by
# style_net uses batch_size=50).
style_data_batch = untrained_style_net.blobs['data'].data.copy()
style_label_batch = np.array(untrained_style_net.blobs['label'].data, dtype=np.int32)

# Take the first image of the batch and draw it.
image = untrained_style_net.blobs['data'].data[0,...]
plt.imshow(deprocess_net_image(image))
# plt.show()

６）　定义显示函数

# Display helper: run a loaded network forward on one image and print its
# top-k predictions.
# net: a network already loaded with caffe.Net (pretrained or fully trained)
def disp_preds(net, image, labels, k=5, name='ImageNet'):
    """Print the k most probable labels that `net` predicts for `image`.

    net: network exposing a 'data' blob and a 'probs' output.
    image: preprocessed image, written into slot 0 of the 'data' blob.
    labels: sequence mapping a class index to its display name.
    k: number of predictions to print.
    name: label-set name used in the printed header.
    """
    net.blobs['data'].data[0, ...] = image
    # Start the forward pass at 'conv1'.
    # NOTE(review): presumably this skips the data layer so the image written
    # above is not overwritten — confirm.
    probs = net.forward(start='conv1')['probs'][0]
    # Negate before sorting so indices come out from most to least probable.
    top_k = (-probs).argsort()[:k]
    # Single-argument print(...) prints the same on Python 2 and 3.
    print('top %d predicted %s labels =' % (k, name))
    print('\n'.join('\t(%d) %5.2f%% %s' % (i + 1, 100 * probs[p], labels[p])
                    for i, p in enumerate(top_k)))

def disp_imagenet_preds(net, image):
    """Print the top predictions of `net` for `image` using the ImageNet label names."""
    disp_preds(net, image, labels=imagenet_labels, name='ImageNet')

def disp_style_preds(net, image):
    """Print the top predictions of `net` for `image` using the style label names."""
    disp_preds(net, image, labels=style_labels, name='style')



batch_index = 8
# Pick one sample from the copied batch (a fixed index, not a random draw).
image = style_data_batch[batch_index]
plt.imshow(deprocess_net_image(image))
# plt.show()

# style_label_batch[batch_index] is the class index of the image;
# style_labels[...] maps that index to the label name.
print 'actual label =', style_labels[style_label_batch[batch_index]]


disp_imagenet_preds(imagenet_net, image)
# Predictions of the pretrained ImageNet network.

disp_style_preds(untrained_style_net, image)
# The last layer is freshly initialized and untrained; per the author the
# printed probabilities are therefore roughly uniform.

# Second-to-last layer: fc7 activations of the two nets must agree, because
# both nets were just run forward on the same image.
diff = untrained_style_net.blobs['fc7'].data[0] - imagenet_net.blobs['fc7'].data[0]
error = (diff ** 2).sum()
print error
assert error < 1e-8

# Delete untrained_style_net to save memory. (Hang on to imagenet_net as we'll use it again later.)
del untrained_style_net

７）　生成solver.prototxt函数

#  training and style classifier
from caffe.proto import caffe_pb2

# Builds the solver configuration used to train a network.
# Reference: the earlier pycaffe walkthrough "solving in python with lenet".
def solver(train_net_path, test_net_path=None, base_lr=0.001):
    """Write a solver.prototxt to a temporary file and return its filename.

    train_net_path: path of the training net prototxt.
    test_net_path: optional path of a test net prototxt; when given, periodic
        testing is configured as well.
    base_lr: initial learning rate.

    The temporary file is created with delete=False, so it stays on disk.
    """
    # Single-argument print(...) prints the same on Python 2 and 3.
    print('caffe_pb2.SolverParameter()')
    s = caffe_pb2.SolverParameter()

    # Specify locations of the train and (maybe) test networks.
    s.train_net = train_net_path
    if test_net_path is not None:
        s.test_net.append(test_net_path)
        s.test_interval = 1000  # Test after every 1000 training iterations.
        s.test_iter.append(100)  # Run 100 test iterations (batches) per test pass.

    # The number of iterations over which to average the gradient.
    # Effectively boosts the training batch size by the given factor, without
    # affecting memory utilization.
    s.iter_size = 1

    s.max_iter = 100000  # # of times to update the net (training iterations)

    # Solve using the stochastic gradient descent (SGD) algorithm.
    # Other choices include 'Adam' and 'RMSProp'.
    s.type = 'SGD'

    # Set the initial learning rate for SGD.
    s.base_lr = base_lr

    # Set `lr_policy` to define how the learning rate changes during training.
    # Here, we 'step' the learning rate by multiplying it by a factor `gamma`
    # every `stepsize` iterations.
    s.lr_policy = 'step'
    s.gamma = 0.1  # multiply the learning rate by 0.1 at each step
    s.stepsize = 20000  # drop the learning rate every 20K iterations

    # Set other SGD hyperparameters. Setting a non-zero `momentum` takes a
    # weighted average of the current gradient and previous gradients to make
    # learning more stable. L2 weight decay regularizes learning, to help prevent
    # the model from overfitting.
    s.momentum = 0.9
    s.weight_decay = 5e-4

    # Display the current training loss and accuracy every 1000 iterations.
    s.display = 1000

    # Snapshots are files used to store networks we've trained.  Here, we'll
    # snapshot every 10K iterations -- ten times during training.
    s.snapshot = 10000
    # Path prefix (directory plus file-name stem) of the snapshot files.
    s.snapshot_prefix = caffe_root + 'models/finetune_flickr_style/finetune_flickr_style'

    # Train on the CPU here; the GPU setting is kept for reference because
    # using the CPU to train large networks is very slow.
    # s.solver_mode = caffe_pb2.SolverParameter.GPU
    s.solver_mode = caffe_pb2.SolverParameter.CPU

    # Write the solver to a temporary file and return its filename. Text mode
    # is requested explicitly: the default binary mode rejects `str` on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(str(s))
    return f.name

８）　定义保存中间训练结果函数

def run_solvers(niter, solvers, disp_interval=10):
    """Run several solvers side by side so their results can be compared.

    Runs every solver for `niter` iterations, recording the loss and accuracy
    after each iteration, then saves each net's weights into a fresh
    temporary directory.

    niter: number of iterations to run.
    solvers: list of (name, solver) tuples.
    disp_interval: progress is printed every `disp_interval` iterations and
        on the last iteration.

    Returns (loss, acc, weights): `loss` and `acc` map each solver name to an
    array of length `niter`; `weights` maps each name to the saved
    .caffemodel path.
    """
    loss = {name: np.zeros(niter) for name, _ in solvers}
    acc = {name: np.zeros(niter) for name, _ in solvers}

    for it in range(niter):
        for name, s in solvers:
            s.step(1)  # run a single SGD step in Caffe
            loss[name][it] = s.net.blobs['loss'].data.copy()
            acc[name][it] = s.net.blobs['acc'].data.copy()
        if it % disp_interval == 0 or it + 1 == niter:
            loss_disp = '; '.join('%s: loss=%.3f, acc=%2d%%' %
                                  (n, loss[n][it], np.round(100 * acc[n][it]))
                                  for n, _ in solvers)
            # Single-argument print(...) prints the same on Python 2 and 3.
            print('%3d) %s' % (it, loss_disp))

    # Save the learned weights of every net.
    weight_dir = tempfile.mkdtemp()
    weights = {}
    for name, s in solvers:
        filename = 'weights.%s.caffemodel' % name
        weights[name] = os.path.join(weight_dir, filename)
        s.net.save(weights[name])
    return loss, acc, weights

９）　调用求解器进行训练和结果显示（不开启learn_all进行最后一层局部微调）

niter = 50  # number of iterations to train

# Reset style_solver as before.
# style_net(train=True) generates the training prototxt; learn_all keeps its
# default of False here, so only the last (classifier) layer is trained.
print 'solver'
style_solver_filename = solver(style_net(train=True))

# load solver
print 'caffe.get_solver'
style_solver = caffe.get_solver(style_solver_filename)
# load the pretrained weights
style_solver.net.copy_from(weights)

# For reference, we also create a solver that isn't initialized from
# the pretrained ImageNet weights.
scratch_style_solver_filename = solver(style_net(train=True))

scratch_style_solver = caffe.get_solver(scratch_style_solver_filename)

print 'Running solvers for %d iterations...' % niter
solvers = [('pretrained', style_solver),
           ('scratch', scratch_style_solver)]
# NOTE: this rebinds the module-level `weights` (until now the path of the
# pretrained caffemodel) to the dict of saved weight files from run_solvers.
loss, acc, weights = run_solvers(niter, solvers)
print 'Done.'

train_loss, scratch_train_loss = loss['pretrained'], loss['scratch']
train_acc, scratch_train_acc = acc['pretrained'], acc['scratch']
style_weights, scratch_style_weights = weights['pretrained'], weights['scratch']

# Delete solvers to save memory.
del style_solver, scratch_style_solver, solvers

# Stack the two length-n arrays into a 2 x n matrix and transpose it; plot()
# given a matrix draws one curve per column.
plot(np.vstack([train_loss, scratch_train_loss]).T)
xlabel('Iteration #')
ylabel('Loss')
plt.show()


plot(np.vstack([train_acc, scratch_train_acc]).T)
xlabel('Iteration #')
ylabel('Accuracy')
plt.show()

１０）　将训练好的网络用测试样本测试

# Evaluate trained weights on the test samples.
def eval_style_net(weights, test_iters=10):
    """Load `weights` into a test-phase style net and average its accuracy.

    weights: path of the weights file to load.
    test_iters: number of forward passes averaged; must be non-zero, since
        the accumulated accuracy is divided by it.

    Returns (test_net, accuracy).
    """
    test_net = caffe.Net(style_net(train=False), weights, caffe.TEST)
    accuracy = 0
    # `range` instead of the Python-2-only `xrange`; the iteration count is small.
    for _ in range(test_iters):
        accuracy += test_net.forward()['acc']
    accuracy /= test_iters
    return test_net, accuracy

# Report test accuracy for the two sets of weights saved by the first
# training run: the solver started from the pretrained ImageNet weights and
# the solver started without them.
test_net, accuracy = eval_style_net(style_weights)
print 'Accuracy, trained from ImageNet initialization: %3.1f%%' % (100*accuracy, )
scratch_test_net, scratch_accuracy = eval_style_net(scratch_style_weights)
print 'Accuracy, trained from   random initialization: %3.1f%%' % (100*scratch_accuracy, )

１１）　在原先基础上进行全局微调

# Fine-tune the whole network (learn_all=True).
# NOTE(review): the author remarks that the learning rate should be set
# smaller for this step, yet base_lr below is 0.001, the same as solver()'s
# default used in the first run — confirm which was intended.
end_to_end_net = style_net(train=True, learn_all=True)

# Set base_lr to 1e-3, the same as last time when learning only the classifier.
# You may want to play around with different values of this or other
# optimization parameters when fine-tuning.  For example, if learning diverges
# (e.g., the loss gets very large or goes to infinity/NaN), you should try
# decreasing base_lr (e.g., to 1e-4, then 1e-5, etc., until you find a value
# for which learning does not diverge).
base_lr = 0.001

# Start from the weights saved by the classifier-only run of the pretrained solver.
style_solver_filename = solver(end_to_end_net, base_lr=base_lr)
style_solver = caffe.get_solver(style_solver_filename)
style_solver.net.copy_from(style_weights)

# Same net definition, started from the weights saved by the "scratch" run.
scratch_style_solver_filename = solver(end_to_end_net, base_lr=base_lr)
scratch_style_solver = caffe.get_solver(scratch_style_solver_filename)
scratch_style_solver.net.copy_from(scratch_style_weights)

print 'Running solvers for %d iterations...' % niter
solvers = [('pretrained, end-to-end', style_solver),
           ('scratch, end-to-end', scratch_style_solver)]
# Only the saved weight paths are kept; the loss/accuracy curves are discarded.
_, _, finetuned_weights = run_solvers(niter, solvers)
print 'Done.'

style_weights_ft = finetuned_weights['pretrained, end-to-end']
scratch_style_weights_ft = finetuned_weights['scratch, end-to-end']

# Delete solvers to save memory.
del style_solver, scratch_style_solver, solvers



# Evaluate both fine-tuned weight files on the test split.
test_net, accuracy = eval_style_net(style_weights_ft)
print 'Accuracy, finetuned from ImageNet initialization: %3.1f%%' % (100*accuracy, )
scratch_test_net, scratch_accuracy = eval_style_net(scratch_style_weights_ft)
print 'Accuracy, finetuned from   random initialization: %3.1f%%' % (100*scratch_accuracy, )



# Show the image selected earlier and the fine-tuned net's style predictions for it.
plt.imshow(deprocess_net_image(image))
plt.show()
disp_style_preds(test_net, image)

# Take a second image, this time from the test net's current data batch.
batch_index = 1
image = test_net.blobs['data'].data[batch_index]
plt.imshow(deprocess_net_image(image))
print 'actual label =', style_labels[int(test_net.blobs['label'].data[batch_index])]


# Predictions of the fine-tuned net that started from the pretrained weights.
disp_style_preds(test_net, image)

# Predictions of the fine-tuned net that started from the "scratch" weights.
disp_style_preds(scratch_test_net, image)

# Predictions of the original ImageNet classifier, for comparison.
disp_imagenet_preds(imagenet_net, image)