概述
安装好Caffe后,这篇文章是我根据http://caffe.berkeleyvision.org/gathered/examples/mnist.html 进行学习的笔记。在Caffe上做模型训练,要按照四个步骤来执行:1、准备数据和进行数据转换。2、定义使用的神经网络。3、定义网络训练器的参数,即我们如何学习参数。4、进行训练和测试
准备数据
cd $CAFFE_ROOT
#从http://yann.lecun.com/exdb/mnist/网上下载四个文件,train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte,分别是训练集的数据/标签以及测试集的数据/标签
./data/mnist/get_mnist.sh
#调用convert_mnist_data.cpp程序把上面下载的数据转化成lmdb格式,放在examples/mnist/mnist_train_lmdb和examples/mnist/mnist_test_lmdb。原理上就是把数据一个个读进内存,每个数据/标签初始化为一个Datum对象,序列化为protobuf后经由DB接口(http://blog.csdn.net/acmwwy/article/details/52643387)写入数据库
./examples/mnist/create_mnist.sh
定义LeNet模型
# examples/mnist/lenet_train_test.prototxt
name: "LeNet"
# 训练数据层
layer {
name: "mnist"
type: "Data" #表示数据层
# 数据层没有输入只有输出,它输出data blob和label blob
top: "data" # 大小为64(batch_size)*1(channel)*28(height)*28(width)
top: "label" # 大小为64*1
include {
phase: TRAIN #表示是训练数据
}
transform_param {
scale: 0.00390625 # pixel * 1/256, 使得数据的值在[0, 1]
}
data_param {
source: "examples/mnist/mnist_train_lmdb"
batch_size: 64
backend: LMDB
}
}
# 测试数据层
layer {
name: "mnist"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST #表示是测试数据
}
transform_param {
scale: 0.00390625
}
data_param {
source: "examples/mnist/mnist_test_lmdb"
batch_size: 100 # 注意这里和训练网络的64不一样,所以测试网络的blob大小不一样
backend: LMDB
}
}
# 第一个卷积层
layer {
name: "conv1"
type: "Convolution"
bottom: "data" #输入是data blob, 大小为64(batch_size)*1(channel)*28(height)*28(width)
top: "conv1" #输出是卷积blob,大小为64(batch_size)*20(channel)*24(28-kernel_size+1)*24(28-kernel_size+1)
param {
lr_mult: 1 # 参数的学习率是lr_mult*solver定义的学习率 = solver给定学习率
}
param {
lr_mult: 2 # bias参数学习率=2*solver给定学习率
}
convolution_param {
num_output: 20 # 20 channels
kernel_size: 5 # 5*5的卷积核
stride: 1 # 每移动一个pixel做一次卷积
weight_filler { #定义参数的初始化策略
type: "xavier" #使用xavier算法自动根据输入和输出决定初始值的范围
}
bias_filler { #定义bias参数的初始化策略
type: "constant" #默认是0
}
}
}
# Pooling 层
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1" # 大小为64(batch_size)*20(channel)*24*24
top: "pool1" # 大小为64(batch_size)*20(channel)*12(24/kernel_size)*12(24/kernel_size)
pooling_param {
pool: MAX # max pooling, 取最大值
kernel_size: 2 # 2*2的Pooling核
stride: 2 # 每移动2个像素做一个pooling,注意水平垂直都要移动2个像素,所以相邻的pooling区域之间没有重叠
}
}
# 卷积层2
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1" # 大小为64(batch_size)*20(channel)*12*12
top: "conv2" # 大小为64(batch_size)*50(new channel size)*8(12-kernel_size+1)*8(12-kernel_size+1)
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 50 # 这次是50 channel
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Pooling层2
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2" # 大小为64(batch_size)*50(channel size)*8*8
top: "pool2" # 大小为64(batch_size)*50(channel size)*4(8/kernel_size)*4(8/kernel_size)
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# InnerProduct层,全连接层
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool2" # 大小为64(batch_size)*50(channel size)*4*4
top: "ip1" # 大小为64*500(500个节点)
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 500 # 输出blob有500个隐藏节点
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Relu层
layer {
name: "relu1"
type: "ReLU"
bottom: "ip1" #把bottom blob和top blob设置成相同的名字,做in-place计算,避免分配新的blob,节省内存
top: "ip1"
}
# 全连接层2
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1" # 大小为64*500
top: "ip2" # 大小为64*10
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10 # 输出类0-9
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Loss层
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2" # 大小为64*10
bottom: "label" # 这个是数据层的top blob label. 大小是64*1
top: "loss" # 计算loss值,大小是1
}
# 到此为止,所有的层已经描述完毕
# 准确度层
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2" # 根据ip2输出和TEST的label计算准确率
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
定义Solver
#/examples/mnist/lenet_solver.prototxt
# 指定神经网络使用哪一个文件
net: "examples/mnist/lenet_train_test.prototxt"
# 我们做多少次前向传播,因为我们一共有10000个测试用例,而Test网络的batchsize是100,所以我们只需要做100次前向传播。
test_iter: 100
# 每500个训练迭代,输出在测试集上的效果
test_interval: 500
# 基本的学习率
base_lr: 0.01
momentum: 0.9
weight_decay: 0.0005
# 使用什么样的学习策略
lr_policy: "inv"
gamma: 0.0001
power: 0.75
# 每100个迭代输出训练网络信息
display: 100
# 一共进行多少次训练迭代
max_iter: 10000
# 每5000次迭代进行存档
snapshot: 5000
snapshot_prefix: "examples/mnist/lenet"
solver_mode: GPU
进行训练和测试
cd $CAFFE_ROOT
./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
Log分析
建立训练网络和测试网络
I1003 16:28:14.755233 2719146944 solver.cpp:91] Creating training net from net file: examples/mnist/lenet_train_test.prototxt
# 建立不同的Layer
I1003 16:28:14.766196 2719146944 net.cpp:270] This network produces output loss # 输出Loss值
I1003 16:28:14.766206 2719146944 net.cpp:283] Network initialization done.
I1003 16:28:14.766470 2719146944 solver.cpp:181] Creating test net (#0) specified by net file: examples/mnist/lenet_train_test.prototxt
I1003 16:28:14.773440 2719146944 net.cpp:270] This network produces output accuracy #除了输出Loss还输出准确度。
I1003 16:28:14.773448 2719146944 net.cpp:270] This network produces output loss
I1003 16:28:14.773459 2719146944 net.cpp:283] Network initialization done.
I1003 16:28:14.773522 2719146944 solver.cpp:60] Solver scaffolding done.
开始训练
# 每100次训练输出训练网络信息
I1003 16:28:28.524513 2719146944 sgd_solver.cpp:106] Iteration 300, lr = 0.00978075 // 当前的学习率
I1003 16:28:32.037479 2719146944 solver.cpp:228] Iteration 400, loss = 0.0713889
I1003 16:28:32.037539 2719146944 solver.cpp:244] Train net output #0: loss = 0.0713891 (* 1 = 0.0713891 loss) // Loss值
# 每500次训练输出训练集和测试集的Loss和准确度信息。
I1003 16:28:35.514955 2719146944 solver.cpp:337] Iteration 500, Testing net (#0)
I1003 16:28:38.088145 2719146944 solver.cpp:404] Test net output #0: accuracy = 0.9733
I1003 16:28:38.088207 2719146944 solver.cpp:404] Test net output #1: loss = 0.0818712 (* 1 = 0.0818712 loss)
I1003 16:28:38.122164 2719146944 solver.cpp:228] Iteration 500, loss = 0.106188
I1003 16:28:38.122211 2719146944 solver.cpp:244] Train net output #0: loss = 0.106188 (* 1 = 0.106188 loss)
# 每5000次训练进行Snapshot
I1003 16:31:44.522483 2719146944 solver.cpp:454] Snapshotting to binary proto file examples/mnist/lenet_iter_5000.caffemodel
I1003 16:31:44.539623 2719146944 sgd_solver.cpp:273] Snapshotting solver state to binary proto file examples/mnist/lenet_iter_5000.solverstate
训练结果 (99%的准确度!)
I1003 16:35:16.854141 2719146944 solver.cpp:337] Iteration 10000, Testing net (#0)
I1003 16:35:19.574384 2719146944 solver.cpp:404] Test net output #0: accuracy = 0.9906
I1003 16:35:19.574434 2719146944 solver.cpp:404] Test net output #1: loss = 0.0285571 (* 1 = 0.0285571 loss)