Recently a project required me to look into deep learning, and I found that documentation is scarce and the pitfalls are many.
We needed to move models originally trained with Caffe over to Caffe2, because Caffe2 uses NEON, OpenMP, and OpenGL to greatly speed up inference on Android phones.
The first pitfall: Caffe2's MaxPool lacks the top_mask output, which the UpSample layer needs for upsampling. Fortunately, top_mask only records the index of the maximum in each max-pooling window, so the algorithm is not complicated; reading the PoolingLayer<Dtype>::Forward_cpu function in Caffe makes it clear.
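To make the idea concrete, here is a minimal sketch (assumed names, not the Caffe code itself) of what top_mask stores for one 2-D pooling window: the max value plus the flattened input index (h * width + w) where that max occurred.

#include <limits>

// Sketch only: scan one pooling window [hstart, hend) x [wstart, wend) of a
// single channel plane and report both the max value and its flattened index.
static void MaxPoolWindow(
    const float* input, int width,
    int hstart, int hend, int wstart, int wend,
    float* max_val, int* max_idx) {
  *max_val = std::numeric_limits<float>::lowest();
  *max_idx = -1;  // -1 means "nothing recorded"; this value goes into top_mask
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      const int idx = h * width + w;  // flattened index within the channel plane
      if (input[idx] > *max_val) {
        *max_val = input[idx];
        *max_idx = idx;
      }
    }
  }
}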
Here are the specific changes:
Step 1: Extend the number of outputs of MaxPool
OPERATOR_SCHEMA(MaxPool)
    .NumInputs(1)
    .NumOutputs(1, 2) // allow an optional second output for top_mask
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator(""));

REGISTER_CPU_OPERATOR(MaxPool1D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool1D)
    .NumInputs(1)
    .NumOutputs(1, 2) // allow an optional second output for top_mask
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("1D"));

REGISTER_CPU_OPERATOR(MaxPool2D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool2D)
    .NumInputs(1)
    .NumOutputs(1, 2) // allow an optional second output for top_mask
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("2D"));

REGISTER_CPU_OPERATOR(MaxPool3D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool3D)
    .NumInputs(1)
    .NumOutputs(1, 2) // allow an optional second output for top_mask
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("3D"));
Step 2: Modify the process function of the MaxPool class
Original code:
  static void process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
    }
  }
After the change:
  // Return whether y_data was updated, so callers can record the argmax position.
  static bool process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
      return true;
    }
    return false;
  }
The point of this change is that callers of process can now tell whether y_data was updated; that is what makes it possible to record the index of the maximum for each pooling window.
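One caveat (an assumption about how your copy of pool_op.cc is organized): the same PoolOp<T, Context, PoolType> template is also instantiated for AveragePool, so once the call sites in Step 3 wrap process in an if, AveragePool's process has to become bool-returning as well or the code stops compiling. A minimal sketch, mirroring the change above:

  static bool process(const T& x_data, T& y_data) {
    y_data += x_data;  // accumulate as before; the return value is ignored for average pooling
    return false;
  }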
Step 3: Modify the PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() function
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, X.dim32(1));
  // Part 1: use OutputSize() to decide whether the top_mask output is requested
  int outputSize = OutputSize();
  const bool use_top_mask = outputSize > 1;
  caffe2::Tensor<Context>* Ymask = nullptr;
  int* Ymask_data = nullptr;
  if (use_top_mask) {
    Ymask = Output(1);
    Ymask->Resize(Y->dims());
    Ymask_data = Ymask->template mutable_data<int>();
    // initialize the mask to -1 ("no index recorded yet")
    math::Set<int, Context>(Ymask->size(), -1, Ymask_data, &context_);
    LOG(ERROR) << "[pool_op] outputSize " << outputSize;
  }
  // (code omitted here)
  ...
  // We specialize certain variants on ARM for vectorization
  // Part 2: do not take the special-cased 2x2 kernel path when top_mask is requested
  if (!use_top_mask && kernel_.size() == 2 &&
      // (code omitted here)
      ...
  }
  switch (kernel_.size()) {
    case 1: // h * 1
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            T Yh = PoolType::initialize();
            // track the index of the max value inside this pooling window
            int index = -1;
            for (int h = hstart; h < hend; ++h) {
              if (PoolType::process(Xdata[h], Yh)) {
                index = h;
              }
            }
            PoolType::finalize(hend - hstart, Yh);
            Ydata[ph] = Yh;
            // write the max-value index of this window into the mask buffer
            if (use_top_mask) {
              Ymask_data[ph] = index;
            }
          }
          // Do offset.
          Xdata += height;
          Ydata += pooled_height;
          // advance the top_mask buffer pointer as well
          if (use_top_mask) {
            Ymask_data += pooled_height;
          }
        }
      }
      break;
    case 2: // w * h * 1
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) { // iterate over the pooled output height/width
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              const int pool_index = ph * pooled_width + pw;
              T Yh = PoolType::initialize();
              // track the index of the max value inside this pooling window
              int index = -1;
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int input_index = h * width + w;
                  if (PoolType::process(Xdata[input_index], Yh)) {
                    index = input_index;
                  }
                }
              }
              PoolType::finalize((hend - hstart) * (wend - wstart), Yh);
              Ydata[pool_index] = Yh;
              // write the max-value index of this window into the mask buffer
              if (use_top_mask) {
                Ymask_data[pool_index] = index;
              }
            }
          }
          // Do offset.
          Xdata += height * width;
          Ydata += pooled_height * pooled_width;
          // advance the top_mask buffer pointer as well
          if (use_top_mask) {
            Ymask_data += pooled_height * pooled_width;
          }
        }
      }
      break;
    case 3: // w * h * depth
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                const int pool_index =
                    ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                T Yh = PoolType::initialize();
                // track the index of the max value inside this pooling window
                int index = -1;
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    for (int d = dstart; d < dend; ++d) {
                      const int input_index = h * width * depth + w * depth + d;
                      if (PoolType::process(Xdata[input_index], Yh)) {
                        index = input_index;
                      }
                    }
                  }
                }
                PoolType::finalize(
                    (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
                Ydata[pool_index] = Yh;
                // write the max-value index of this window into the mask buffer
                if (use_top_mask) {
                  Ymask_data[pool_index] = index;
                }
              }
            }
          }
          // Do offset.
          Xdata += height * width * depth;
          Ydata += pooled_height * pooled_width * pooled_depth;
          // advance the top_mask buffer pointer as well
          if (use_top_mask) {
            Ymask_data += pooled_height * pooled_width * pooled_depth;
          }
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
      return false;
  }
  return true;
}
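For completeness, this is roughly how the recorded mask gets consumed on the upsampling side. The sketch below is not the actual UpSample operator (the function name and signature are made up for illustration); it only shows the scatter that the mask enables: each pooled value is written back to the input position stored in the mask, and every other position stays zero.

// Sketch only: scatter pooled values back to their original positions using
// the mask produced by MaxPool. "unpooled" must be pre-zeroed and sized
// height * width for the current channel plane.
static void UnpoolWithMask(
    const float* pooled,   // one value per pooled output position
    const int* mask,       // flattened input index of each max, or -1
    int pooled_size,       // pooled_height * pooled_width
    float* unpooled,
    int unpooled_size) {   // height * width
  for (int i = 0; i < pooled_size; ++i) {
    const int idx = mask[i];
    if (idx >= 0 && idx < unpooled_size) {
      unpooled[idx] = pooled[i];
    }
  }
}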