Depthwise Conv 的量化计算公式和 Conv 一样,此处不再赘述,我们直接看 Data Format 以及计算优化方法。
Data Format:
Feature Map:
Depthwise Conv 的Feature Map也是采用D32 Format,即:
N x H x C32 x W4 x 4 x 32 → N x H x C32 x PADDED_WIDTH x 32
Weights :
为了适配 D32 Format,Depthwise Conv 的 Weights 对原始的数据排布 HWCN 进行了变换,变换后的数据排布和示意图如下:
C32 x H x W4 x 32 x 4
W4 = (filt_width + 3) >> 2
C32 = (filt_channel + 31) >> 5
// weight for depthwise conv:
// |---------------H * W4 * (32 * 4)--------------|
// ================================================
// =                                              =
// =                                              =
// =                                              = C32
// =                                              =
// =                                              =
// ================================================
Depthwise Conv 优化示例:
depthwiseconv3x3s1d32:
和 Conv 一样,depthwiseconv3x3 也是滑窗计算。但由于 Depthwise Conv 属于二维卷积(各通道独立计算,不在通道维度上累加),
沿用 Conv 优化中向量与标量相乘的 vrmpy 方式显然不太合适,所以需要对 Input 或者 Weight 进行适当的重排。
Hexagon 提供了 vshuff 指令,这条指令可以将 Input Vector 从 4(Width) x 32(Channel) 转置成 32(Channel) x 4(Width),
使 Input 和 Weights 保持相同的排布(每 4 个连续字节对应同一通道上的 4 个 Width 位置),这样就可以对向量和向量进行 vrmpy 操作。
当然,vshuff 的功能远不止于此,读者可以参考 Hexagon Document Bundle。
下面我们来看 depthwiseconv3x3s1_callback 函数的实现,源码如下:
static void depthwiseconv3x3s1_callback(void *data) {
conv_callback_t *dptr = (conv_callback_t *)data;
uint64_t L2FETCH_INPUT_REGISTER = (1ULL << 48) | ((uint64_t)dptr->pre_padded_in_width_depth << 32) |
((uint64_t)((dptr->pre_padded_in_width_depth + 127) & 0xffffff80) << 16) | 3ULL;
uint32_t thread_id = dspCV_atomic_inc_return((unsigned int *)(&(dptr->job_count))) - 1;
uint32_t start_height = thread_id * dptr->rows_per_job;
uint32_t end_height = MIN((thread_id + 1) * dptr->rows_per_job, dptr->top_h);
uint8_t *input = dptr->input + (start_height - dptr->pad_top) * dptr->pre_padded_in_width_depth;//* dptr->stride_h
uint8_t *vtcm_input = dptr->vtcm_input + thread_id * dptr->next_in_width_depth * dptr->vtcm_height;
uint8_t *output = dptr->output + start_height * dptr->next_out_width_depth;
if (thread_id == 0) {
L2FETCH(input + dptr->pad_top * dptr->pre_padded_in_width_depth, L2FETCH_INPUT_REGISTER);
} else {
L2FETCH(input, L2FETCH_INPUT_REGISTER);
}
const uint32_t pre_pad_w4 = dptr->pre_padded_in_width >> 2;
const uint32_t next_pad_w4 = dptr->padded_in_width >> 2;
int remain = dptr->padded_in_width - dptr->bottom_w - dptr->pad_width;
int vtcm_pad_right = dptr->padded_in_width - dptr->bottom_w - dptr->pad_left;
HVX_VectorPred Qsplit0 = Q6_Q_vsetq_R((3 - remain) * 32);
HVX_Vector input_zero_vec = Q6_Vb_vsplat_R(dptr->quant_info->bottom_zp);
if (end_height == dptr->top_h) {
end_height -= 1;
}
int h = 0;
if (start_height == 0) {
// pad top with input_zero
vmemset_asm(vtcm_input + h * dptr->next_in_width_d32, (uint8_t)dptr->quant_info->bottom_zp, dptr->next_in_width_depth);
h += dptr->out_d32;
}
// cp first two lines if start_height == 0
for (; h < dptr->vtcm_height * dptr->out_d32; h++) {
unsigned char *p_vtcm_input = vtcm_input + h * dptr->next_in_width_d32;
unsigned char *p_input = input + h * dptr->pre_padded_in_width_d32;
HVX_Vector *vtcm_input_vec = (HVX_Vector *)(p_vtcm_input);
HVX_Vector *input_vec = (HVX_Vector *)(p_input);
HVX_Vector first_vec = *input_vec++;
HVX_Vector second_vec = *input_vec++;
*vtcm_input_vec++ = Q6_V_valign_VVR(first_vec, input_zero_vec, 96);
for (int w = 1; w < next_pad_w4; w++) {
*vtcm_input_vec++ = Q6_V_valign_VVR(second_vec, first_vec, 96);
first_vec = second_vec;
second_vec = *input_vec++;
}
vtcm_input_vec--;
HVX_Vector last_vec = *vtcm_input_vec;
*vtcm_input_vec = Q6_V_vmux_QVV(Qsplit0, last_vec, input_zero_vec);
}
depthwiseconv3x3s1_asm(vtcm_input, output, data);
input += dptr->pre_padded_in_width_depth;
output += dptr->next_out_width_depth;
L2FETCH(input, L2FETCH_INPUT_REGISTER);
start_height++;
for (; start_height < end_height; start_height++) {
for (h = 0; h < dptr->vtcm_height * dptr->out_d32; h++) {
unsigned char *p_vtcm_input = vtcm_input + h * dptr->next_in_width_d32;
unsigned char *p_input = input + h * dptr->pre_padded_in_width_d32;
HVX_Vector *vtcm_input_vec = (HVX_Vector *)(p_vtcm_input);
HVX_Vector *input_vec = (HVX_Vector *)(p_input);
HVX_Vector first_vec = *input_vec++;
HVX_Vector second_vec = *input_vec++;
*vtcm_input_vec++ = Q6_V_valign_VVR(first_vec, input_zero_vec, 96);
for (int w = 1; w < next_pad_w4; w++) {
*vtcm_input_vec++ = Q6_V_valign_VVR(second_vec, first_vec, 96);
first_vec = second_vec;
second_vec = *input_vec++;
}
vtcm_input_vec--;
HVX_Vector last_vec = *vtcm_input_vec;
*vtcm_input_vec = Q6_V_vmux_QVV(Qsplit0, last_vec, input_zero_vec);
}
depthwiseconv3x3s1_asm(vtcm_input, output, data);
input += dptr->pre_padded_in_width_depth;//dptr->stride_h *
output += dptr->next_out_width_depth;
if (start_height < (end_height - 1)) {
L2FETCH(input, L2FETCH_INPUT_REGISTER);
}
}