深度学习计算框架综述（十三）HVX 计算优化实践—Depthwise Conv 优化

最新推荐文章于 2022-07-24 14:59:13 发布

Kane技术漫谈

最新推荐文章于 2022-07-24 14:59:13 发布

阅读量1.4k

点赞数

分类专栏：深度学习计算框架综述 | 文章标签：深度学习、神经网络

本文链接：https://blog.csdn.net/u012542087/article/details/107284739

版权

本节主要介绍 Depthwise Conv 的 HVX 实现。

摘要由CSDN通过智能技术生成

Depthwise Conv 的量化计算公式和 Conv 一样，此处不再赘述，我们直接看 Data Format 以及计算优化方法。

Data Format：

Feature Map：

Depthwise Conv 的Feature Map也是采用D32 Format，即：

N x H x C32 x W4 x 4 x 32 → N x H x C32 x PADDED_WIDTH x 32

Weights ：

为了适配 D32 Format，Depthwise Conv 的 Weights 对原始的 HWCN 数据排布进行了变换，变换后的数据排布和示意图如下：

C32 x H x W4 x 32 x 4

W4 = (filt_width + 3) >> 2

C32 = (filt_channel + 31) >> 5

// weight for depthwise conv:
// |---------------H * W4 *  (32 * 4)--------------|
// =================================================
// =                                               =
// =                                               =
// =                                               =  C32
// =                                               =
// =                                               =
// =================================================

Depthwise Conv 优化示例：

depthwiseconv3x3s1d32：

和 Conv 一样，depthwiseconv3x3 也是滑窗计算。但由于 Depthwise Conv 属于二维卷积（每个通道独立计算，不跨通道累加），直接沿用 Conv 优化中向量与标量相乘的 vrmpy 方式显然不太合适，所以需要对 Input 或者 Weights 进行适当的重排。Hexagon 提供了 vshuff 指令，可以将 Input Vector 从 4(Width) x 32(Channel) 转置成 32(Channel) x 4(Width)，使 Input 和 Weights 保持相同的排布，这样就可以对向量和向量进行 vrmpy 操作。当然，vshuff 的功能远不止于此，读者可以参考 Hexagon Document Bundle。

下面我们来看 depthwiseconv3x3s1_callback 函数的实现，源码如下：

static void depthwiseconv3x3s1_callback(void *data) {
  conv_callback_t *dptr = (conv_callback_t *)data;
  uint64_t L2FETCH_INPUT_REGISTER = (1ULL << 48) | ((uint64_t)dptr->pre_padded_in_width_depth << 32) |
                                    ((uint64_t)((dptr->pre_padded_in_width_depth + 127) & 0xffffff80) << 16) | 3ULL;
  uint32_t thread_id = dspCV_atomic_inc_return((unsigned int *)(&(dptr->job_count))) - 1;
  uint32_t start_height = thread_id * dptr->rows_per_job;
  uint32_t end_height = MIN((thread_id + 1) * dptr->rows_per_job, dptr->top_h);
  uint8_t *input = dptr->input + (start_height - dptr->pad_top) * dptr->pre_padded_in_width_depth;//* dptr->stride_h
  uint8_t *vtcm_input = dptr->vtcm_input + thread_id * dptr->next_in_width_depth * dptr->vtcm_height;
  uint8_t *output = dptr->output + start_height * dptr->next_out_width_depth;
  if (thread_id == 0) {
    L2FETCH(input + dptr->pad_top * dptr->pre_padded_in_width_depth, L2FETCH_INPUT_REGISTER);
  } else {
    L2FETCH(input, L2FETCH_INPUT_REGISTER);
  }
  const uint32_t pre_pad_w4 = dptr->pre_padded_in_width >> 2;
  const uint32_t next_pad_w4 = dptr->padded_in_width >> 2;
  int remain = dptr->padded_in_width - dptr->bottom_w - dptr->pad_width;
  int vtcm_pad_right = dptr->padded_in_width - dptr->bottom_w - dptr->pad_left;
  HVX_VectorPred Qsplit0 = Q6_Q_vsetq_R((3 - remain) * 32);
  HVX_Vector input_zero_vec = Q6_Vb_vsplat_R(dptr->quant_info->bottom_zp);
  if (end_height == dptr->top_h) {
    end_height -= 1;
  }
 
  int h = 0;
  if (start_height == 0) {
    // pad top with input_zero
    vmemset_asm(vtcm_input + h * dptr->next_in_width_d32, (uint8_t)dptr->quant_info->bottom_zp, dptr->next_in_width_depth);
    h += dptr->out_d32;
  }
  // cp first two lines if start_height == 0
  for (; h < dptr->vtcm_height * dptr->out_d32; h++) {
    unsigned char *p_vtcm_input = vtcm_input + h * dptr->next_in_width_d32;
    unsigned char *p_input = input + h * dptr->pre_padded_in_width_d32;
    HVX_Vector *vtcm_input_vec = (HVX_Vector *)(p_vtcm_input);
    HVX_Vector *input_vec = (HVX_Vector *)(p_input);
    HVX_Vector first_vec = *input_vec++;
    HVX_Vector second_vec = *input_vec++;
    *vtcm_input_vec++ = Q6_V_valign_VVR(first_vec, input_zero_vec, 96);
    for (int w = 1; w < next_pad_w4; w++) {
      *vtcm_input_vec++ = Q6_V_valign_VVR(second_vec, first_vec, 96);
      first_vec = second_vec;
      second_vec = *input_vec++;
    }
    vtcm_input_vec--;
    HVX_Vector last_vec = *vtcm_input_vec;
    *vtcm_input_vec = Q6_V_vmux_QVV(Qsplit0, last_vec, input_zero_vec);
  }
  depthwiseconv3x3s1_asm(vtcm_input, output, data);
  input += dptr->pre_padded_in_width_depth;
  output += dptr->next_out_width_depth;
  L2FETCH(input, L2FETCH_INPUT_REGISTER);
  start_height++;
 
  for (; start_height < end_height; start_height++) {
    for (h = 0; h < dptr->vtcm_height * dptr->out_d32; h++) {
      unsigned char *p_vtcm_input = vtcm_input + h * dptr->next_in_width_d32;
      unsigned char *p_input = input + h * dptr->pre_padded_in_width_d32;
      HVX_Vector *vtcm_input_vec = (HVX_Vector *)(p_vtcm_input);
      HVX_Vector *input_vec = (HVX_Vector *)(p_input);
      HVX_Vector first_vec = *input_vec++;
      HVX_Vector second_vec = *input_vec++;
      *vtcm_input_vec++ = Q6_V_valign_VVR(first_vec, input_zero_vec, 96);
      for (int w = 1; w < next_pad_w4; w++) {
        *vtcm_input_vec++ = Q6_V_valign_VVR(second_vec, first_vec, 96);
        first_vec = second_vec;
        second_vec = *input_vec++;
      }
      vtcm_input_vec--;
      HVX_Vector last_vec = *vtcm_input_vec;
      *vtcm_input_vec = Q6_V_vmux_QVV(Qsplit0, last_vec, input_zero_vec);
    }
    depthwiseconv3x3s1_asm(vtcm_input, output, data);
    input += dptr->pre_padded_in_width_depth;//dptr->stride_h *
    output += dptr->next_out_width_depth;
    if (start_height < (end_height - 1)) {
      L2FETCH(input, L2FETCH_INPUT_REGISTER);
    }
  }