接上文,现在是激励函数vl_nnrelu:
// Activation function: leaky ReLU, applied in place.
// Every negative element of out->data is scaled by `leak`
// (leak == 0 gives a plain ReLU); non-negative values pass through.
void vl_nnrelu(卷积层 * out, float leak)//激励函数
{
    const int planeSize = out->width * out->height; // elements per channel
    float * channel = out->data;
    for (int c = 0; c < out->depth; ++c)
    {
        for (int idx = 0; idx < planeSize; ++idx)
        {
            if (channel[idx] < 0)
                channel[idx] *= leak;
        }
        channel += planeSize; // advance to the next channel plane
    }
}
然后是vl_nnconv,去掉一些在当前程序中用不到的东西,就成这样的:
// Forward convolution entry point: a thin wrapper that forwards to the
// CPU BLAS implementation. The output is overwritten (outputMult = 0)
// and the input is used unscaled (dataMult = 1).
void vl_nnconv(卷积层 const *indata,卷积层 *out,层数据 *filters_biases,
    int strideX = 1 ,int strideY = 1 ,
    int padLeft = 0 ,int padRight = 0 ,int padTop = 0 ,int padBottom = 0 )
{
    vl_impl_nnconv_forward_blas_CPU_float(
        out, 0,     // beta = 0: discard any previous output contents
        indata, 1,  // alpha = 1: use the input as-is
        filters_biases,
        strideY, strideX,
        padTop, padBottom,
        padLeft, padRight);
}
在vl_impl_nnconv_forward_blas_CPU_float中图像排成乘法矩阵,再应用卷积乘法 和 偏置乘法:
// Forward convolution on the CPU:
//   1. expand the input image into a multiplication matrix (im2row),
//   2. multiply it by the filter bank (gemm),
//   3. add the biases via a rank-1 gemm with an all-ones vector.
// Computes output = outputMult*output + dataMult*conv(data, filters) + biases.
void vl_impl_nnconv_forward_blas_CPU_float(
    卷积层 * output, double outputMult,
    const 卷积层 * data, double dataMult,
    const 层数据 * filters_biases,
    int strideY, int strideX,
    int padTop, int padBottom,
    int padLeft, int padRight)
{
    // Derive the filter-bank geometry from the stored weight count.
    // Only the two shapes used by this network are recognised:
    //   576   = 3 x 3 x (1 or 64): first layer, or the residual-merge layer
    //   36864 = 3 x 3 x 64 x 64:   hidden layers
    int filters_getDepth = 0, filters_getSize = 0;
    const int filters_getHeight = 3;
    const int filters_getWidth = 3;
    if (filters_biases->权重长度 == 576) // 3x3x64
    {
        if (data->depth == 1) {          // first layer: 1 input channel, 64 filters
            filters_getDepth = 1;
            filters_getSize = 64;
        } else {                         // residual-merge layer: 64 channels, 1 filter
            filters_getDepth = 64;
            filters_getSize = 1;
        }
    } else if (filters_biases->权重长度 == 36864) // 3 x 3 x 64 x 64
    {
        filters_getDepth = 64;
        filters_getSize = 64;
    } else {
        // BUGFIX: the dimensions used to stay uninitialized (undefined
        // behaviour downstream) for an unexpected weight count; fail loudly.
        printf("vl_impl_nnconv_forward_blas_CPU_float: unsupported weight count %d\n",
               (int)filters_biases->权重长度);
        return;
    }
    int numGroups = data->depth / filters_getDepth;            // number of filter groups
    int numFiltersPerGroup = filters_getSize / numGroups;      // filters per group
    int numOutputPixels = output->height * output->width;      // pixels per output channel
    int filtersVolume = filters_getWidth * filters_getHeight * filters_getDepth; // one filter's element count
    int tempVolume = numOutputPixels * filtersVolume * numGroups; // im2row matrix size
    // BUGFIX: new[] takes an element count, not a byte count; the previous
    // "new float[n * sizeof(float)]" over-allocated by a factor of 4.
    // Note: plain new throws on failure (it never returns NULL), so the old
    // NULL check + goto (which jumped over initializations) was removed.
    float * tempMemory = new float[tempVolume];
    float * allOnesMemory = new float[numOutputPixels]; // ones vector for the bias gemm
    for (int i = 0; i < numOutputPixels; ++i)
        allOnesMemory[i] = 1.0f;
    // Expand the input feature map into the multiplication matrix.
    vl_impl_im2row(
        tempMemory,
        (float*)data->data,
        data->height, data->width, data->depth,
        filters_getHeight, filters_getWidth,
        strideY, strideX,
        padTop, padBottom, padLeft, padRight);
    float alpha = (float)dataMult;
    float beta = (float)outputMult;
    float * filters = filters_biases->权重_数据;
    // Multi-filter multiplication: one gemm applies every filter at once.
    gemm(
        'n', 'n',
        numOutputPixels, numFiltersPerGroup, filtersVolume,
        alpha,
        tempMemory, numOutputPixels,
        (float*)filters, filtersVolume,
        beta,
        (float*)output->data, numOutputPixels);
    // Add the biases: output += ones(numOutputPixels x 1) * biases(1 x n).
    float * biases = filters_biases->偏移_数据;
    if (biases) {
        gemm(
            'n', 'n',
            numOutputPixels, filters_biases->偏移长度, 1,
            1.0f,
            allOnesMemory, numOutputPixels,
            (float*)biases, 1,
            1.0f, // beta = 1: accumulate onto the convolution result
            (float*)output->data, numOutputPixels);
    }
    delete [] tempMemory;
    delete [] allOnesMemory;
}
vl_impl_im2row再转到im2row_cpu:
// Dispatch layer for the im2row expansion: simply forwards every argument
// to the CPU implementation (kept so call sites match MatConvNet's naming).
void vl_impl_im2row(
    float* stacked,
    float const* data,
    size_t height, size_t width, size_t depth,
    size_t windowHeight, size_t windowWidth,
    size_t strideY, size_t strideX,
    size_t padTop, size_t padBottom, size_t padLeft, size_t padRight)
{
    im2row_cpu(stacked, data,
               height, width, depth,
               windowHeight, windowWidth,
               strideY, strideX,
               padTop, padBottom, padLeft, padRight);
}
im2row_cpu原本是针对Matlab中列优先的图像设计的,那么这里要不要转置呢?答案是:原版必须转置(列优先的缘故);下面这个版本把参数改成了行优先(height在前)的顺序,因此不再需要转置:
static inline void
im2row_cpu(float* stacked,
float const* data,
// size_t width,
// size_t height,
// size_t depth,
// size_t windowWidth,
// size_t windowHeight,
// size_t strideX,
// size_t strideY,
// size_t padLeft,
// size_t padRight,
// size_t padTop,
// size_t padBottom)//转置
size_t height,
size_t width,
size_t depth,
size_t windowHeight,
size_t windowWidth,
size_t strideY,
size_t strideX,
size_t padTop,
size_t padBottom,
size_t padLeft,
size_t padRight)//无转置
{
//save_矩阵 ("data.txt",data,width,height,depth); //保存
int numPatchesX = (width + (padLeft + padRight) - windowWidth)/strideX + 1 ;
int numPatchesY = (height + (padTop + padBottom) - windowHeight)/strideY + 1 ;
int numRows = windowWidth * windowHeight * depth ;
//printf("numPatchesX:%d\n",numPatchesX);
//printf("numPatchesY:%d\n",numPatchesY);
//printf("numRows:%d\n",numRows);
/*
一次填充一行乘法图像数据(也叫层叠)。因为图像块是沿着列存储的,所以扫描一行数据会访问所有图像块一次。
每行对应于每个图像块内的特定偏移。
这样,当我们填充一行时,我们倾向于访问输入图像中的空间的相邻元素,特别是对于小步幅。
*/
for (int row = 0; row < numRows ; ++row) {
/*
获取与层叠图像的此行对应的色块偏移。
*/
int u = row ;
int v = u / windowWidth ;
int z = v / windowHeight ;
u %= windowWidth ;
v %= windowHeight ;
/*
填充此行相当于访问输出图像中出现在输出色块中给定偏移处的所有像素。
考虑到输出色块和输入填充的子采样,这些像素由下式给出
x_data(x) = x * strideX + u - padLeft, 0 <= x < numPatchesX
y_data(y) = y * strideY + v - padTop, 0 <= y < numPatchesY
z_data(z) = z.
这里(x,y)是输出色块的空间索引。根据填充,
这些值中的一些将读取输入图像中的像素,默认为0.特别是,
如果x0 <= x <x1,则x落在图像中的x_data(x)上,其中:
x_data(x) >= 0 <=> x >= (padLeft - u) / stride
<=> x >= ceil((padLeft - u) / stride) = x0
x_data(x) <= width-1 <=> x <= (width-1 + padLeft - u) / stride
<=> x <= floor((width-1 + padLeft - u) / stride)
<=> x < floor((width-1 + padLeft - u) / stride) + 1 = x1
和y相同。请注意,虽然通常x0 <= x1,但有一些特殊情况,其中x1 <x0。这在下面的循环中说明。
*/
int x0 = static_min(numPatchesX, ceil_divide(padLeft - u, strideX)) ;
int y0 = static_min(numPatchesY, ceil_divide(padTop - v, strideY)) ;
int x1 = static_min(numPatchesX, floor_divide(width-1 + padLeft - u, strideX) + 1) ;
int y1 = static_min(numPatchesY, floor_divide(height-1 + padTop - v, strideY) + 1) ;
int x ;
int y ;
for (y = 0 ; y < y0 ; ++y) {
for (x = 0 ; x < numPatchesX ; ++x) {
*stacked++ = 0 ;
}
}
for ( ; y < y1 ; ++y) {
for (x = 0 ; x < x0 ; ++x) {
*stacked++ = 0 ;
}
int y_data = y * strideY + v - padTop ;
int x_data = x * strideX + u - padLeft ;
float const * b = data + (z * height + y_data) * width + x_data ;
for ( ; x < x1 ; ++x) {
*stacked++ = *b ;
b += strideX ;
}
for ( ; x < numPatchesX ; ++x) {
*stacked++ = 0 ;
}
}
for ( ; y < numPatchesY ; ++y) {
for (x = 0 ; x < numPatchesX ; ++x) {
*stacked++ = 0 ;
}
}
}
}
矩阵乘法 和转置矩阵乘法都在这里了gemm(只实现了blas_sgemm的部分功能,在这里已经可以对付了):
inline void
gemm(
char op1, char op2,
int m, int n, int k,//矩阵(行,列): A( m , k ), B( k, n ), C( m , n )
float alpha,
float const * a, int lda,
float const * b, int ldb,
float beta,
float * c, int ldc)
{
//printf("sgemm...\n");
//C := alpha * op ( A ) * op ( B ) + beta * C,
//beta=0 相当于清0
//beta=1 相当于加上原来的值
int i;
if(beta==0.0) //这里完成 C = 0(beta) * C
{
float * c1=c;
for(i=0;i<m*n;i++)
{
*c1++=0;
}
}
if((op2 == 'N') || (op2 == 'n')) //这里完成 C = 1(alpha)* A * B
{
int q1,q2;
int p1,p2,p3;
int o1,o2;
o1=-m;p1=-m;
for(i=0;i<m;i++)
{
p1++;p3=p1;
q1=-k;
o1++;o2=o1;//-m+i
for(int j=0;j<n;j++)
{
q1+=k;q2=q1;// j*k+q
p2=p1;//q*m+i
o1+=m;//j*m+i
for(int q=0;q<k;q++)
{
//c[j*m+i]+=a[q*m+i]*b[j*k+q];
p1+=m;
c[o1]+=a[p1]*b[q1++];
//c[o1]+=s_a[p1]*b[q1++];
}
q1=q2;
p1=p2;
}
o1=o2;
p1=p3;
}
}
else
{//printf("start:t!\n"); 这里完成 C = 1(alpha)* A * B'
//就是把 b 转置成 tmpb 再矩阵相乘
float * tmpb=new float[k*n*sizeof(float)];
if(tmpb==NULL)
cout<<"内存分配出错了,gemm"<<endl;
float * tb2=tmpb;
for(i=0;i<n;i++)
{
for(int j=0;j<k;j++)
{
*tb2++ =b[j*n+i];
}
}
int q1,q2;
int p1,p2,p3;
int o1,o2;
o1=-m;p1=-m;
for(i=0;i<m;i++)
{
p1++;p3=p1;
q1=-k;
o1++;o2=o1;//-m+i
for(int j=0;j<n;j++)
{
q1+=k;q2=q1;// j*k+q
p2=p1;//q*m+i
o1+=m;//j*m+i
for(int q=0;q<k;q++)
{
//c[j*m+i]+=a[q*m+i]*b[j*k+q];
p1+=m;
c[o1]+=a[p1]*tmpb[q1++];
}
q1=q2;
p1=p2;
}
o1=o2;
p1=p3;
}
delete []tmpb; tmpb=NULL;
}
///*/
//printf("sgemm end\n");
//void cblas_sgemm(char transa, char transb, int m, int n, int k, float alpha,
// float a[], int lda, float b[], int ldb, float beta, float c[], int ldc)
// cblas_sgemm//这个是一些博客上的C++版的sgemm,好象不能用
// (op1, op2,
// m, n, k,
// alpha,
// (float*)a, lda,
// (float*)b, ldb,
// beta,
// c, ldc) ;
//sgemm //这个是原始的,找不到文件
// (&op1, &op2,
// &m, &n, &k,
// &alpha,
// (float*)a, &lda,
// (float*)b, &ldb,
// &beta,
// c, &ldc) ;
}
这是正向卷积乘法。其中有个偏移乘法放到后面一起说明。