// A CUDA implementation of the computation performed by the CUDA Samples
// separable-convolution project (convolutionSeparable).
#include "include/SepConv.cuh"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cooperative_groups.h>
#include <iostream>
#include <sstream>
// Hardware / problem-size configuration: tuned for a 36-SM GPU with 48 KB of
// shared memory per block, a 3072x3072 float image and a radius-16 mask.
const int SMCount = 36;
const int SMThreadCount = 1024;
const int SMWarpCount = 32;
const int WarpCap = 32;
const int SharedMemSize = (48 * 1024);
const int RowCount = 3072;
const int ColCount = 3072;
const int MaskRadius = 16;
const int MaskSize = (2 * MaskRadius + 1);
// Floats each thread stages into shared memory per pass (48K / 4 / 1024 = 12).
const int WorkLoad = ((SMCount * SharedMemSize) / (SMCount * SMThreadCount * sizeof(float)));
// Grid passes needed to cover the image when every pass fills all SMs' shared memory.
const int DealLoopCount = 1 + (RowCount * ColCount * sizeof(float)) / (SMCount * SharedMemSize);
// Rows of the padded [y][WarpCap+1] Y-pass tile that fit in shared memory (12288 / 33 = 372).
const int YLoadEdge = (int)(SharedMemSize / sizeof(float) / (WarpCap + 1));
// Global-memory tiles along X: each warp consumes 32 floats per X step.
const int DeviceMemXCount = (int)((ColCount) / WarpCap);
// Net rows consumed per Y step: consecutive tiles overlap by 2*MaskRadius rows,
// so each step advances by YLoadEdge - 32 = 340 rows.
const int Cell = (YLoadEdge - MaskRadius * 2);
// Number of Y steps needed to cover all rows.
const int DeviceMemYCount = (int)(1 + (RowCount / Cell));
// Shared-tile fill passes: 32 warps, each writing 33 floats (incl. padding) per pass.
const int LoadCout = 1 + ((SharedMemSize / sizeof(float)) / ((WarpCap + 1) * SMWarpCount));
// Reports (but does not abort on) CUDA runtime errors, stringifying the
// offending call for context. Wrapped in do/while(0) so `checkCUDARtn(x);`
// is safe inside unbraced if/else.
// Fixed: the closing brace line previously ended with a stray '\', which
// silently spliced the FOLLOWING source line into the macro definition.
#define checkCUDARtn(Func) \
do {\
cudaError_t rtn = Func; \
if (rtn != cudaSuccess)\
{\
std::cout << "Function " << #Func << " error,reason " << cudaGetErrorString(rtn) << std::endl; \
}\
} while (0)
// Filter taps; uploaded from the host via cudaMemcpyToSymbol. Constant memory
// broadcasts efficiently because all lanes read the same tap per iteration.
__constant__ float GaussKernel[MaskSize];
// True when flat index `idx` falls in a column at least MaskRadius away from
// both row edges, i.e. where the full horizontal mask fits.
inline __device__ bool InRange(int idx)
{
    const int col = idx % ColCount;
    return (col >= MaskRadius) && (col < ColCount - MaskRadius);
}
// Row (X-direction) convolution pass.
// Launch: Grid.x = SMCount blocks of 32x32 threads; each block repeatedly
// stages a full 48 KB shared-memory tile of the source and convolves it with
// the constant-memory mask along the row direction.
// NOTE(review): in-tile neighbours are read from the same shared tile; this
// assumes each staged tile starts row-aligned so the halo of every in-range
// element is resident — holds for the 3072-wide image used here.
__global__ void kSepConvX(float* d_src, float* d_dst)
{
    __shared__ float sm[SharedMemSize / sizeof(float)];
    int tid = threadIdx.y * WarpCap + threadIdx.x;
    //#pragma unroll /* unrolling does not necessarily improve performance here */
    for (size_t it = 0; it < DealLoopCount; it++)
    {
        // Stage global memory into shared memory; every warp of the block
        // follows the same path, so there is no intra-block divergence here.
        // Block offset = floats consumed by earlier grid passes + this block's share.
        int Blkoffset = it * (SMCount * SharedMemSize / (sizeof(float)))/* full grid pass */ + WorkLoad * WarpCap * SMWarpCount * blockIdx.x/* per-block share */;
        if (Blkoffset < RowCount * ColCount)
        {
            float* fBlockBase = d_src + Blkoffset;
#pragma unroll
            for (size_t i = 0; i < WorkLoad; i++)
            {
                // Coalesced: consecutive threads read/write consecutive floats.
                sm[tid + i * SMWarpCount * WarpCap] = fBlockBase[tid + i * SMWarpCount * WarpCap];
            }
        }
        // Barrier kept OUTSIDE the if: syncing inside the branch could deadlock
        // when the matrix size is not an exact multiple of the tile size.
        __syncthreads();
        if (Blkoffset < RowCount * ColCount)
        {
#pragma unroll
            for (size_t iLd = 0; iLd < WorkLoad; iLd++)
            {
                int TargetPos = tid + iLd * SMWarpCount * WarpCap; // offset within this block's tile
                if (InRange(Blkoffset + TargetPos))
                {
                    float Tmp = 0;
#pragma unroll
                    for (size_t iMsk = 0; iMsk < MaskSize; iMsk++)
                    {
                        Tmp += GaussKernel[iMsk] * sm[TargetPos - MaskRadius + iMsk];
                    }
                    d_dst[Blkoffset + TargetPos] = Tmp;
                }
            }
        }
        // Fixed: barrier added so the next iteration's stores to sm cannot
        // race with warps still reading the current tile above.
        __syncthreads();
    }
}
// Column (Y-direction) convolution pass.
// Launch: Grid.x = SMCount blocks of 32x32 threads. The image is tiled into
// WarpCap-wide, YLoadEdge-tall tiles that overlap by 2*MaskRadius rows; tiles
// are distributed over blocks with a grid-stride-style loop on the tile index.
__global__ void kSepConvY(float* d_src, float* d_dst)
{
    // Tile laid out [y][x]; the +1 padding column avoids shared-memory bank
    // conflicts on the transposed (column-wise) reads in the compute phase.
    __shared__ float sm[YLoadEdge][WarpCap + 1];//y-x
    // Zero the tile (including the padding column) so partially-filled edge
    // tiles read zeros instead of stale data.
    for (size_t i = 0; i < WorkLoad; i++)
    {
        if (threadIdx.y + i * SMWarpCount < YLoadEdge)
        {
            sm[threadIdx.y + i * SMWarpCount][threadIdx.x] = 0;
            if (threadIdx.x == 0)
            {
                sm[threadIdx.y + i * SMWarpCount][32] = 0;
            }
        }
    }
    __syncthreads();
    // Iterate over tiles; every block in the grid strides by SMCount.
    for (size_t i = 0; i < DeviceMemXCount * DeviceMemYCount; i += SMCount)
    {
        // X loads are 32 contiguous floats per warp to keep accesses coalesced.
        int IndexY = (i + blockIdx.x) / DeviceMemXCount;//tile row (Cell rows each)
        int IndexX = (i + blockIdx.x) % DeviceMemXCount;//tile column (WarpCap floats each)
        int Deviceoffset = (IndexY * Cell) * ColCount + IndexX * WarpCap;//tile origin in global memory
        if (Deviceoffset < (RowCount * ColCount))//only guarantees the tile ORIGIN is in range
        {
            float* pCurLoadBase = d_src + Deviceoffset;
            for (size_t iload = 0; iload < LoadCout; iload++)
            {
                int SM_Yoffset = threadIdx.y + iload * SMWarpCount;
                // The last pass may engage only some warps, and the bottom of
                // the image may not fill the whole tile.
                if ((SM_Yoffset < YLoadEdge) && (Deviceoffset + SM_Yoffset * ColCount < RowCount * ColCount))
                {
                    sm[SM_Yoffset][threadIdx.x] = pCurLoadBase[threadIdx.x + SM_Yoffset * ColCount];//same layout as global memory
                }
            }
        }
        // The barrier must stay outside the conditional, so the guard is split
        // into two identical checks around it.
        __syncthreads();
        if (Deviceoffset < RowCount * ColCount)
        {
            for (size_t iSourceCenter = MaskRadius; iSourceCenter < (YLoadEdge - MaskRadius); iSourceCenter += WarpCap)
            {
                float Tmp = 0.0f;
                if (((iSourceCenter + threadIdx.x) < (YLoadEdge - MaskRadius)) && ((Deviceoffset + (iSourceCenter + MaskRadius + threadIdx.x) * ColCount) < RowCount * ColCount))
                {
                    for (size_t iMask = 0; iMask < MaskSize; iMask++)
                    {
                        Tmp += GaussKernel[iMask] * sm[iSourceCenter - MaskRadius + iMask + threadIdx.x][threadIdx.y];
                    }
                    d_dst[Deviceoffset + (iSourceCenter + threadIdx.x) * ColCount + threadIdx.y] = Tmp;
                }
            }
        }
        // Fixed: barrier added so the next iteration's loads into sm cannot
        // race with warps still reading the current tile in the compute phase.
        __syncthreads();
    }
}
// Fills the RowCount x ColCount buffer so every element of row r holds the
// value r (as a float) — a pattern the box-filter verifiers can check exactly.
void Init(float* pData)
{
    for (size_t r = 0; r < RowCount; r++)
    {
        const float rowVal = (float)r;
        float* pRow = pData + r * ColCount;
        for (size_t c = 0; c < ColCount; c++)
        {
            pRow[c] = rowVal;
        }
    }
}
// Verifies the row-convolution result: with an all-equal (box) mask and rows
// of constant value i, every interior column of row i must still equal i
// (within 0.01). Columns within MaskRadius of either edge are never written
// by the kernels and are skipped.
void VerifyX(float* pData)
{
    size_t errors = 0;
    for (size_t i = 0; i < RowCount; i++)
    {
        for (int j = MaskRadius; j < ColCount - MaskRadius; j++)
        {
            float diff = pData[i * ColCount + j] - (float)i;
            if ((0.01f < diff) || (-0.01f > diff))
            {
                ++errors;
                std::cout << "not correct row-col-val " << i << "-" << j << "-" << pData[i * ColCount + j] << std::endl;
            }
        }
    }
    // Fixed: "equal" was previously printed even when mismatches were found.
    if (errors == 0)
    {
        std::cout << "equal \n";
    }
    else
    {
        std::cout << errors << " mismatches\n";
    }
}
// Verifies the full separable (X then Y) result: interior rows are constant
// per row, so each interior element of row i must still equal i (within
// 0.01). The MaskRadius border in BOTH dimensions is unwritten and skipped.
void VerifyY(float* pData)
{
    size_t errors = 0;
    for (size_t i = MaskRadius; i < RowCount - MaskRadius; i++)
    {
        for (int j = MaskRadius; j < ColCount - MaskRadius; j++)
        {
            float diff = pData[i * ColCount + j] - (float)i;
            if ((0.01f < diff) || (-0.01f > diff))
            {
                ++errors;
                std::cout << "not correct row-col-val " << i << "-" << j << "-" << pData[i * ColCount + j] << std::endl;
            }
        }
    }
    // Fixed: "equal" was previously printed even when mismatches were found.
    if (errors == 0)
    {
        std::cout << "equal \n";
    }
    else
    {
        std::cout << errors << " mismatches\n";
    }
}
// Configuration for the kSepConvImtt variant: 32x4 thread blocks, each thread
// producing IMTTWkLoad = 8 outputs along X (256 outputs per block row).
const int IMTTWarpX = 32;
const int IMTTWarpY = 4;
const int IMTTWkLoad = 8;
const int IMTTBlockX = ColCount / (IMTTWkLoad * IMTTWarpX);//12
const int IMTTBlockY = RowCount / IMTTWarpY;//768
// Row convolution, one-tile-per-block variant ("Imtt").
// Launch: Grid = (IMTTBlockX, IMTTBlockY), Block = (32, 4). Each thread
// computes IMTTWkLoad outputs of one image row; the tile body plus a
// MaskRadius halo on each side is staged in shared memory first.
__global__ void kSepConvImtt(float* d_src, float* d_dst)
{
    __shared__ float sm[IMTTWarpY][IMTTWkLoad * IMTTWarpX + 2 * MaskRadius];
    int PosX = blockIdx.x * IMTTWarpX * IMTTWkLoad;
    int PosY = blockIdx.y * IMTTWarpY + threadIdx.y;
    float* localsrc = d_src + PosY * ColCount + PosX + threadIdx.x;
    float* localdst = d_dst + PosY * ColCount + PosX + threadIdx.x;
    // Clear the whole shared row (body + halos) so out-of-image halo slots
    // contribute zeros.
#pragma unroll
    for (size_t i = 0; i < IMTTWkLoad + 1; i++)
    {
        sm[threadIdx.y][threadIdx.x + i * IMTTWarpX] = 0;
    }
    // Stage the tile body, offset by MaskRadius to leave room for the left halo.
#pragma unroll
    for (size_t i = 0; i < IMTTWkLoad; i++)
    {
        sm[threadIdx.y][MaskRadius + threadIdx.x + i * IMTTWarpX] = localsrc[i * IMTTWarpX];
    }
    // Halo staging: lanes 0-15 fetch the left halo, lanes 16-31 the right one
    // (the >>4 term selects the far end of the tile for the upper half-warp).
    int DevHaloOff = (threadIdx.x >> 4) * (IMTTWkLoad * 2 + 1) * MaskRadius + (threadIdx.x & 0xf) - MaskRadius;
    int SMHaloOff = (threadIdx.x >> 4) * (IMTTWkLoad * 2 + 1) * MaskRadius + (threadIdx.x & 0xf);
    sm[threadIdx.y][SMHaloOff] = ((DevHaloOff + PosX >= 0) && (DevHaloOff + PosX < ColCount)) ? localsrc[DevHaloOff] : 0;
    __syncthreads();
#pragma unroll
    for (size_t iwk = 1; iwk <= IMTTWkLoad; iwk++)
    {
        float tmp = 0;
#pragma unroll
        for (int iMsk = -MaskRadius; iMsk <= MaskRadius; iMsk++)
        {
            tmp += GaussKernel[iMsk + MaskRadius] * sm[threadIdx.y][iwk * IMTTWarpX + threadIdx.x - MaskRadius + iMsk];
        }
        localdst[(iwk - 1) * IMTTWarpX] = tmp;
    }
}
// Host driver for the kSepConvImtt row-convolution kernel: allocates and
// initialises the buffers, uploads a box filter to constant memory, launches
// the kernel and verifies the result on the host.
void CallImmt()
{
    dim3 Grid;
    dim3 Blk;
    Blk.x = IMTTWarpX;
    Blk.y = IMTTWarpY;
    Grid.x = IMTTBlockX;
    Grid.y = IMTTBlockY;
    const size_t bytes = RowCount * ColCount * sizeof(float);
    float* h_src = (float*)malloc(bytes);
    float* h_dst = (float*)malloc(bytes);
    // Fixed: malloc results were previously used unchecked.
    if (h_src == NULL || h_dst == NULL)
    {
        std::cout << "host allocation failed" << std::endl;
        free(h_src);
        free(h_dst);
        return;
    }
    Init(h_src);
    // Box filter: all taps equal, so convolving a constant row leaves it
    // unchanged — this is what VerifyX relies on.
    float h_mask[MaskSize];
    for (size_t i = 0; i < MaskSize; i++)
    {
        h_mask[i] = (1.0f / MaskSize);
    }
    float* d_src, * d_dst;
    checkCUDARtn(cudaMemcpyToSymbol(GaussKernel, h_mask, sizeof(float) * MaskSize));
    checkCUDARtn(cudaMalloc(&d_src, bytes));
    checkCUDARtn(cudaMalloc(&d_dst, bytes));
    checkCUDARtn(cudaMemcpy(d_src, h_src, bytes, cudaMemcpyHostToDevice));
    kSepConvImtt << <Grid, Blk >> > (d_src, d_dst);
    // Fixed: launches report configuration errors only via cudaGetLastError.
    checkCUDARtn(cudaGetLastError());
    checkCUDARtn(cudaDeviceSynchronize());
    checkCUDARtn(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost));
    VerifyX(h_dst);
    free(h_src);
    free(h_dst);
    checkCUDARtn(cudaFree(d_src));
    checkCUDARtn(cudaFree(d_dst));
    return;
}
// Entry point: runs the Imtt row-convolution demo, then the two-pass
// (row then column) separable convolution, and verifies the final result.
// Launch shape for both passes: SMCount blocks of 32x32 threads.
void CallFuncSepConv()
{
    CallImmt();
    dim3 Grid;
    dim3 Blk;
    Blk.x = 32;
    Blk.y = (SMThreadCount / 32);
    Grid.x = SMCount;
    const size_t bytes = RowCount * ColCount * sizeof(float);
    float* h_src = (float*)malloc(bytes);
    float* h_dst = (float*)malloc(bytes);
    // Fixed: malloc results were previously used unchecked.
    if (h_src == NULL || h_dst == NULL)
    {
        std::cout << "host allocation failed" << std::endl;
        free(h_src);
        free(h_dst);
        return;
    }
    Init(h_src);
    // Box filter so a constant-per-row image is invariant under both passes.
    float h_mask[MaskSize];
    for (size_t i = 0; i < MaskSize; i++)
    {
        h_mask[i] = (1.0f / MaskSize);
    }
    float* d_src, * d_tmp, * d_dst;
    checkCUDARtn(cudaMemcpyToSymbol(GaussKernel, h_mask, sizeof(float) * MaskSize));
    checkCUDARtn(cudaMalloc(&d_src, bytes));
    checkCUDARtn(cudaMalloc(&d_tmp, bytes));
    checkCUDARtn(cudaMalloc(&d_dst, bytes));
    checkCUDARtn(cudaMemcpy(d_src, h_src, bytes, cudaMemcpyHostToDevice));
    kSepConvX << <Grid, Blk >> > (d_src, d_tmp);
    // Fixed: check for launch-configuration errors before chaining the Y pass.
    checkCUDARtn(cudaGetLastError());
    kSepConvY << <Grid, Blk >> > (d_tmp, d_dst);
    checkCUDARtn(cudaGetLastError());
    checkCUDARtn(cudaDeviceSynchronize());
    checkCUDARtn(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost));
    VerifyY(h_dst);
    free(h_src);
    free(h_dst);
    checkCUDARtn(cudaFree(d_src));
    checkCUDARtn(cudaFree(d_dst));
    checkCUDARtn(cudaFree(d_tmp));
    return;
}