Learning how to use shared memory together with texture memory
dct8x8.cu
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
    //prepare channel format descriptor for passing texture into kernels
    cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

    //allocate device memory
    cudaArray *Src; //declared as a cudaArray because it will be bound to a texture
    float *Dst;
    size_t DstStride;
    checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
    checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height)); //pitched allocation for alignment
    DstStride /= sizeof(float);

    //convert source image to float representation
    int ImgSrcFStride;
    float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride); //aligned (padded) allocation
    CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); //convert bytes to floats
    AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);

    //copy from host memory to device
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        ImgSrcF, ImgSrcFStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyHostToDevice)); //copy the float data into the cudaArray

    //setup execution parameters
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE); //the threads of one thread block cover one 8x8 image block
    dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

    //create and start CUDA timer
    StopWatchInterface *timerCUDA = 0;
    sdkCreateTimer(&timerCUDA);
    sdkResetTimer(&timerCUDA);

    //execute DCT kernel and benchmark
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    //!Bind the texture to get fast access to the pixels inside each 8x8 block. Pixels in different rows of a
    //!block are not contiguous in memory, so this scattered access pattern cannot be coalesced through global
    //!memory and would cost many more memory instructions.
    for (int i = 0; i < BENCHMARK_SIZE; i++) //run the kernel BENCHMARK_SIZE (10) times and measure the average time
    {
        sdkStartTimer(&timerCUDA);
        CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0); //the kernel has no input image argument: the source is read through the texture, which is declared as a global variable
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timerCUDA);
    }
    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    // finalize CUDA timer
    float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA);
    sdkDeleteTimer(&timerCUDA);

    // execute Quantization kernel
    CUDAkernelQuantizationFloat<<< grid, threads >>>(Dst, (int) DstStride);
    getLastCudaError("Kernel execution failed");

    //copy quantized coefficients from device memory to the device array
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        Dst, DstStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyDeviceToDevice));

    // execute IDCT kernel
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    //copy the processed image back to the host
    checkCudaErrors(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float),
                                 Dst, DstStride * sizeof(float),
                                 Size.width * sizeof(float), Size.height,
                                 cudaMemcpyDeviceToHost));

    //convert image back to byte representation
    AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
    CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

    //clean up memory
    checkCudaErrors(cudaFreeArray(Src));
    checkCudaErrors(cudaFree(Dst));
    FreePlane(ImgSrcF);

    //return time taken by the operation
    return TimerCUDASpan;
}
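The wrapper above uses the legacy texture-reference API: TexSrc is a global texture reference, and cudaBindTextureToArray / cudaUnbindTexture attach it to the cudaArray. On recent CUDA versions this API is deprecated in favor of texture objects, which are created at run time and passed to the kernel as an ordinary argument. Below is a minimal sketch of an equivalent setup; the helper name MakeSrcTextureObject is my own and not part of the sample, and checkCudaErrors comes from the sample's helper_cuda.h.

#include <cstring>
#include <cuda_runtime.h>

// Create a texture object over the cudaArray that holds the source image.
// Point sampling and unnormalized coordinates mimic the sample's texture reference.
static cudaTextureObject_t MakeSrcTextureObject(cudaArray *Src)
{
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = Src;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeClamp;
    texDesc.addressMode[1]   = cudaAddressModeClamp;
    texDesc.filterMode       = cudaFilterModePoint;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;

    cudaTextureObject_t texObj = 0;
    checkCudaErrors(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
    return texObj;
}

// The kernel would then take a cudaTextureObject_t parameter and fetch with
//     float pixel = tex2D<float>(texObj, tex_x, tex_y);
// and the host releases the object with cudaDestroyTextureObject(texObj) after the launch.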
NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel1.cuh
__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Block index
    const int bx = blockIdx.x + OffsetXBlocks; //OffsetXBlocks is zero here
    const int by = blockIdx.y + OffsetYBlocks;

    // Thread index (current coefficient)
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Texture coordinates
    const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f;
    const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f;
    //!When computing the x/y coordinates, a left shift by 3 replaces the multiplication by 8.
    //!Textures can be addressed with floating-point coordinates.

    //copy current image pixel to the first block
    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = tex2D(TexSrc, tex_x, tex_y);
    //!CurBlockLocal1 is shared memory of size 8x8, exactly enough to hold the pixels of one block.
    //!2D textures are optimized for this kind of scattered access.

    //synchronize threads to make sure the block is copied
    __syncthreads(); //wait until every pixel of the block has been fetched before continuing

    //calculate the multiplication of DCTv8matrixT * A and place it in the second block
    float curelem = 0;
    int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty;
    int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx;
    #pragma unroll //loop unrolling, a CUDA compiler optimization

    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; //multiply-accumulate with the DCT basis matrix
        DCTv8matrixIndex += BLOCK_SIZE;
        CurBlockLocal1Index += BLOCK_SIZE; //index of the next pixel in the block (one row down)
    }

    CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; //intermediate result

    //synchronize threads to make sure the first 2 matrices are multiplied and the result is stored in the second block
    __syncthreads(); //wait until every element of the block has been computed before continuing

    //calculate the multiplication of (DCTv8matrixT * A) * DCTv8matrix and place it in the first block
    curelem = 0;
    int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0;
    DCTv8matrixIndex = 0 * BLOCK_SIZE + tx;
    #pragma unroll

    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex];
        CurBlockLocal2Index += 1;
        DCTv8matrixIndex += BLOCK_SIZE;
    }

    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = curelem;
    //!CurBlockLocal1 is reused: the buffer that held the input of the first multiplication now holds the output of the second.

    //synchronize threads to make sure the matrices are multiplied and the result is stored back in the first block
    __syncthreads();

    //copy current coefficient to its place in the result array
    Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx)] = CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx];
}
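DCTv8matrix is the 8x8 DCT basis matrix stored as a constant table in the sample. Given how the kernel indexes it (Dᵀ·A in the first pass, then ·D in the second), the entry at row n and column k is the standard orthonormal DCT-II coefficient a(k)·cos((2n+1)kπ/16). The following host-side sketch builds such a matrix and computes the same Dᵀ·A·D product, which can serve as a CPU reference for checking the kernel's output; the function names BuildDCTBasis and DCT8x8Reference are illustrative, not part of the sample.

#include <cmath>

// Build an 8x8 DCT basis matrix D with D[n][k] = a(k) * cos((2n+1)*k*pi/16),
// where a(0) = sqrt(1/8) and a(k) = sqrt(2/8) for k > 0.
static void BuildDCTBasis(float D[8][8])
{
    const float pi = 3.14159265358979f;
    for (int n = 0; n < 8; n++)
        for (int k = 0; k < 8; k++)
        {
            float a = (k == 0) ? sqrtf(1.0f / 8.0f) : sqrtf(2.0f / 8.0f);
            D[n][k] = a * cosf((2.0f * n + 1.0f) * k * pi / 16.0f);
        }
}

// CPU reference: Y = D^T * A * D, the same two-pass product the kernel performs.
static void DCT8x8Reference(const float A[8][8], float Y[8][8])
{
    float D[8][8], T[8][8];
    BuildDCTBasis(D);

    // first pass: T = D^T * A
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
        {
            float s = 0.0f;
            for (int i = 0; i < 8; i++)
                s += D[i][r] * A[i][c];
            T[r][c] = s;
        }

    // second pass: Y = T * D
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
        {
            float s = 0.0f;
            for (int i = 0; i < 8; i++)
                s += T[r][i] * D[i][c];
            Y[r][c] = s;
        }
}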
The sample also provides an in-place implementation in NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel2.cuh. "In-place" means the DCT is computed directly in the same variable (the same shared-memory buffer), without the second intermediate buffer used by kernel1, which makes it even faster. A minimal sketch of that idea follows.
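The sketch below is my own simplified illustration, not the optimized vector-per-thread kernel from dct8x8_kernel2.cuh: each tile lives in a single shared buffer, one thread transforms one 8-element row or column into registers and writes it back over itself, so no second shared buffer is needed. The names dct8_inplace and DCT8x8InPlaceSketch are hypothetical.

// 8-point orthonormal DCT-II of v[0], v[stride], ..., v[7*stride], written back in place.
__device__ void dct8_inplace(float *v, int stride)
{
    const float pi = 3.14159265358979f;
    float tmp[8];                                    // registers hold the result during the transform
    for (int k = 0; k < 8; k++)
    {
        float a = (k == 0) ? sqrtf(1.0f / 8.0f) : sqrtf(2.0f / 8.0f);
        float s = 0.0f;
        for (int n = 0; n < 8; n++)
            s += v[n * stride] * __cosf((2.0f * n + 1.0f) * k * pi / 16.0f);
        tmp[k] = a * s;
    }
    for (int k = 0; k < 8; k++)
        v[k * stride] = tmp[k];                      // overwrite the same vector: in place
}

// Hypothetical kernel: one 8x8 thread block per image tile, a single shared buffer.
__global__ void DCT8x8InPlaceSketch(float *Img, int stride)
{
    __shared__ float tile[8][8];
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int x = blockIdx.x * 8 + tx;
    const int y = blockIdx.y * 8 + ty;

    tile[ty][tx] = Img[y * stride + x];              // load the tile (the sample reads through a texture instead)
    __syncthreads();

    if (ty == 0) dct8_inplace(&tile[tx][0], 1);      // 8 threads: one per row
    __syncthreads();

    if (ty == 0) dct8_inplace(&tile[0][tx], 8);      // 8 threads: one per column
    __syncthreads();

    Img[y * stride + x] = tile[ty][tx];              // write the transformed tile back
}

The real kernel2 replaces the inner cosine loops with a factorized 8-point butterfly, pads the shared tile to avoid bank conflicts, and processes several tiles per thread block; the sketch only shows why a single shared buffer is enough.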