Learning how to use shared memory together with texture memory
dct8x8.cu
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
    //prepare channel format descriptor for passing texture into kernels
    cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

    //allocate device memory
    cudaArray *Src; //declared as a cudaArray because it will be bound to a texture
    float *Dst;
    size_t DstStride;
    checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
    checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height)); //pitched allocation for alignment
    DstStride /= sizeof(float);

    //convert source image to float representation
    int ImgSrcFStride;
    float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride); //aligned (padded) allocation
    CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); //convert bytes to floats
    AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);

    //copy from host memory to device
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        ImgSrcF, ImgSrcFStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyHostToDevice)); //copy the float data into the cudaArray

    //setup execution parameters
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE); //the threads of one thread block cover one 8x8 image block
    dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

    //create and start CUDA timer
    StopWatchInterface *timerCUDA = 0;
    sdkCreateTimer(&timerCUDA);
    sdkResetTimer(&timerCUDA);

    //execute DCT kernel and benchmark
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    //!Bind the texture to get fast access to the pixels inside each 8x8 block. Pixels in different rows of a
    //!block are not contiguous in memory, so this scattered access pattern cannot be coalesced through global
    //!memory and would cost many more memory instructions.
    for (int i = 0; i < BENCHMARK_SIZE; i++) //run the kernel BENCHMARK_SIZE (10) times and measure the average time
    {
        sdkStartTimer(&timerCUDA);
        CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0); //the kernel has no input image argument: the source is read through the texture, which is declared as a global variable
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timerCUDA);
    }
    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    // finalize CUDA timer
    float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA);
    sdkDeleteTimer(&timerCUDA);

    // execute Quantization kernel
    CUDAkernelQuantizationFloat<<< grid, threads >>>(Dst, (int) DstStride);
    getLastCudaError("Kernel execution failed");

    //copy quantized coefficients from device memory to the device array
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        Dst, DstStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyDeviceToDevice));

    // execute IDCT kernel
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    //copy the processed image back to the host
    checkCudaErrors(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float),
                                 Dst, DstStride * sizeof(float),
                                 Size.width * sizeof(float), Size.height,
                                 cudaMemcpyDeviceToHost));

    //convert image back to byte representation
    AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
    CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

    //clean up memory
    checkCudaErrors(cudaFreeArray(Src));
    checkCudaErrors(cudaFree(Dst));
    FreePlane(ImgSrcF);

    //return time taken by the operation
    return TimerCUDASpan;
}
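The wrapper above uses the legacy texture-reference API: TexSrc is a global texture reference, and cudaBindTextureToArray / cudaUnbindTexture attach it to the cudaArray. On recent CUDA versions this API is deprecated in favor of texture objects, which are created at run time and passed to the kernel as an ordinary argument. Below is a minimal sketch of an equivalent setup; the helper name MakeSrcTextureObject is my own and not part of the sample, and checkCudaErrors comes from the sample's helper_cuda.h.

#include <cstring>
#include <cuda_runtime.h>

// Create a texture object over the cudaArray that holds the source image.
// Point sampling and unnormalized coordinates mimic the sample's texture reference.
static cudaTextureObject_t MakeSrcTextureObject(cudaArray *Src)
{
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = Src;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeClamp;
    texDesc.addressMode[1]   = cudaAddressModeClamp;
    texDesc.filterMode       = cudaFilterModePoint;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;

    cudaTextureObject_t texObj = 0;
    checkCudaErrors(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
    return texObj;
}

// The kernel would then take a cudaTextureObject_t parameter and fetch with
//     float pixel = tex2D<float>(texObj, tex_x, tex_y);
// and the host releases the object with cudaDestroyTextureObject(texObj) after the launch.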
NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel1.cuh
__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Block index
    const int bx = blockIdx.x + OffsetXBlocks; //OffsetXBlocks is zero here
    const int by = blockIdx.y + OffsetYBlocks;

    // Thread index (current coefficient)
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Texture coordinates
    const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f;
    const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f;
    //!When computing the x/y coordinates, a left shift by 3 replaces the multiplication by 8.
    //!Textures can be addressed with floating-point coordinates.

    //copy current image pixel to the first block
    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = tex2D(TexSrc, tex_x, tex_y);
    //!CurBlockLocal1 is shared memory of size 8x8, exactly enough to hold the pixels of one block.
    //!2D textures are optimized for this kind of scattered access.

    //synchronize threads to make sure the block is copied
    __syncthreads(); //wait until every pixel of the block has been fetched before continuing

    //calculate the multiplication of DCTv8matrixT * A and place it in the second block
    float curelem = 0;
    int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty;
    int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx;
    #pragma unroll //loop unrolling, a CUDA compiler optimization

    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; //multiply-accumulate with the DCT basis matrix
        DCTv8matrixIndex += BLOCK_SIZE;
        CurBlockLocal1Index += BLOCK_SIZE; //index of the next pixel in the block (one row down)
    }

    CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; //intermediate result

    //synchronize threads to make sure the first 2 matrices are multiplied and the result is stored in the second block
    __syncthreads(); //wait until every element of the block has been computed before continuing

    //calculate the multiplication of (DCTv8matrixT * A) * DCTv8matrix and place it in the first block
    curelem = 0;
    int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0;
    DCTv8matrixIndex = 0 * BLOCK_SIZE + tx;
    #pragma unroll

    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex];
        CurBlockLocal2Index += 1;
        DCTv8matrixIndex += BLOCK_SIZE;
    }

    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = curelem;
    //!CurBlockLocal1 is reused: the buffer that held the input of the first multiplication now holds the output of the second.

    //synchronize threads to make sure the matrices are multiplied and the result is stored back in the first block
    __syncthreads();

    //copy current coefficient to its place in the result array
    Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx)] = CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx];
}
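DCTv8matrix is the 8x8 DCT basis matrix stored as a constant table in the sample. Given how the kernel indexes it (Dᵀ·A in the first pass, then ·D in the second), the entry at row n and column k is the standard orthonormal DCT-II coefficient a(k)·cos((2n+1)kπ/16). The following host-side sketch builds such a matrix and computes the same Dᵀ·A·D product, which can serve as a CPU reference for checking the kernel's output; the function names BuildDCTBasis and DCT8x8Reference are illustrative, not part of the sample.

#include <cmath>

// Build an 8x8 DCT basis matrix D with D[n][k] = a(k) * cos((2n+1)*k*pi/16),
// where a(0) = sqrt(1/8) and a(k) = sqrt(2/8) for k > 0.
static void BuildDCTBasis(float D[8][8])
{
    const float pi = 3.14159265358979f;
    for (int n = 0; n < 8; n++)
        for (int k = 0; k < 8; k++)
        {
            float a = (k == 0) ? sqrtf(1.0f / 8.0f) : sqrtf(2.0f / 8.0f);
            D[n][k] = a * cosf((2.0f * n + 1.0f) * k * pi / 16.0f);
        }
}

// CPU reference: Y = D^T * A * D, the same two-pass product the kernel performs.
static void DCT8x8Reference(const float A[8][8], float Y[8][8])
{
    float D[8][8], T[8][8];
    BuildDCTBasis(D);

    // first pass: T = D^T * A
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
        {
            float s = 0.0f;
            for (int i = 0; i < 8; i++)
                s += D[i][r] * A[i][c];
            T[r][c] = s;
        }

    // second pass: Y = T * D
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
        {
            float s = 0.0f;
            for (int i = 0; i < 8; i++)
                s += T[r][i] * D[i][c];
            Y[r][c] = s;
        }
}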
The sample also provides an in-place implementation in NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel2.cuh. "In-place" means the DCT is computed directly in the same variable (the same shared-memory buffer), without the second intermediate buffer used by kernel1, which makes it even faster. A minimal sketch of that idea follows.
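The sketch below is my own simplified illustration, not the optimized vector-per-thread kernel from dct8x8_kernel2.cuh: each tile lives in a single shared buffer, one thread transforms one 8-element row or column into registers and writes it back over itself, so no second shared buffer is needed. The names dct8_inplace and DCT8x8InPlaceSketch are hypothetical.

// 8-point orthonormal DCT-II of v[0], v[stride], ..., v[7*stride], written back in place.
__device__ void dct8_inplace(float *v, int stride)
{
    const float pi = 3.14159265358979f;
    float tmp[8];                                    // registers hold the result during the transform
    for (int k = 0; k < 8; k++)
    {
        float a = (k == 0) ? sqrtf(1.0f / 8.0f) : sqrtf(2.0f / 8.0f);
        float s = 0.0f;
        for (int n = 0; n < 8; n++)
            s += v[n * stride] * __cosf((2.0f * n + 1.0f) * k * pi / 16.0f);
        tmp[k] = a * s;
    }
    for (int k = 0; k < 8; k++)
        v[k * stride] = tmp[k];                      // overwrite the same vector: in place
}

// Hypothetical kernel: one 8x8 thread block per image tile, a single shared buffer.
__global__ void DCT8x8InPlaceSketch(float *Img, int stride)
{
    __shared__ float tile[8][8];
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int x = blockIdx.x * 8 + tx;
    const int y = blockIdx.y * 8 + ty;

    tile[ty][tx] = Img[y * stride + x];              // load the tile (the sample reads through a texture instead)
    __syncthreads();

    if (ty == 0) dct8_inplace(&tile[tx][0], 1);      // 8 threads: one per row
    __syncthreads();

    if (ty == 0) dct8_inplace(&tile[0][tx], 8);      // 8 threads: one per column
    __syncthreads();

    Img[y * stride + x] = tile[ty][tx];              // write the transformed tile back
}

The real kernel2 replaces the inner cosine loops with a factorized 8-point butterfly, pads the shared tile to avoid bank conflicts, and processes several tiles per thread block; the sketch only shows why a single shared buffer is enough.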