CUDA sample源码分析:dct8x8

学习共享内存+纹理内存的使用方法

dct8x8.cu

float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
    // Runs the texture-based pipeline forward-DCT -> quantization -> inverse-DCT
    // on the GPU and returns the average wall-clock time (ms) of the forward
    // DCT kernel over BENCHMARK_SIZE runs.
    //
    // ImgSrc / ImgDst: host byte planes sharing row stride `Stride` (bytes).
    // Size: processed region in pixels. The grid computation below does plain
    // integer division, so width/height are assumed to be multiples of
    // BLOCK_SIZE — there is no tail handling.
    //
    // NOTE(review): this uses the legacy texture-reference API
    // (cudaBindTextureToArray / cudaUnbindTexture), which is deprecated and
    // removed in CUDA 12 — confirm the target toolkit version.

    //prepare channel format descriptor for passing texture into kernels
    cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

    //allocate device memory
    cudaArray *Src;   // a cudaArray, because the source is bound to a texture below
    float *Dst;    
    size_t DstStride;
    checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
    checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height));   // pitched allocation: rows are padded so each row start is aligned
    DstStride /= sizeof(float);  // convert the pitch from bytes to float elements

    //convert source image to float representation
    int ImgSrcFStride;
    float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);   // host float plane; stride returned in elements
    CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); // byte -> float conversion
    AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);  // level shift: center byte values around zero before the DCT

    //copy from host memory to device
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        ImgSrcF, ImgSrcFStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyHostToDevice));// upload the float plane into the cudaArray

    //setup execution parameters
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);  // one thread per pixel of an 8x8 block
    dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);  // one thread block per image block

    //create and start CUDA timer
    StopWatchInterface *timerCUDA = 0;
    sdkCreateTimer(&timerCUDA);
    sdkResetTimer(&timerCUDA);

    //execute DCT kernel and benchmark
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));  
    // Bind the texture for fast access to block neighborhoods: the rows of an
    // 8x8 block are non-contiguous in memory — a scattered access pattern
    // that plain global loads would not coalesce, costing extra memory
    // transactions; the texture cache absorbs that pattern.

    for (int i=0; i<BENCHMARK_SIZE; i++)  // repeat BENCHMARK_SIZE times and report the average
    {
        sdkStartTimer(&timerCUDA);
        CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0); // source pixels come in via the global texture TexSrc, not a pointer argument
        checkCudaErrors(cudaDeviceSynchronize());  // launches are asynchronous; sync so the timer measures the full kernel run
        sdkStopTimer(&timerCUDA);
    }

    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    // finalize CUDA timer
    float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA);
    sdkDeleteTimer(&timerCUDA);

    // execute Quantization kernel (quantizes the coefficients in place in Dst)
    CUDAkernelQuantizationFloat<<< grid, threads >>>(Dst, (int) DstStride);
    getLastCudaError("Kernel execution failed");

    // copy quantized coefficients from the pitched device buffer back into the
    // cudaArray so the IDCT kernel can read them through the texture
    // (device-to-device copy; stream ordering guarantees the quantization
    // kernel above has finished before this copy runs)
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        Dst, DstStride *sizeof(float),
                                        Size.width *sizeof(float), Size.height,
                                        cudaMemcpyDeviceToDevice));

    // execute IDCT kernel (reads the quantized coefficients through TexSrc)
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
    checkCudaErrors(cudaUnbindTexture(TexSrc));
    getLastCudaError("Kernel execution failed");

    //copy reconstructed image plane back to the host
    checkCudaErrors(cudaMemcpy2D(ImgSrcF, ImgSrcFStride *sizeof(float),
                                 Dst, DstStride *sizeof(float),
                                 Size.width *sizeof(float), Size.height,
                                 cudaMemcpyDeviceToHost));

    //convert image back to byte representation
    AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);  // undo the level shift applied before the DCT
    CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

    //clean up memory
    checkCudaErrors(cudaFreeArray(Src));
    checkCudaErrors(cudaFree(Dst));
    FreePlane(ImgSrcF);

    //return time taken by the operation
    return TimerCUDASpan;
}

NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel1.cuh

__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Computes the 2D DCT of one 8x8 image block per thread block as the
    // matrix product DCTv8matrix^T * A * DCTv8matrix, staged through the
    // shared-memory scratch blocks CurBlockLocal1/CurBlockLocal2.
    //
    // Expected launch: blockDim == (BLOCK_SIZE, BLOCK_SIZE); the source image
    // must already be bound to the global texture TexSrc. The Offset*Blocks
    // arguments shift the grid origin in units of whole blocks.

    // Coordinates of the 8x8 tile handled by this thread block.
    const int blockX = blockIdx.x + OffsetXBlocks;
    const int blockY = blockIdx.y + OffsetYBlocks;

    // This thread's position inside the tile and in the full image
    // (<< BLOCK_SIZE_LOG2 multiplies by the block edge length).
    const int col = threadIdx.x;
    const int row = threadIdx.y;
    const int pixelX = (blockX << BLOCK_SIZE_LOG2) + col;
    const int pixelY = (blockY << BLOCK_SIZE_LOG2) + row;

    // Stage the source tile A in shared memory; +0.5f addresses texel centers.
    // The 2D texture handles this scattered (per-row non-contiguous) read.
    CurBlockLocal1[(row << BLOCK_SIZE_LOG2) + col] =
        tex2D(TexSrc, (float)pixelX + 0.5f, (float)pixelY + 0.5f);

    // The whole tile must be resident before any thread starts multiplying.
    __syncthreads();

    // First product T = DCTv8matrix^T * A:
    //   T[row][col] = sum_k DCTv8matrix[k][row] * A[k][col]
    float acc = 0;
#pragma unroll   // fixed trip count: ask the compiler to fully unroll

    for (int k = 0; k < BLOCK_SIZE; k++)
    {
        acc += DCTv8matrix[k * BLOCK_SIZE + row] * CurBlockLocal1[k * BLOCK_SIZE + col];
    }

    CurBlockLocal2[(row << BLOCK_SIZE_LOG2) + col] = acc;

    // All of T must be written before any thread reads it in the second pass.
    __syncthreads();

    // Second product R = T * DCTv8matrix:
    //   R[row][col] = sum_k T[row][k] * DCTv8matrix[k][col]
    acc = 0;
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; k++)
    {
        acc += CurBlockLocal2[(row << BLOCK_SIZE_LOG2) + k] * DCTv8matrix[k * BLOCK_SIZE + col];
    }

    // Reuse the first scratch block for the final result — the staged input
    // is no longer needed after the first product.
    CurBlockLocal1[(row << BLOCK_SIZE_LOG2) + col] = acc;

    // Make sure every coefficient is stored before the tile is written out.
    __syncthreads();

    // Write this thread's coefficient to the pitched output buffer.
    Dst[FMUL(pixelY, ImgWidth) + pixelX] = CurBlockLocal1[(row << BLOCK_SIZE_LOG2) + col];
}

另外还给出了一个in-place版本的实现 NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel2.cuh,所谓的in-place是指计算dct的过程是直接在同一个变量(共享内存)上进行的,这样就更加快了

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值