CUDA中纹理Texture的使用
//main.cu
#include <stdio.h>
extern "C" void Blend_GPU( unsigned char* aImg1, unsigned char* aImg2, unsigned char* aImg3, int width, int height, int channel );// Forward declaration (C linkage) of Blend_GPU, which main() calls to run the blend on the GPU
// CPU reference implementation of the blend, used to cross-check the GPU result.
//
// Writes the per-element average of the two inputs into aImg3:
//     aImg3[i] = 0.5 * aImg1[i] + 0.5 * aImg2[i]   (fraction truncated)
// which is the same expression the Blending_Texture kernel evaluates.
//
// NOTE(review): the original listing truncated this signature and left the
// body empty; the parameter list was reconstructed from the call in main()
// and from the parallel Blend_GPU declaration -- confirm against the
// original source if it is available.
//
// aImg1, aImg2 : input buffers, each holding width*height*channel bytes
// aImg3        : output buffer of the same size
// width, height, channel : image dimensions; only their product is used
void Blend_CPU( unsigned char* aImg1, unsigned char* aImg2, unsigned char* aImg3, int width, int height, int channel )
{
    const int size = width * height * channel;
    for( int i = 0; i < size; ++i )
    {
        // 0.5f * x is exact for 8-bit inputs, so the float sum is exact and
        // the cast simply drops a possible trailing .5
        aImg3[i] = (unsigned char)( 0.5f * aImg1[i] + 0.5f * aImg2[i] );
    }
}
// Program entry point: builds two constant test images, blends them on the
// CPU and on the GPU, and reports whether the two results agree.
// Returns 0 when the results match and 1 when they differ.
int main( int argc, char** argv )
{
    (void)argc;
    (void)argv;

    // NOTE(review): the original listing never declared width/height/channel;
    // the values below are illustrative -- substitute the real image size.
    const int width   = 512;
    const int height  = 512;
    const int channel = 3;
    const int size    = width * height * channel;

    // Two input buffers and two result buffers (CPU result, GPU result),
    // all of unsigned char elements.
    unsigned char *aImg1 = new unsigned char[ size ],
                  *aImg2 = new unsigned char[ size ],
                  *aRS1  = new unsigned char[ size ],
                  *aRS2  = new unsigned char[ size ];

    // Initialise the inputs: aImg1 is all 0, aImg2 is all 200.
    for( int i = 0; i < size; ++i )
    {
        aImg1[i] = 0;
        aImg2[i] = 200;
    }

    // Run the CPU reference implementation.
    Blend_CPU( aImg1, aImg2, aRS1, width, height, channel );
    // Run the GPU implementation; Blend_GPU launches the kernel internally.
    Blend_GPU( aImg1, aImg2, aRS2, width, height, channel );

    // Compare the CPU and GPU results element by element and report the
    // first difference, if any.
    int mismatch = -1;
    for( int i = 0; i < size; ++i )
    {
        if( aRS1[i] != aRS2[i] )
        {
            mismatch = i;
            break;
        }
    }
    if( mismatch >= 0 )
        printf( "Error: CPU and GPU results differ at index %d (CPU=%d, GPU=%d)\n",
                mismatch, (int)aRS1[mismatch], (int)aRS2[mismatch] );
    else
        printf( "CPU and GPU results match.\n" );

    // Release the host buffers (the original listing leaked them).
    delete[] aImg1;
    delete[] aImg2;
    delete[] aRS1;
    delete[] aRS2;

    return mismatch >= 0 ? 1 : 0;
}
//blend_gpu.cu
// Number of threads per block used for the kernel launch in Blend_GPU.
#define BLOCK_DIM 512
// Texture references for the two input buffers. A texture reference is
// essentially a handle: Blend_GPU binds each one to a device buffer and the
// kernel reads through it with tex1Dfetch.
// NOTE(review): texture references are a legacy API (deprecated in newer
// CUDA toolkits and removed in CUDA 12); building with a current toolkit
// would require migrating to texture objects -- confirm the target toolkit.
texture<unsigned char, 1, cudaReadModeElementType> rT1;
texture<unsigned char, 1, cudaReadModeElementType> rT2;
// Forward declaration of the host entry point, exported with C linkage.
extern "C" void Blend_GPU( unsigned char* aImg1, unsigned char* aImg2,unsigned char* aRS,int width, int height, int channel );
// Kernel executed on the GPU: writes the average of the two bound textures.
//
// Launch layout : 1D grid of 1D blocks, one thread per output element; the
//                 grid must supply at least `size` threads. Surplus threads
//                 in the last block are masked by the bounds check.
// Preconditions : rT1 and rT2 are bound to linear device buffers holding at
//                 least `size` unsigned char elements before the launch.
// aRS  : device output buffer with room for `size` elements
// size : number of elements to process
__global__ void Blending_Texture( unsigned char* aRS, int size )
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if( index < size )
    {
        // float literals keep the arithmetic in single precision (a bare 0.5
        // promotes everything to double); halving an 8-bit value is exact in
        // float, so the truncated result is unchanged.
        aRS[index] = (unsigned char)( 0.5f * tex1Dfetch( rT1, index )
                                    + 0.5f * tex1Dfetch( rT2, index ) );
    }
}
// Reports a failed CUDA runtime call; returns true when `status` is success.
static bool Blend_CheckCuda( cudaError_t status, const char* what )
{
    if( status == cudaSuccess )
        return true;
    printf( "Blend_GPU: %s failed: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

// Host wrapper: blends two host images on the GPU and copies the result back.
//
// Allocates three device buffers, uploads aImg1/aImg2, binds them to the
// texture references rT1/rT2, launches Blending_Texture, downloads the result
// into aRS, then unbinds the textures and frees the device buffers.
//
// aImg1, aImg2 : host input buffers of width*height*channel bytes each
// aRS          : host output buffer of the same size
// width, height, channel : image dimensions; only their product is used
//
// On any CUDA error a message is printed, the remaining steps are skipped,
// and aRS is left unmodified; device resources are released in every case.
void Blend_GPU( unsigned char* aImg1, unsigned char* aImg2,unsigned char* aRS,int width, int height, int channel )
{
    const int size = height * width * channel;
    if( size <= 0 )
        return;   // nothing to blend; also avoids a zero-block launch
    const size_t data_size = (size_t)size * sizeof( unsigned char );

    // Device buffers: two inputs and one output (the original listing used
    // these names without declaring them).
    unsigned char *dev_A = 0, *dev_B = 0, *dev_C = 0;
    bool boundT1 = false, boundT2 = false;

    // Allocate the three device buffers.
    bool ok = Blend_CheckCuda( cudaMalloc( (void**)&dev_A, data_size ), "cudaMalloc(dev_A)" )
           && Blend_CheckCuda( cudaMalloc( (void**)&dev_B, data_size ), "cudaMalloc(dev_B)" )
           && Blend_CheckCuda( cudaMalloc( (void**)&dev_C, data_size ), "cudaMalloc(dev_C)" );

    // Copy the input data from the host (CPU) to the device (GPU).
    ok = ok
      && Blend_CheckCuda( cudaMemcpy( dev_A, aImg1, data_size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_A)" )
      && Blend_CheckCuda( cudaMemcpy( dev_B, aImg2, data_size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_B)" );

    // Bind the texture references to the two device input buffers.
    if( ok )
        boundT1 = ok = Blend_CheckCuda( cudaBindTexture( 0, rT1, dev_A, data_size ), "cudaBindTexture(rT1)" );
    if( ok )
        boundT2 = ok = Blend_CheckCuda( cudaBindTexture( 0, rT2, dev_B, data_size ), "cudaBindTexture(rT2)" );

    if( ok )
    {
        // Integer ceil-division: the original float-based ceil() loses
        // precision once size exceeds 2^24 and can drop the final block.
        const int blocks = ( size - 1 ) / BLOCK_DIM + 1;
        Blending_Texture<<< blocks, BLOCK_DIM >>>( dev_C, size );

        // A launch does not return an error itself; query it explicitly. The
        // blocking copy back to the host then waits for the kernel and
        // surfaces any execution error.
        ok = Blend_CheckCuda( cudaGetLastError(), "Blending_Texture launch" )
          && Blend_CheckCuda( cudaMemcpy( aRS, dev_C, data_size, cudaMemcpyDeviceToHost ), "cudaMemcpy(aRS)" );
    }

    // Unbind whichever textures were successfully bound.
    if( boundT1 )
        cudaUnbindTexture( rT1 );
    if( boundT2 )
        cudaUnbindTexture( rT2 );

    // Release device memory; cudaFree on a null pointer is a no-op.
    cudaFree( dev_A );
    cudaFree( dev_B );
    cudaFree( dev_C );
}