CUDA使用纹理内存

最新推荐文章于 2024-03-01 23:37:58 发布

wozhengtao

最新推荐文章于 2024-03-01 23:37:58 发布

阅读量1.9k

点赞数

纹理内存位于设备端，global memory也位于设备端，但是texture memory的访问速度较global memory要快。

因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory，否则访问texture cache具有很小的延迟。

另外，texture cache的2D定位已经进行了优化，对于同一线程束的线程访问位置临近的texture memory时效率非常高。

还有，texture memory的stream fetching 也进行了优化，所以即使cache没有命中，对texture memory的访问延迟也不会很高。

初始化纹理内存：

[cpp]view plaincopy 
   
 texture<uchar, 3, cudaReadModeNormalizedFloat> tex;  
 cudaArray *d_volumeArray = 0;  
   
 extern "C"  
 void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)  
 {  
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();  
   
     cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));  
   
     cudaMemcpy3DParms copyParams = {0};  
     copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);  
     copyParams.dstArray = d_volumeArray;  
     copyParams.extent   = volumeSize;  
     copyParams.kind     = cudaMemcpyHostToDevice;  
     cutilSafeCall(cudaMemcpy3D(©Params));  
   
     tex.normalized = true;  
     tex.filterMode = cudaFilterModeLinear;  
     tex.addressMode[0] = cudaAddressModeWrap;  
     tex.addressMode[1] = cudaAddressModeWrap;  
     tex.addressMode[2] = cudaAddressModeWrap;  
   
     cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));  
 }  

在上述代码中已经将h_volume中的数据拷贝到设备端的d_volumeArray中，然后又将其绑定到一个纹理内存。
下面在kernel函数中，对纹理进行访问，并将数据保存到PBO中，然后绘制。PBO的使用在前面已经写过了，可以参考之。

[cpp]view plaincopy 
   
 __global__ void kernel(uint *d_output, uint imageW, uint imageH, float w)  
 {  
     uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;  
     uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;  
   
     float u = x / (float)imageW;  
     float v = y / (float)imageH;  
   
     float voxel = tex3D(tex, u, v, w);  
       
   
     if ((x < imageW) && (y < imageH))  
     {  
         uint i = __umul24(y, imageW) + x;  
         d_output[i] = voxel * 255;  
   
 //      CUPRINTF("%d  ", d_output[i]);  
     }  
 }  

代码：

[cpp]view plaincopy 
   
 //main.cpp  
   
 #include <gl/glew.h>  
 #include <cuda_runtime.h>  
 #include <cutil_inline.h>  
 #include <cutil_gl_inline.h>  
 #include <cutil_gl_error.h>  
 #include <rendercheck_gl.h>  
   
 typedef unsigned int  uint;  
 typedef unsigned char uchar;  
   
 unsigned int window_width = 512;  
 unsigned int window_height = 512;  
   
 unsigned int timer = 0;  
   
 bool animFlag = true;  
 float animTime = 0.0;  
 float animInc  = 0.1;  
   
 float w = 0.5;  
   
 GLuint pbo = NULL;  
 struct cudaGraphicsResource *cuda_pbo_resource;  
   
 cudaExtent volumeSize = make_cudaExtent(32, 32, 32);  
   
 extern "C"  
 void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize);  
   
 extern "C"  
 void launch_kernel(uint *d_output, uint imageW, uint imageH, float w);  
   
 void createPBO(GLuint *pbo)  
 {  
     if(pbo)  
     {  
         glGenBuffers(1, pbo);  
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);  
         glBufferData(GL_PIXEL_UNPACK_BUFFER, window_width*window_height*sizeof(GLubyte)*4, 0, GL_STREAM_DRAW);  
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);  
   
         //cudaGLRegisterBufferObject(*pbo);  
         cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);  
     }  
   
 }  
   
 void deletePBO(GLuint *pbo)  
 {  
     if (pbo)  
     {  
         //  cudaGLUnregisterBufferObject(*pbo);  
         cudaGraphicsUnregisterResource(cuda_pbo_resource);  
   
         glBindBuffer(GL_ARRAY_BUFFER, *pbo);  
         glDeleteBuffers(1, pbo);  
   
         *pbo = NULL;  
     }  
 }  
   
 void cleanupCuda()  
 {  
     if(pbo) deletePBO(&pbo);  
 }  
   
 void runCuda()  
 {  
   
     unsigned int *d_output = NULL;  
     size_t num_bytes;  
   
     //  cudaGLMapBufferObject((void**)&d_output, pbo);  
     cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);  
     cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource);  
   
     launch_kernel(d_output, window_width, window_height, w);  
   
     cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);  
     //  cudaGLUnmapBufferObject(pbo);  
   
   
 }  
   
 uchar *loadVolumeData(const char *filename)  
 {  
     size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;  
   
     FILE *fp = fopen(filename, "rb");  
     if (!fp)  
     {  
         fprintf(stderr, "Error openging file '%s'\n", filename);  
         return 0;  
     }  
   
     uchar *data = (uchar *)malloc(size);  
     size_t read = fread(data, 1, size, fp);  
     fclose(fp);  
   
     printf("Read '%s', %lu bytes\n", filename, read);  
   
     return data;  
   
 }  
   
   
 void initCuda(int argc, char **argv)  
 {  
     if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))  
         cutilGLDeviceInit(argc, argv);  
     else  
         cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());  
   
     createPBO(&pbo);  
   
     uchar *data = loadVolumeData("Bucky.raw");  
   
     initCudaTexture(data, volumeSize);  
   
     atexit(cleanupCuda);  
   
     runCuda();  
 }  
   
 void computeFPS()  
 {  
     static int fpsCount = 0;  
     static int fpsLimit = 100;  
   
     fpsCount++;  
   
     if(fpsCount == fpsLimit)  
     {  
         char fps[256];  
         float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);  
         sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps);  
   
         glutSetWindowTitle(fps);  
         fpsCount = 0;  
   
         cutilCheckError(cutResetTimer(timer));    
     }  
 }  
   
 void display()  
 {  
     runCuda();  
   
     glClear(GL_COLOR_BUFFER_BIT);  
   
     glDisable(GL_DEPTH_TEST);  
     glRasterPos2i(0, 0);  
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);  
     glDrawPixels(window_width, window_height, GL_RGBA, GL_UNSIGNED_BYTE, 0);  
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);  
   
     glutSwapBuffers();  
     glutReportErrors();  
     /*if (animFlag) 
     { 
          
         animTime += animInc; 
     }*/  
 //  glutPostRedisplay();  
 }  
   
 void fpsDisplay()  
 {  
     cutilCheckError(cutStartTimer(timer));  
   
     display();  
   
     cutilCheckError(cutStopTimer(timer));  
   
     computeFPS();  
 }  
   
 void keyboard(unsigned char key, int x, int y)  
 {  
   
 }  
   
 void idle()  
 {  
     if (animFlag)  
     {  
 //      animTime += animInc;  
         w += 0.01f;  
         glutPostRedisplay();  
     }  
 }  
   
 void reshape(int x, int y)  
 {  
     glViewport(0, 0, x, y);  
   
     glMatrixMode(GL_MODELVIEW);  
     glLoadIdentity();  
   
     glMatrixMode(GL_PROJECTION);  
     glLoadIdentity();  
     glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);   
 }  
   
 CUTBoolean initGL(int argc, char **argv)  
 {  
     glutInit(&argc, argv);  
     glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);  
     glutInitWindowSize(window_width, window_height);  
     glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)");  
   
     glutDisplayFunc(fpsDisplay);  
     glutKeyboardFunc(keyboard);  
     glutReshapeFunc(reshape);  
     glutIdleFunc(idle);  
   
     glewInit();  
     if(!glewIsSupported("GL_VERSION_2_0"))  
     {  
         fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing.");  
         return CUTFalse;  
     }  
   
     glClearColor(0.0, 0.0, 0.0, 1.0);  
     glDisable(GL_DEPTH_TEST);  
   
     glViewport(0, 0, window_width, window_height);  
   
     glMatrixMode(GL_PROJECTION);  
     glLoadIdentity();  
     gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1, 10.0);  
   
     return CUTTrue;  
 }  
   
   
   
   
 int main(int argc, char **argv)  
 {  
   
     cutilCheckError(cutCreateTimer(&timer));  
   
     if(CUTFalse == initGL(argc, argv))  
         return CUTFalse;  
   
     initCuda(argc, argv);  
     CUT_CHECK_ERROR_GL();  
   
     runCuda();  
   
     glutDisplayFunc(fpsDisplay);  
     glutKeyboardFunc(keyboard);  
     glutIdleFunc(idle);  
   
     glutMainLoop();  
   
     cudaThreadExit();           ///  
     cutilExit(argc, argv);      //  
   
 }  

[cpp]view plaincopy 
   
 //kernelTexture.cu  
   
 #include <stdio.h>  
 #include <cutil_inline.h>  
 #include <cutil_math.h>  
 #include "cuPrintf.cu"  
   
 //The macro CUPRINTF is defined for architectures  
 //with different compute capabilities.  
 #if __CUDA_ARCH__ < 200  //Compute capability 1.x architectures  
 #define CUPRINTF cuPrintf   
 #else                       //Compute capability 2.x architectures  
 #define CUPRINTF(fmt, ...) printf("[%d, %d]:\t" fmt, \  
     blockIdx.y*gridDim.x+blockIdx.x,\  
     threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\  
     __VA_ARGS__)  
 #endif  
   
   
 typedef unsigned int  uint;  
 typedef unsigned char uchar;  
   
 texture<uchar, 3, cudaReadModeNormalizedFloat> tex;  
 cudaArray *d_volumeArray = 0;  
   
   
 __global__ void kernel(uint *d_output, uint imageW, uint imageH, float w)  
 {  
     uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;  
     uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;  
   
     float u = x / (float)imageW;  
     float v = y / (float)imageH;  
   
     float voxel = tex3D(tex, u, v, w);  
       
   
     if ((x < imageW) && (y < imageH))  
     {  
         uint i = __umul24(y, imageW) + x;  
         d_output[i] = voxel * 255;  
   
 //      CUPRINTF("%d  ", d_output[i]);  
     }  
 }  
   
   
 extern "C"  
 void launch_kernel(uint *d_output, uint imageW, uint imageH, float w)  
 {  
     dim3 blockSize(16, 16, 1);  
     dim3 gridSize(imageW/blockSize.x, imageH/blockSize.y);  
   
     kernel<<<gridSize, blockSize>>>(d_output, imageW, imageH, w);  
   
       
 }  
   
   
 extern "C"  
 void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)  
 {  
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();  
   
     cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));  
   
     cudaMemcpy3DParms copyParams = {0};  
     copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);  
     copyParams.dstArray = d_volumeArray;  
     copyParams.extent   = volumeSize;  
     copyParams.kind     = cudaMemcpyHostToDevice;  
     cutilSafeCall(cudaMemcpy3D(©Params));  
   
     tex.normalized = true;  
     tex.filterMode = cudaFilterModeLinear;  
     tex.addressMode[0] = cudaAddressModeWrap;  
     tex.addressMode[1] = cudaAddressModeWrap;  
     tex.addressMode[2] = cudaAddressModeWrap;  
   
     cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));  
 }  

运行结果：

参考自CUDA SDK

wozhengtao

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
CUDA使用纹理内存

纹理内存位于设备端，global memory也位于设备端，但是texture memory的访问速度较global memory要快。因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory，否则访问texture cache具有很小的延迟。另外，texture cache的2D定位已经进行了优化，对于同一线程束的线程访问位置临近的texture
复制链接

扫一扫