纹理内存位于设备端,global memory也位于设备端,但是texture memory的访问速度较global memory要快。
因为纹理内存有cache, 只有当cache没有命中的时候才会去访问device memory,否则访问texture cache具有很小的延迟。
另外,texture cache的2D定位已经进行了优化,对于同一线程束的线程访问位置临近的texture memory时效率非常高。还有,texture memory的stream fetching 也进行了优化,所以即使cache没有命中,对texture memory的访问延迟也不会很高。
初始化纹理内存:
- texture<uchar, 3, cudaReadModeNormalizedFloat> tex;
- cudaArray *d_volumeArray = 0;
- extern "C"
- void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
- {
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
- cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
- cudaMemcpy3DParms copyParams = {0};
- copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
- copyParams.dstArray = d_volumeArray;
- copyParams.extent = volumeSize;
- copyParams.kind = cudaMemcpyHostToDevice;
- cutilSafeCall(cudaMemcpy3D(©Params));
- tex.normalized = true;
- tex.filterMode = cudaFilterModeLinear;
- tex.addressMode[0] = cudaAddressModeWrap;
- tex.addressMode[1] = cudaAddressModeWrap;
- tex.addressMode[2] = cudaAddressModeWrap;
- cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
- }
在上述代码中已经将h_volume中的数据拷贝到设备端的d_volumeArray中,然后又将其绑定到一个纹理内存。
下面在kernel函数中,对纹理进行访问,并将数据保存到PBO中,然后绘制。PBO的使用在前面已经写过了,可以参考之。
- __global__ void kernel(uint *d_output, uint imageW, uint imageH, float w)
- {
- uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
- uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
- float u = x / (float)imageW;
- float v = y / (float)imageH;
- float voxel = tex3D(tex, u, v, w);
- if ((x < imageW) && (y < imageH))
- {
- uint i = __umul24(y, imageW) + x;
- d_output[i] = voxel * 255;
- // CUPRINTF("%d ", d_output[i]);
- }
- }
代码:
- //main.cpp
- #include <gl/glew.h>
- #include <cuda_runtime.h>
- #include <cutil_inline.h>
- #include <cutil_gl_inline.h>
- #include <cutil_gl_error.h>
- #include <rendercheck_gl.h>
- typedef unsigned int uint;
- typedef unsigned char uchar;
- unsigned int window_width = 512;
- unsigned int window_height = 512;
- unsigned int timer = 0;
- bool animFlag = true;
- float animTime = 0.0;
- float animInc = 0.1;
- float w = 0.5;
- GLuint pbo = NULL;
- struct cudaGraphicsResource *cuda_pbo_resource;
- cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
- extern "C"
- void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize);
- extern "C"
- void launch_kernel(uint *d_output, uint imageW, uint imageH, float w);
- void createPBO(GLuint *pbo)
- {
- if(pbo)
- {
- glGenBuffers(1, pbo);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
- glBufferData(GL_PIXEL_UNPACK_BUFFER, window_width*window_height*sizeof(GLubyte)*4, 0, GL_STREAM_DRAW);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- //cudaGLRegisterBufferObject(*pbo);
- cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);
- }
- }
- void deletePBO(GLuint *pbo)
- {
- if (pbo)
- {
- // cudaGLUnregisterBufferObject(*pbo);
- cudaGraphicsUnregisterResource(cuda_pbo_resource);
- glBindBuffer(GL_ARRAY_BUFFER, *pbo);
- glDeleteBuffers(1, pbo);
- *pbo = NULL;
- }
- }
- void cleanupCuda()
- {
- if(pbo) deletePBO(&pbo);
- }
- void runCuda()
- {
- unsigned int *d_output = NULL;
- size_t num_bytes;
- // cudaGLMapBufferObject((void**)&d_output, pbo);
- cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
- cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource);
- launch_kernel(d_output, window_width, window_height, w);
- cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
- // cudaGLUnmapBufferObject(pbo);
- }
- uchar *loadVolumeData(const char *filename)
- {
- size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
- FILE *fp = fopen(filename, "rb");
- if (!fp)
- {
- fprintf(stderr, "Error openging file '%s'\n", filename);
- return 0;
- }
- uchar *data = (uchar *)malloc(size);
- size_t read = fread(data, 1, size, fp);
- fclose(fp);
- printf("Read '%s', %lu bytes\n", filename, read);
- return data;
- }
- void initCuda(int argc, char **argv)
- {
- if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
- cutilGLDeviceInit(argc, argv);
- else
- cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
- createPBO(&pbo);
- uchar *data = loadVolumeData("Bucky.raw");
- initCudaTexture(data, volumeSize);
- atexit(cleanupCuda);
- runCuda();
- }
- void computeFPS()
- {
- static int fpsCount = 0;
- static int fpsLimit = 100;
- fpsCount++;
- if(fpsCount == fpsLimit)
- {
- char fps[256];
- float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);
- sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps);
- glutSetWindowTitle(fps);
- fpsCount = 0;
- cutilCheckError(cutResetTimer(timer));
- }
- }
- void display()
- {
- runCuda();
- glClear(GL_COLOR_BUFFER_BIT);
- glDisable(GL_DEPTH_TEST);
- glRasterPos2i(0, 0);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
- glDrawPixels(window_width, window_height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- glutSwapBuffers();
- glutReportErrors();
- /*if (animFlag)
- {
- animTime += animInc;
- }*/
- // glutPostRedisplay();
- }
- void fpsDisplay()
- {
- cutilCheckError(cutStartTimer(timer));
- display();
- cutilCheckError(cutStopTimer(timer));
- computeFPS();
- }
- void keyboard(unsigned char key, int x, int y)
- {
- }
- void idle()
- {
- if (animFlag)
- {
- // animTime += animInc;
- w += 0.01f;
- glutPostRedisplay();
- }
- }
- void reshape(int x, int y)
- {
- glViewport(0, 0, x, y);
- glMatrixMode(GL_MODELVIEW);
- glLoadIdentity();
- glMatrixMode(GL_PROJECTION);
- glLoadIdentity();
- glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
- }
- CUTBoolean initGL(int argc, char **argv)
- {
- glutInit(&argc, argv);
- glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
- glutInitWindowSize(window_width, window_height);
- glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)");
- glutDisplayFunc(fpsDisplay);
- glutKeyboardFunc(keyboard);
- glutReshapeFunc(reshape);
- glutIdleFunc(idle);
- glewInit();
- if(!glewIsSupported("GL_VERSION_2_0"))
- {
- fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing.");
- return CUTFalse;
- }
- glClearColor(0.0, 0.0, 0.0, 1.0);
- glDisable(GL_DEPTH_TEST);
- glViewport(0, 0, window_width, window_height);
- glMatrixMode(GL_PROJECTION);
- glLoadIdentity();
- gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1, 10.0);
- return CUTTrue;
- }
- int main(int argc, char **argv)
- {
- cutilCheckError(cutCreateTimer(&timer));
- if(CUTFalse == initGL(argc, argv))
- return CUTFalse;
- initCuda(argc, argv);
- CUT_CHECK_ERROR_GL();
- runCuda();
- glutDisplayFunc(fpsDisplay);
- glutKeyboardFunc(keyboard);
- glutIdleFunc(idle);
- glutMainLoop();
- cudaThreadExit(); ///
- cutilExit(argc, argv); //
- }
- //kernelTexture.cu
- #include <stdio.h>
- #include <cutil_inline.h>
- #include <cutil_math.h>
- #include "cuPrintf.cu"
- //The macro CUPRINTF is defined for architectures
- //with different compute capabilities.
- #if __CUDA_ARCH__ < 200 //Compute capability 1.x architectures
- #define CUPRINTF cuPrintf
- #else //Compute capability 2.x architectures
- #define CUPRINTF(fmt, ...) printf("[%d, %d]:\t" fmt, \
- blockIdx.y*gridDim.x+blockIdx.x,\
- threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\
- __VA_ARGS__)
- #endif
- typedef unsigned int uint;
- typedef unsigned char uchar;
- texture<uchar, 3, cudaReadModeNormalizedFloat> tex;
- cudaArray *d_volumeArray = 0;
- __global__ void kernel(uint *d_output, uint imageW, uint imageH, float w)
- {
- uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
- uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
- float u = x / (float)imageW;
- float v = y / (float)imageH;
- float voxel = tex3D(tex, u, v, w);
- if ((x < imageW) && (y < imageH))
- {
- uint i = __umul24(y, imageW) + x;
- d_output[i] = voxel * 255;
- // CUPRINTF("%d ", d_output[i]);
- }
- }
- extern "C"
- void launch_kernel(uint *d_output, uint imageW, uint imageH, float w)
- {
- dim3 blockSize(16, 16, 1);
- dim3 gridSize(imageW/blockSize.x, imageH/blockSize.y);
- kernel<<<gridSize, blockSize>>>(d_output, imageW, imageH, w);
- }
- extern "C"
- void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
- {
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
- cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
- cudaMemcpy3DParms copyParams = {0};
- copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
- copyParams.dstArray = d_volumeArray;
- copyParams.extent = volumeSize;
- copyParams.kind = cudaMemcpyHostToDevice;
- cutilSafeCall(cudaMemcpy3D(©Params));
- tex.normalized = true;
- tex.filterMode = cudaFilterModeLinear;
- tex.addressMode[0] = cudaAddressModeWrap;
- tex.addressMode[1] = cudaAddressModeWrap;
- tex.addressMode[2] = cudaAddressModeWrap;
- cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
- }
运行结果:
参考自CUDA SDK