cuda与opengl互操作之PBO

最新推荐文章于 2023-04-20 00:02:54 发布

wozhengtao

最新推荐文章于 2023-04-20 00:02:54 发布

阅读量1.4k

点赞数

PBO（像素缓冲区对象）也可以映射到CUDA地址空间，CUDA的kernel函数可以讲计算结果直接写到PBO中，然后将 PBO的内容复制到texture，进行绘制。

具体使用步骤：

1、创建PBO

[cpp]view plaincopy 
   
 // Generate a buffer ID called a PBO (Pixel Buffer Object)  
 glGenBuffers(1,pbo);  
 // Make this the current UNPACK buffer (OpenGL is state-based)  
 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);  
 // Allocate data for the buffer. 4-channel 8-bit image  
 glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);  

2、注册PBO

[cpp]view plaincopy 
   
 struct cudaGraphicsResource *cuda_pbo_resource;  
 cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);  

3、映射PBO

[cpp]view plaincopy 
   
 cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);  

[cpp]view plaincopy 
   
 cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource);  

4、使用

[cpp]view plaincopy 
   
 launch_kernel(d_output, window_width, window_height, w);  

5、解除映射

[cpp]view plaincopy 
   
 cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);  

6、解除注册

[cpp]view plaincopy 
   
 cudaGraphicsUnregisterResource(cuda_pbo_resource);  

7、删除PBO

[cpp]view plaincopy 
   
 glBindBuffer(GL_ARRAY_BUFFER, *pbo);  
 glDeleteBuffers(1, pbo);  

8、绘制

[cpp]view plaincopy 
   
 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);  
   
 glBindTexture(GL_TEXTURE_2D, textureID);  
   
 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 128, 128,/*window_width, window_height,*/   
         GL_RGBA, GL_UNSIGNED_BYTE, NULL);  
   
 glBegin(GL_QUADS);  
 glTexCoord2f(0.0f,1.0f); glVertex3f(0.0f,0.0f,0.0f);  
 glTexCoord2f(0.0f,0.0f); glVertex3f(0.0f,1.0f,0.0f);  
 glTexCoord2f(1.0f,0.0f); glVertex3f(1.0f,1.0f,0.0f);  
 glTexCoord2f(1.0f,1.0f); glVertex3f(1.0f,0.0f,0.0f);  
 glEnd();  

代码：

[cpp]view plaincopy 
   
 //myPBO.cpp  
   
 #include <gl/glew.h>  
 #include <cuda_runtime.h>  
 #include <cutil_inline.h>  
 #include <cutil_gl_inline.h>  
 #include <cutil_gl_error.h>  
 #include <rendercheck_gl.h>  
 #include <sdkHelper.h>  
   
 extern void initCuda(int argc, char **argv);  
 extern void runCuda();  
   
   
 unsigned int window_width = 500;  
 unsigned int window_height = 500;  
 unsigned int image_width = 128;//window_width;  
 unsigned int image_height = 128;//window_height;  
 unsigned int timer = 0;  
   
 int animFlag = 1;   
 float animTime = 0.0f;  
 float animInc = 0.1f;  
   
 GLuint pbo = NULL;  
 GLuint textureID = NULL;  
   
 struct cudaGraphicsResource *cuda_pbo_resource;  
   
 extern "C" void launch_kernel(uchar4* , unsigned int, unsigned int, float);  
   
 void createPBO(GLuint *pbo)  
 {  
     if (pbo)  
     {  
         int num_texels = image_width * image_height;  
         int num_values = num_texels * 4;  
   
         int size_tex_data = sizeof(GLubyte) * num_values;  
   
         glGenBuffers(1, pbo);  
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);  
         glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);  
   
         cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);  
     }  
 }  
   
 void deletePBO(GLuint *pbo)  
 {  
     if (pbo)  
     {  
         cudaGraphicsUnregisterResource(cuda_pbo_resource);  
   
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);  
         glDeleteBuffers(1, pbo);  
   
         *pbo = NULL;  
     }  
 }  
   
 void createTexture(GLuint *textureID, unsigned int size_x, unsigned int size_y)  
 {  
     glEnable(GL_TEXTURE_2D);  
   
     glGenTextures(1, textureID);  
     glBindTexture(GL_TEXTURE_2D, *textureID);  
   
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);  
   
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);  
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);  
 }  
   
 void deleteTexture(GLuint *tex)  
 {  
     glDeleteTextures(1, tex);  
   
     *tex = NULL;  
 }  
   
 void cleanupCuda()  
 {  
     if(pbo) deletePBO(&pbo);  
     if(textureID) deleteTexture(&textureID);  
 }  
   
 void runCuda()  
 {  
     uchar4 *dptr = NULL;  
     size_t num_bytes;  
   
     cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);  
     cudaGraphicsResourceGetMappedPointer((void**)&dptr, &num_bytes, cuda_pbo_resource);  
   
     launch_kernel(dptr, image_width, image_height, animTime);  
   
     cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);  
   
 }  
   
 void initCuda(int argc, char **argv)  
 {  
     if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))  
         cutilGLDeviceInit(argc, argv);  
     else  
         cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());  
   
     createPBO(&pbo);  
     createTexture(&textureID, image_width, image_height);  
   
     atexit(cleanupCuda);  
   
     runCuda();  
 }  
   
 void computeFPS()  
 {  
     static int fpsCount = 0;  
     static int fpsLimit = 100;  
   
     fpsCount++;  
   
     if(fpsCount == fpsLimit)  
     {  
         char fps[256];  
         float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);  
         sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps);  
   
         glutSetWindowTitle(fps);  
         fpsCount = 0;  
   
         cutilCheckError(cutResetTimer(timer));    
     }  
 }  
   
 void display()  
 {  
     runCuda();  
   
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);  
   
     glBindTexture(GL_TEXTURE_2D, textureID);  
   
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 128, 128,/*window_width, window_height,*/   
         GL_RGBA, GL_UNSIGNED_BYTE, NULL);  
   
     glBegin(GL_QUADS);  
     glTexCoord2f(0.0f,1.0f); glVertex3f(0.0f,0.0f,0.0f);  
     glTexCoord2f(0.0f,0.0f); glVertex3f(0.0f,1.0f,0.0f);  
     glTexCoord2f(1.0f,0.0f); glVertex3f(1.0f,1.0f,0.0f);  
     glTexCoord2f(1.0f,1.0f); glVertex3f(1.0f,0.0f,0.0f);  
     glEnd();  
   
   
     glutSwapBuffers();  
   
     if(animFlag) {  
         glutPostRedisplay();  
         animTime += animInc;  
     }  
 }  
   
 void fpsDisplay()  
 {  
     cutilCheckError(cutStartTimer(timer));  
   
     display();  
   
     cutilCheckError(cutStopTimer(timer));  
   
     computeFPS();  
 }  
   
   
 CUTBoolean initGL(int argc, char **argv)  
 {  
     glutInit(&argc, argv);  
     glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);  
     glutInitWindowSize(window_width, window_height);  
     glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)");  
   
     glutDisplayFunc(fpsDisplay);  
   
     glewInit();  
     if(!glewIsSupported("GL_VERSION_2_0"))  
     {  
         fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing.");  
         return CUTFalse;  
     }  
   
     glViewport(0, 0, window_width, window_height);  
   
     glClearColor(0.0, 0.0, 0.0, 1.0);  
     glDisable(GL_DEPTH_TEST);  
   
     glMatrixMode(GL_MODELVIEW);  
     glLoadIdentity();  
   
     glMatrixMode(GL_PROJECTION);  
     glLoadIdentity();  
   
     glOrtho(0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);  
   
     return CUTTrue;  
 }  
   
   
   
 int main(int argc, char **argv)  
 {  
     cutilCheckError(cutCreateTimer(&timer));  
   
     if (CUTFalse == initGL(argc, argv))  
         return CUTFalse;  
   
     initCuda(argc, argv);  
     CUT_CHECK_ERROR_GL();  
   
     glutDisplayFunc(fpsDisplay);  
   
     glutMainLoop();  
       
     cudaThreadExit();           ///  
     cutilExit(argc, argv);      //  
 }  

[cpp]view plaincopy 
   
 //kernelPBO.cu  
   
 #include <stdio.h>  
   
 float gain=0.75f;  
 float xStart=2.f;  
 float yStart=1.f;  
 float zOffset = 0.0f;  
 #define Z_PLANE 50.f  
   
 __constant__ unsigned char c_perm[256];  
 __shared__ unsigned char s_perm[256]; // shared memory copy of permuation array  
 unsigned char* d_perm=NULL; // global memory copy of permutation array  
 // host version of permutation array  
 const static unsigned char h_perm[] = {151,160,137,91,90,15,  
 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,  
 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,  
 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,  
 77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,  
 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,  
 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,  
 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,  
 223,183,170,213,119,248,152,2,44,154,163, 70,221,153,101,155,167, 43,172,9,  
 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,  
 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,  
 49,192,214, 31,181,199,106,157,184,84,204,176,115,121,50,45,127, 4,150,254,  
 138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180  
 };  
   
 __device__ inline int perm(int i) { return(s_perm[i&0xff]); }  
 __device__ inline float fade(float t) { return t * t * t * (t * (t * 6.f - 15.f) + 10.f); }  
 __device__ inline float lerpP(float t, float a, float b) { return a + t * (b - a); }  
 __device__ inline float grad(int hash, float x, float y, float z) {  
     int h = hash & 15;                      // CONVERT LO 4 BITS OF HASH CODE  
     float u = h<8 ? x : y,                 // INTO 12 GRADIENT DIRECTIONS.  
         v = h<4 ? y : h==12||h==14 ? x : z;  
     return ((h&1) == 0 ? u : -u) + ((h&2) == 0 ? v : -v);  
 }  
   
 __device__ float inoise(float x, float y, float z) {  
     int X = ((int)floorf(x)) & 255, // FIND UNIT CUBE THAT  
         Y = ((int)floorf(y)) & 255,   // CONTAINS POINT.  
         Z = ((int)floorf(z)) & 255;  
     x -= floorf(x);               // FIND RELATIVE X,Y,Z  
     y -= floorf(y);               // OF POINT IN CUBE.  
     z -= floorf(z);  
     float u = fade(x),            // COMPUTE FADE CURVES  
         v = fade(y),                // FOR EACH OF X,Y,Z.  
         w = fade(z);  
     int A = perm(X)+Y, AA = perm(A)+Z, AB = perm(A+1)+Z, // HASH COORDINATES OF  
         B = perm(X+1)+Y, BA = perm(B)+Z, BB = perm(B+1)+Z; // THE 8 CUBE CORNERS,  
   
     return lerpP(w, lerpP(v, lerpP(u, grad(perm(AA), x  , y  , z   ), // AND ADD  
         grad(perm(BA), x-1.f, y  , z   )),   // BLENDED  
         lerpP(u, grad(perm(AB), x  , y-1.f, z   ),    // RESULTS  
         grad(perm(BB), x-1.f, y-1.f, z   ))),     // FROM  8  
         lerpP(v, lerpP(u, grad(perm(AA+1), x  , y  , z-1.f ),  // CORNERS  
         grad(perm(BA+1), x-1.f, y  , z-1.f )),    // OF CUBE  
         lerpP(u, grad(perm(AB+1), x  , y-1.f, z-1.f ),  
         grad(perm(BB+1), x-1.f, y-1.f, z-1.f ))));  
 #ifdef ORIG  
     return(perm(X));  
 #endif  
   
 }  
   
 __device__ inline float height2d(float x, float y, int octaves,  
                                  float lacunarity = 2.0f, float gain = 0.5f)  
 {  
     float freq = 1.0f, amp = 0.5f;  
     float sum = 0.f;    
     for(int i=0; i<octaves; i++) {  
         sum += inoise(x*freq,y*freq, Z_PLANE)*amp;  
         freq *= lacunarity;  
         amp *= gain;  
     }  
     return sum;  
 }  
   
 __device__ inline uchar4 colorElevation(float texHeight)  
 {  
     uchar4 pos;  
   
     // color textel (r,g,b,a)  
     if (texHeight < -1.000f) pos = make_uchar4(000, 000, 128, 255); //deeps  
     else if (texHeight < -.2500f) pos = make_uchar4(000, 000, 255, 255); //shallow  
     else if (texHeight < 0.0000f) pos = make_uchar4(000, 128, 255, 255); //shore  
     else if (texHeight < 0.0625f) pos = make_uchar4(240, 240, 064, 255); //sand  
     else if (texHeight < 0.1250f) pos = make_uchar4(032, 160, 000, 255); //grass  
     else if (texHeight < 0.3750f) pos = make_uchar4(224, 224, 000, 255); //dirt  
     else if (texHeight < 0.7500f) pos = make_uchar4(128, 128, 128, 255); //rock  
     else                          pos = make_uchar4(255, 255, 255, 255); //snow  
   
     return(pos);  
 }  
   
 void checkCUDAError(const char *msg) {  
     cudaError_t err = cudaGetLastError();  
     if( cudaSuccess != err) {  
         fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );   
         exit(EXIT_FAILURE);   
     }  
 }   
   
 //Simple kernel fills an array with perlin noise  
 __global__ void k_perlin(uchar4* pos, unsigned int width, unsigned int height,   
                          float2 start, float2 delta, float gain, float zOffset,  
                          unsigned char* d_perm)  
 {  
     int idx = blockIdx.x * blockDim.x + threadIdx.x;  
     float xCur = start.x + ((float) (idx%width)) * delta.x;  
     float yCur = start.y + ((float) (idx/width)) * delta.y;  
   
     if(threadIdx.x < 256)  
         // Optimization: this causes bank conflicts  
         s_perm[threadIdx.x] = d_perm[threadIdx.x];  
     // this synchronization can be important if there are more that 256 threads  
     __syncthreads();  
   
   
     // Each thread creates one pixel location in the texture (textel)  
     if(idx < width*height) {  
         float h = height2d(xCur, yCur, 4, 2.f, 0.75f) + zOffset;  
   
         pos[idx] = colorElevation(h);  
     }  
 }  
   
   
 // Wrapper for the __global__ call that sets up the kernel call  
 extern "C" void launch_kernel(uchar4* pos, unsigned int image_width,   
                               unsigned int image_height, float time)  
 {  
     int nThreads=256; // must be equal or larger than 256! (see s_perm)  
     int totalThreads = image_height * image_width;  
     int nBlocks = totalThreads/nThreads;   
     nBlocks += ((totalThreads%nThreads)>0)?1:0;  
   
     float xExtent = 10.f;  
     float yExtent = 10.f;  
     float xDelta = xExtent/(float)image_width;  
     float yDelta = yExtent/(float)image_height;  
   
     if(!d_perm) { // for convenience allocate and copy d_perm here  
         cudaMalloc((void**) &d_perm,sizeof(h_perm));  
         cudaMemcpy(d_perm,h_perm,sizeof(h_perm),cudaMemcpyHostToDevice);  
         checkCUDAError("d_perm malloc or copy failed!");  
     }  
   
     k_perlin<<< nBlocks, nThreads>>>(pos, image_width, image_height,  
         make_float2(xStart, yStart),  
         make_float2(xDelta, yDelta),  
         gain, zOffset, d_perm);  
   
     // make certain the kernel has completed   
     cudaThreadSynchronize();  
     checkCUDAError("kernel failed!");  
 }  

[cpp]view plaincopy 
   
 //another kernelPBO.cu  
   
 #include <stdio.h>  
   
 void checkCUDAError(const char *msg)  
 {  
     cudaError_t err = cudaGetLastError();  
   
     if(cudaSuccess != err)  
     {  
         fprintf(stderr, "CUDA error: %s: %s. \n", msg, cudaGetErrorString(err));  
         exit(EXIT_FAILURE);  
     }  
 }  
   
 __global__ void kernel(uchar4 *pos, unsigned int width, unsigned int height, float time)  
 {  
     int index = blockIdx.x * blockDim.x + threadIdx.x;  
   
     unsigned int x = index % width;  
     unsigned int y = index / width;  
   
     if (index < width *height)  
     {  
         unsigned char r = (x + (int)time) & 0xff;  
         unsigned char g = (y + (int)time) & 0xff;  
         unsigned char b = ((x+y) + (int)time) & 0xff;  
   
         pos[index].w = 0;  
         pos[index].x = r;   
         pos[index].y = g;  
         pos[index].z = b;  
   
     }  
 }  
   
 extern "C" void launch_kernel(uchar4* pos, unsigned int image_width, unsigned int image_height, float time)  
 {  
     int nThreads = 256;  
     int totalThreads = image_height * image_width;  
     int nBlocks = totalThreads / nThreads;  
     nBlocks += ((totalThreads%nThreads)>0)?1:0;  
   
     kernel<<<nBlocks, nThreads>>>(pos, image_width, image_height, time);  
   
     cudaThreadSynchronize();  
   
     checkCUDAError("kernel failed!");  
 }  

运行结果：

wozhengtao

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
cuda与opengl互操作之PBO

PBO（像素缓冲区对象）也可以映射到CUDA地址空间，CUDA的kernel函数可以讲计算结果直接写到PBO中，然后将 PBO的内容复制到texture，进行绘制。具体使用步骤：1、创建PBO[cpp] view plain copy// Generate a buffer ID called a PBO (Pixel Buf
复制链接

扫一扫