PBO(像素缓冲区对象)也可以映射到CUDA地址空间,CUDA的kernel函数可以讲计算结果直接写到PBO中,然后将 PBO的内容复制到texture,进行绘制。
具体使用步骤:
1、创建PBO
- // Generate a buffer ID called a PBO (Pixel Buffer Object)
- glGenBuffers(1,pbo);
- // Make this the current UNPACK buffer (OpenGL is state-based)
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
- // Allocate data for the buffer. 4-channel 8-bit image
- glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
2、注册PBO
- struct cudaGraphicsResource *cuda_pbo_resource;
- cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);
3、映射PBO
- cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
- cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, cuda_pbo_resource);
- launch_kernel(d_output, window_width, window_height, w);
5、解除映射
- cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
6、解除注册
- cudaGraphicsUnregisterResource(cuda_pbo_resource);
7、删除PBO
- glBindBuffer(GL_ARRAY_BUFFER, *pbo);
- glDeleteBuffers(1, pbo);
8、绘制
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
- glBindTexture(GL_TEXTURE_2D, textureID);
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 128, 128,/*window_width, window_height,*/
- GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glBegin(GL_QUADS);
- glTexCoord2f(0.0f,1.0f); glVertex3f(0.0f,0.0f,0.0f);
- glTexCoord2f(0.0f,0.0f); glVertex3f(0.0f,1.0f,0.0f);
- glTexCoord2f(1.0f,0.0f); glVertex3f(1.0f,1.0f,0.0f);
- glTexCoord2f(1.0f,1.0f); glVertex3f(1.0f,0.0f,0.0f);
- glEnd();
代码:
- //myPBO.cpp
- #include <gl/glew.h>
- #include <cuda_runtime.h>
- #include <cutil_inline.h>
- #include <cutil_gl_inline.h>
- #include <cutil_gl_error.h>
- #include <rendercheck_gl.h>
- #include <sdkHelper.h>
- extern void initCuda(int argc, char **argv);
- extern void runCuda();
- unsigned int window_width = 500;
- unsigned int window_height = 500;
- unsigned int image_width = 128;//window_width;
- unsigned int image_height = 128;//window_height;
- unsigned int timer = 0;
- int animFlag = 1;
- float animTime = 0.0f;
- float animInc = 0.1f;
- GLuint pbo = NULL;
- GLuint textureID = NULL;
- struct cudaGraphicsResource *cuda_pbo_resource;
- extern "C" void launch_kernel(uchar4* , unsigned int, unsigned int, float);
- void createPBO(GLuint *pbo)
- {
- if (pbo)
- {
- int num_texels = image_width * image_height;
- int num_values = num_texels * 4;
- int size_tex_data = sizeof(GLubyte) * num_values;
- glGenBuffers(1, pbo);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
- glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
- cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, *pbo, cudaGraphicsMapFlagsWriteDiscard);
- }
- }
- void deletePBO(GLuint *pbo)
- {
- if (pbo)
- {
- cudaGraphicsUnregisterResource(cuda_pbo_resource);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
- glDeleteBuffers(1, pbo);
- *pbo = NULL;
- }
- }
- void createTexture(GLuint *textureID, unsigned int size_x, unsigned int size_y)
- {
- glEnable(GL_TEXTURE_2D);
- glGenTextures(1, textureID);
- glBindTexture(GL_TEXTURE_2D, *textureID);
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
- }
- void deleteTexture(GLuint *tex)
- {
- glDeleteTextures(1, tex);
- *tex = NULL;
- }
- void cleanupCuda()
- {
- if(pbo) deletePBO(&pbo);
- if(textureID) deleteTexture(&textureID);
- }
- void runCuda()
- {
- uchar4 *dptr = NULL;
- size_t num_bytes;
- cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
- cudaGraphicsResourceGetMappedPointer((void**)&dptr, &num_bytes, cuda_pbo_resource);
- launch_kernel(dptr, image_width, image_height, animTime);
- cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
- }
- void initCuda(int argc, char **argv)
- {
- if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
- cutilGLDeviceInit(argc, argv);
- else
- cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
- createPBO(&pbo);
- createTexture(&textureID, image_width, image_height);
- atexit(cleanupCuda);
- runCuda();
- }
- void computeFPS()
- {
- static int fpsCount = 0;
- static int fpsLimit = 100;
- fpsCount++;
- if(fpsCount == fpsLimit)
- {
- char fps[256];
- float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);
- sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps", ifps);
- glutSetWindowTitle(fps);
- fpsCount = 0;
- cutilCheckError(cutResetTimer(timer));
- }
- }
- void display()
- {
- runCuda();
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
- glBindTexture(GL_TEXTURE_2D, textureID);
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 128, 128,/*window_width, window_height,*/
- GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glBegin(GL_QUADS);
- glTexCoord2f(0.0f,1.0f); glVertex3f(0.0f,0.0f,0.0f);
- glTexCoord2f(0.0f,0.0f); glVertex3f(0.0f,1.0f,0.0f);
- glTexCoord2f(1.0f,0.0f); glVertex3f(1.0f,1.0f,0.0f);
- glTexCoord2f(1.0f,1.0f); glVertex3f(1.0f,0.0f,0.0f);
- glEnd();
- glutSwapBuffers();
- if(animFlag) {
- glutPostRedisplay();
- animTime += animInc;
- }
- }
- void fpsDisplay()
- {
- cutilCheckError(cutStartTimer(timer));
- display();
- cutilCheckError(cutStopTimer(timer));
- computeFPS();
- }
- CUTBoolean initGL(int argc, char **argv)
- {
- glutInit(&argc, argv);
- glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
- glutInitWindowSize(window_width, window_height);
- glutCreateWindow("Cuda GL Interop Demo (adapted from NVDIA's simpleGL)");
- glutDisplayFunc(fpsDisplay);
- glewInit();
- if(!glewIsSupported("GL_VERSION_2_0"))
- {
- fprintf(stderr, "ERROR: Support for necessary OpengGL extensions missing.");
- return CUTFalse;
- }
- glViewport(0, 0, window_width, window_height);
- glClearColor(0.0, 0.0, 0.0, 1.0);
- glDisable(GL_DEPTH_TEST);
- glMatrixMode(GL_MODELVIEW);
- glLoadIdentity();
- glMatrixMode(GL_PROJECTION);
- glLoadIdentity();
- glOrtho(0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);
- return CUTTrue;
- }
- int main(int argc, char **argv)
- {
- cutilCheckError(cutCreateTimer(&timer));
- if (CUTFalse == initGL(argc, argv))
- return CUTFalse;
- initCuda(argc, argv);
- CUT_CHECK_ERROR_GL();
- glutDisplayFunc(fpsDisplay);
- glutMainLoop();
- cudaThreadExit(); ///
- cutilExit(argc, argv); //
- }
- //kernelPBO.cu
- #include <stdio.h>
- float gain=0.75f;
- float xStart=2.f;
- float yStart=1.f;
- float zOffset = 0.0f;
- #define Z_PLANE 50.f
- __constant__ unsigned char c_perm[256];
- __shared__ unsigned char s_perm[256]; // shared memory copy of permuation array
- unsigned char* d_perm=NULL; // global memory copy of permutation array
- // host version of permutation array
- const static unsigned char h_perm[] = {151,160,137,91,90,15,
- 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
- 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
- 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
- 77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
- 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
- 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
- 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
- 223,183,170,213,119,248,152,2,44,154,163, 70,221,153,101,155,167, 43,172,9,
- 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
- 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
- 49,192,214, 31,181,199,106,157,184,84,204,176,115,121,50,45,127, 4,150,254,
- 138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
- };
- __device__ inline int perm(int i) { return(s_perm[i&0xff]); }
- __device__ inline float fade(float t) { return t * t * t * (t * (t * 6.f - 15.f) + 10.f); }
- __device__ inline float lerpP(float t, float a, float b) { return a + t * (b - a); }
- __device__ inline float grad(int hash, float x, float y, float z) {
- int h = hash & 15; // CONVERT LO 4 BITS OF HASH CODE
- float u = h<8 ? x : y, // INTO 12 GRADIENT DIRECTIONS.
- v = h<4 ? y : h==12||h==14 ? x : z;
- return ((h&1) == 0 ? u : -u) + ((h&2) == 0 ? v : -v);
- }
- __device__ float inoise(float x, float y, float z) {
- int X = ((int)floorf(x)) & 255, // FIND UNIT CUBE THAT
- Y = ((int)floorf(y)) & 255, // CONTAINS POINT.
- Z = ((int)floorf(z)) & 255;
- x -= floorf(x); // FIND RELATIVE X,Y,Z
- y -= floorf(y); // OF POINT IN CUBE.
- z -= floorf(z);
- float u = fade(x), // COMPUTE FADE CURVES
- v = fade(y), // FOR EACH OF X,Y,Z.
- w = fade(z);
- int A = perm(X)+Y, AA = perm(A)+Z, AB = perm(A+1)+Z, // HASH COORDINATES OF
- B = perm(X+1)+Y, BA = perm(B)+Z, BB = perm(B+1)+Z; // THE 8 CUBE CORNERS,
- return lerpP(w, lerpP(v, lerpP(u, grad(perm(AA), x , y , z ), // AND ADD
- grad(perm(BA), x-1.f, y , z )), // BLENDED
- lerpP(u, grad(perm(AB), x , y-1.f, z ), // RESULTS
- grad(perm(BB), x-1.f, y-1.f, z ))), // FROM 8
- lerpP(v, lerpP(u, grad(perm(AA+1), x , y , z-1.f ), // CORNERS
- grad(perm(BA+1), x-1.f, y , z-1.f )), // OF CUBE
- lerpP(u, grad(perm(AB+1), x , y-1.f, z-1.f ),
- grad(perm(BB+1), x-1.f, y-1.f, z-1.f ))));
- #ifdef ORIG
- return(perm(X));
- #endif
- }
- __device__ inline float height2d(float x, float y, int octaves,
- float lacunarity = 2.0f, float gain = 0.5f)
- {
- float freq = 1.0f, amp = 0.5f;
- float sum = 0.f;
- for(int i=0; i<octaves; i++) {
- sum += inoise(x*freq,y*freq, Z_PLANE)*amp;
- freq *= lacunarity;
- amp *= gain;
- }
- return sum;
- }
- __device__ inline uchar4 colorElevation(float texHeight)
- {
- uchar4 pos;
- // color textel (r,g,b,a)
- if (texHeight < -1.000f) pos = make_uchar4(000, 000, 128, 255); //deeps
- else if (texHeight < -.2500f) pos = make_uchar4(000, 000, 255, 255); //shallow
- else if (texHeight < 0.0000f) pos = make_uchar4(000, 128, 255, 255); //shore
- else if (texHeight < 0.0625f) pos = make_uchar4(240, 240, 064, 255); //sand
- else if (texHeight < 0.1250f) pos = make_uchar4(032, 160, 000, 255); //grass
- else if (texHeight < 0.3750f) pos = make_uchar4(224, 224, 000, 255); //dirt
- else if (texHeight < 0.7500f) pos = make_uchar4(128, 128, 128, 255); //rock
- else pos = make_uchar4(255, 255, 255, 255); //snow
- return(pos);
- }
- void checkCUDAError(const char *msg) {
- cudaError_t err = cudaGetLastError();
- if( cudaSuccess != err) {
- fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
- exit(EXIT_FAILURE);
- }
- }
- //Simple kernel fills an array with perlin noise
- __global__ void k_perlin(uchar4* pos, unsigned int width, unsigned int height,
- float2 start, float2 delta, float gain, float zOffset,
- unsigned char* d_perm)
- {
- int idx = blockIdx.x * blockDim.x + threadIdx.x;
- float xCur = start.x + ((float) (idx%width)) * delta.x;
- float yCur = start.y + ((float) (idx/width)) * delta.y;
- if(threadIdx.x < 256)
- // Optimization: this causes bank conflicts
- s_perm[threadIdx.x] = d_perm[threadIdx.x];
- // this synchronization can be important if there are more that 256 threads
- __syncthreads();
- // Each thread creates one pixel location in the texture (textel)
- if(idx < width*height) {
- float h = height2d(xCur, yCur, 4, 2.f, 0.75f) + zOffset;
- pos[idx] = colorElevation(h);
- }
- }
- // Wrapper for the __global__ call that sets up the kernel call
- extern "C" void launch_kernel(uchar4* pos, unsigned int image_width,
- unsigned int image_height, float time)
- {
- int nThreads=256; // must be equal or larger than 256! (see s_perm)
- int totalThreads = image_height * image_width;
- int nBlocks = totalThreads/nThreads;
- nBlocks += ((totalThreads%nThreads)>0)?1:0;
- float xExtent = 10.f;
- float yExtent = 10.f;
- float xDelta = xExtent/(float)image_width;
- float yDelta = yExtent/(float)image_height;
- if(!d_perm) { // for convenience allocate and copy d_perm here
- cudaMalloc((void**) &d_perm,sizeof(h_perm));
- cudaMemcpy(d_perm,h_perm,sizeof(h_perm),cudaMemcpyHostToDevice);
- checkCUDAError("d_perm malloc or copy failed!");
- }
- k_perlin<<< nBlocks, nThreads>>>(pos, image_width, image_height,
- make_float2(xStart, yStart),
- make_float2(xDelta, yDelta),
- gain, zOffset, d_perm);
- // make certain the kernel has completed
- cudaThreadSynchronize();
- checkCUDAError("kernel failed!");
- }
- //another kernelPBO.cu
- #include <stdio.h>
- void checkCUDAError(const char *msg)
- {
- cudaError_t err = cudaGetLastError();
- if(cudaSuccess != err)
- {
- fprintf(stderr, "CUDA error: %s: %s. \n", msg, cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- }
- __global__ void kernel(uchar4 *pos, unsigned int width, unsigned int height, float time)
- {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- unsigned int x = index % width;
- unsigned int y = index / width;
- if (index < width *height)
- {
- unsigned char r = (x + (int)time) & 0xff;
- unsigned char g = (y + (int)time) & 0xff;
- unsigned char b = ((x+y) + (int)time) & 0xff;
- pos[index].w = 0;
- pos[index].x = r;
- pos[index].y = g;
- pos[index].z = b;
- }
- }
- extern "C" void launch_kernel(uchar4* pos, unsigned int image_width, unsigned int image_height, float time)
- {
- int nThreads = 256;
- int totalThreads = image_height * image_width;
- int nBlocks = totalThreads / nThreads;
- nBlocks += ((totalThreads%nThreads)>0)?1:0;
- kernel<<<nBlocks, nThreads>>>(pos, image_width, image_height, time);
- cudaThreadSynchronize();
- checkCUDAError("kernel failed!");
- }
运行结果: