CUDA编程总结 2013-10-12 0:32:46
--------余家奎
参考书籍:NVIDIA CUDA C Programming Guide
OpenGL编程指南
学习cuda例子中的总结
1、__constant__和__device__,__shared__的使用说明...2
3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝...5
4、cudaMalloc3D()和cudaMemcpy3D()函数的用法...7
7、页锁定主机存储器Page-locked Host memory.16
11、Formatted output---在device函数中使用printf函数,要求设备的compute capability至少为2.0 27
12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0.28
13、Per Thread Allocation On heap每个线程在堆上分配...29
14、Per Thread Block Allocation每个线程块在堆上分配空间...29
15、Allocation Persisting Between Kernel Launches在堆上分配...30
如有错误的地方还请指正。。。谢谢
1、__constant__和__device__,__shared__的使用说明
其对应的程序:
//
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Symbol targeted by the cudaMemcpyToSymbol / cudaMemcpyFromSymbol calls in main().
// The host-side symbol copy functions only accept variables that live in
// global (__device__) or constant (__constant__) memory. A __shared__ variable
// exists only while a thread block is running, so passing one to those
// functions fails with an invalid-symbol error.
// Alternative that also works with the symbol copy API:
// __constant__ int device_global_var=5;
// Does NOT work with the symbol copy API:
// __shared__ int device_global_var;
__device__ int device_global_var = 5;
// Empty demonstration kernel. Its body only declares a __shared__ variable at
// kernel scope; the variable is never read or written.
__global__ void kernel()
{
    __shared__ int block_scratch;
}
// Copies a host integer into device_global_var and reads it back, printing
// the host value before and after the round trip.
// Both copies are checked: without the check, a failed transfer would go
// unnoticed because the second printf would simply show the unchanged host value.
// Returns 0 on success, EXIT_FAILURE if either copy failed.
int main()
{
    int host_var = 5;
    int exit_code = 0;

    cudaError_t status = cudaMemcpyToSymbol(device_global_var, &host_var, sizeof(int));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
        exit_code = EXIT_FAILURE;
    }
    printf("value=%d\n", host_var);

    status = cudaMemcpyFromSymbol(&host_var, device_global_var, sizeof(int));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(status));
        exit_code = EXIT_FAILURE;
    }
    printf("device_value=%d\n", host_var);

    system("pause");
    return exit_code;
}
2、分配二维数组实现两个二维数组相加
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define N 16
// Statically allocated N x N integer matrices in device global memory.
// main() accesses device_a from the host through the symbol copy API
// (cudaMemcpyToSymbol / cudaMemcpyFromSymbol).
__device__ int device_a[N][N], device_b[N][N], device_c[N][N];
// Element-wise addition of two N x N integer matrices: c = a + b.
// Expects a 2D launch. Each thread handles the element at its global (x, y)
// index; threads whose index falls outside the N x N range do nothing, so the
// grid may be larger than the matrix.
__global__ void VecAdd(int a[N][N], int b[N][N], int c[N][N])
{
    int global_threadId_x = blockIdx.x * blockDim.x + threadIdx.x;
    int global_threadId_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (global_threadId_x < N && global_threadId_y < N)
    {
        c[global_threadId_y][global_threadId_x] =
            a[global_threadId_y][global_threadId_x] +
            b[global_threadId_y][global_threadId_x];
    }
}
// Prints an N x N host matrix to stdout, one row per line, with every value
// followed by a single space.
void printfArray(int data[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", data[i][j]);
        }
        printf("\n");
    }
}
// CPU reference implementation of the N x N matrix addition: c = a + b.
void host_Add(int a[N][N], int b[N][N], int c[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
}
// Fills two N x N host matrices with the sequence 0, 1, 2, ..., uploads `a`
// into the device symbol device_a, downloads it again into tempA and prints
// the downloaded copy.
// The scratch device allocation is released before exit and every CUDA call
// is checked, so tempA is only printed when the round trip actually succeeded.
// Returns 0 on success, EXIT_FAILURE on any CUDA error.
int main()
{
    int i, j;
    int k = 0;
    int a[N][N], b[N][N];
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            a[i][j] = k;
            b[i][j] = k;
            k++;
        }
    }

    int tempA[N][N];
    // Pointer-to-row type, matching the parameter type taken by VecAdd.
    int (*device_aa)[N] = NULL;

    cudaError_t status = cudaMalloc((void**)&device_aa, sizeof(int) * N * N);
    if (status == cudaSuccess)
    {
        status = cudaMemcpyToSymbol(device_a, a, sizeof(int) * N * N);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpyFromSymbol(tempA, device_a, sizeof(int) * N * N);
    }

    if (status == cudaSuccess)
    {
        printf("tempA====\n");
        printfArray(tempA);
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Safe even if the allocation failed: the pointer is still NULL then.
    cudaFree(device_aa);
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
// Sets every element of the pitched 2D array d_array to 123.0 and mirrors it
// into the densely packed destinationArray (row-major, columnCount floats per row).
//   d_array          - base pointer of the pitched allocation
//   destinationArray - linear buffer holding at least rowCount * columnCount floats
//   pitch            - distance in BYTES between consecutive rows of d_array
//   columnCount      - number of floats per row
//   rowCount         - number of rows
// Works with any 1D launch configuration: elements are distributed over all
// launched threads with a grid-stride loop, instead of every thread rewriting
// the entire array.
__global__ void CopyData(float* d_array,
                         float* destinationArray,
                         size_t pitch,
                         int columnCount,
                         int rowCount)
{
    const int total = rowCount * columnCount;
    const int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
    {
        const int row = idx / columnCount;
        const int column = idx - row * columnCount;
        // Rows of a pitched allocation are `pitch` bytes apart, hence the
        // byte-wise pointer arithmetic.
        float* rowData = (float*)(((char*)d_array) + (row * pitch));
        rowData[column] = 123.0f; // float literal avoids a double conversion
        destinationArray[idx] = rowData[column];
    }
}
// Allocates a pitched 15 x 10 float array and a linear destination buffer on
// the device, runs CopyData, and prints both results:
//   h_result - the pitched array fetched with cudaMemcpy2D
//   h_array  - the linear destination buffer fetched with cudaMemcpy
// Every CUDA call is checked, results are printed only if all of them
// succeeded, and all host/device memory is released before returning.
int main(int argc, char** argv)
{
    int columnCount = 15;
    int rowCount = 10;
    float* d_array = NULL;            // pitched device array
    float* d_destinationArray = NULL; // linear device array
    // Row stride in bytes, assigned by cudaMallocPitch.
    size_t pitch = 0;

    float* h_array = new float[columnCount * rowCount];
    // calloc yields the zero-initialised buffer directly.
    float* h_result = (float*)calloc(columnCount * rowCount, sizeof(float));
    if (h_result == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        delete[] h_array;
        return EXIT_FAILURE;
    }

    cudaError_t status = cudaMallocPitch(&d_array, &pitch, columnCount * sizeof(float), rowCount);
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&d_destinationArray, columnCount * rowCount * sizeof(float));
    }
    if (status == cudaSuccess)
    {
        CopyData<<<100, 512>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);
        status = cudaGetLastError(); // launches report errors only through this call
    }
    if (status == cudaSuccess)
    {
        // Source rows are `pitch` bytes apart, destination rows are tightly packed.
        status = cudaMemcpy2D(h_result, columnCount * sizeof(float), d_array, pitch,
                              columnCount * sizeof(float), rowCount, cudaMemcpyDeviceToHost);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(h_array, d_destinationArray,
                            columnCount * rowCount * sizeof(float), cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess)
    {
        for (int i = 0; i < rowCount; i++)
        {
            for (int j = 0; j < columnCount; j++)
            {
                const int idx = (i * columnCount) + j;
                std::cout << "h_result[" << idx << "]=" << h_result[idx] << std::endl;
            }
        }
        system("pause");
        printf("h_array==\n");
        // All values are expected to be 123.0.
        for (int i = 0; i < rowCount; i++)
        {
            for (int j = 0; j < columnCount; j++)
            {
                const int idx = (i * columnCount) + j;
                std::cout << "h_array[" << idx << "]=" << h_array[idx] << std::endl;
            }
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaFree(d_array);
    cudaFree(d_destinationArray);
    free(h_result);
    delete[] h_array;
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
4、cudaMalloc3D()和cudaMemcpy3D()函数的用法
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
// Device code
// Writes (2, 3, 4) into every float3 of a pitched 3D allocation.
//   devPitchedPtr - pitched pointer describing the allocation (base address and row pitch in bytes)
//   extent        - extent of the allocation; main() sets extent.width to
//                   x * sizeof(float3), i.e. the row width is given in BYTES
// All loops run sequentially inside a single thread.
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * extent.height;
    // The row width is a byte count, so convert it to a float3 count. Using
    // the byte count as the element count would write far past the end of
    // every row and past the end of the allocation on the final row.
    size_t elementsPerRow = extent.width / sizeof(float3);
    for (size_t k = 0; k < extent.depth; k++)
    {
        char* slice = devPtr + k * slicePitch;
        for (size_t j = 0; j < extent.height; j++)
        {
            float3* row = (float3*)(slice + j * pitch);
            for (size_t i = 0; i < elementsPerRow; i++)
            {
                row[i].x = 2;
                row[i].y = 3;
                row[i].z = 4;
            }
        }
    }
}
// Dimensions of the 3D volume, counted in float3 elements. main() uses
// x * sizeof(float3) as the extent width, y as the height and z as the depth.
const int x=6;
const int y=60;
const int z=66;
// Allocates an x * y * z volume of float3 with cudaMalloc3D, fills it on the
// device with MyKernel, copies it back with cudaMemcpy3D and prints every element.
// The allocation, the launch and the copy are all checked; the result is
// printed only if they all succeeded, and host and device memory is released
// before returning.
int main(){
    cudaError_t status = cudaSuccess;
    const size_t elementCount = (size_t)x * y * z;

    //======== Mem Host
    float3 *mem_host = (float3*)malloc(sizeof(float3) * elementCount);
    float3 *mem_host2 = (float3*)malloc(sizeof(float3) * elementCount);
    if (mem_host == NULL || mem_host2 == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(mem_host);
        free(mem_host2);
        return EXIT_FAILURE;
    }
    for (size_t i = 0; i < elementCount; i++){
        mem_host[i].x = 10;
        mem_host[i].y = 100;
        mem_host[i].z = 1000;
    }

    //======== Mem Device
    cudaExtent extent;
    extent.width = x * sizeof(float3); // row width in bytes
    extent.height = y;
    extent.depth = z;
    cudaPitchedPtr mem_device;
    mem_device.ptr = NULL;
    status = cudaMalloc3D(&mem_device, extent);
    if (status != cudaSuccess){fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));}

    //======== Cpy HostToDevice (disabled in the original sample)
    // cudaMemcpy3DParms p = { 0 };
    // p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x*sizeof(float3), x, y);
    // p.dstPtr = mem_device;
    // p.extent = extent;
    // p.kind = cudaMemcpyHostToDevice;
    // status = cudaMemcpy3D(&p);
    // if(status != cudaSuccess){fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));}

    if (status == cudaSuccess)
    {
        MyKernel<<<1,1>>>(mem_device, extent);
        status = cudaGetLastError();
        if (status != cudaSuccess){fprintf(stderr, "Kernel: %s\n", cudaGetErrorString(status));}
    }

    //======== Cpy DeviceToHost
    if (status == cudaSuccess)
    {
        cudaMemcpy3DParms q = {0};
        q.srcPtr = mem_device;
        q.dstPtr = make_cudaPitchedPtr((void*)mem_host2, x * sizeof(float3), x, y);
        q.extent = extent;
        q.kind = cudaMemcpyDeviceToHost;
        status = cudaMemcpy3D(&q);
        if (status != cudaSuccess){fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));}
    }

    if (status == cudaSuccess)
    {
        for (size_t i = 0; i < elementCount; i++)
            printf("%f %f %f\n", mem_host2[i].x, mem_host2[i].y, mem_host2[i].z);
    }

    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
5、不带共享存储器的矩阵的相乘
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Row-major matrix descriptor: the value at (row, col) is stored at
// element[row * width + col], as indexed by printMatrix and MatMulKernel.
typedef struct
{
// Number of columns.
int width;
// Number of rows.
int height;
// Pointer to width * height floats; host or device memory depending on the instance.
float *element;
}Matrix;
// Edge length of the square thread block that MatMul uses to launch MatMulKernel.
#define BLOCK_SIZE 16
// Forward declaration; the kernel itself is defined after main().
__global__ void MatMulKernel(const Matrix,const Matrix,Matrix);
// Prints a host matrix to stdout, one row per line, with every value followed
// by a single space.
void printMatrix(const Matrix &A)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < A.width; j++)
        {
            printf("%f ", A.element[i * A.width + j]);
        }
        printf("\n");
    }
}
// Host wrapper computing C = A * B on the GPU without shared memory.
// Prints A and B, copies them to the device, launches MatMulKernel, copies
// the product back into C.element and prints it.
//   A, B - host input matrices
//   C    - host output matrix; width, height and element must be set by the caller
// The grid is sized with integer division, so only matrices whose dimensions
// are multiples of BLOCK_SIZE are fully covered.
// Every CUDA call is checked; on failure a message is printed and C is left
// without a valid result. Device buffers are released on all paths.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    printf("matrix A");
    printMatrix(A);
    printf("matrix B");
    printMatrix(B);
    system("pause");

    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.element = NULL;
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.element = NULL;
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.element = NULL;

    const size_t sizeA = A.width * A.height * sizeof(float);
    const size_t sizeB = B.width * B.height * sizeof(float);
    const size_t sizeC = C.width * C.height * sizeof(float);

    cudaError_t status = cudaMalloc(&d_A.element, sizeA);
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(d_A.element, A.element, sizeA, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&d_B.element, sizeB);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(d_B.element, B.element, sizeB, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&d_C.element, sizeC);
    }
    if (status == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        // Blocking copy: also waits for the kernel to finish.
        status = cudaMemcpy(C.element, d_C.element, sizeC, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_A.element);
    cudaFree(d_B.element);
    cudaFree(d_C.element);

    if (status != cudaSuccess)
    {
        fprintf(stderr, "MatMul failed: %s\n", cudaGetErrorString(status));
    }
    else
    {
        for (int i = 0; i < C.height; ++i)
        {
            for (int j = 0; j < C.width; ++j)
            {
                printf("%f ", C.element[i * C.width + j]);
            }
            printf("\n");
        }
    }
    system("pause");
}
// Assigns `value` (converted to float) to every element of the host matrix A.
void SetMatrixValue(Matrix &A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.element[i * A.width + j] = value;
        }
    }
}
// Builds two 128 x 128 matrices filled with 2, multiplies them on the GPU via
// MatMul and prints the product.
// Declared as `int main` (standard C++ requires an int return type); host
// allocations are checked and released before returning.
int main()
{
    Matrix A, B, C;
    A.width = 128;
    A.height = 128;
    A.element = (float*)malloc(A.width * A.height * sizeof(float));
    B.width = 128;
    B.height = 128;
    B.element = (float*)malloc(B.width * B.height * sizeof(float));
    C.width = 128;
    C.height = 128;
    C.element = (float*)malloc(C.width * C.height * sizeof(float));
    if (A.element == NULL || B.element == NULL || C.element == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A.element);
        free(B.element);
        free(C.element);
        return EXIT_FAILURE;
    }

    // memset cannot be used for this: it writes bytes, not float values.
    SetMatrixValue(A, 2);
    SetMatrixValue(B, 2);

    MatMul(A, B, C);
    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }

    free(A.element);
    free(B.element);
    free(C.element);
    system("pause");
    return 0;
}
// Matrix multiplication kernel without shared memory: each thread computes a
// single element of C as the dot product of one row of A and one column of B.
// Expects a 2D launch in which the global x index selects the column of C and
// the global y index selects the row. Threads that fall outside C return
// immediately, so the grid may overshoot the matrix.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= C.height || col >= C.width)
    {
        return;
    }
    float CValue = 0;
    for (int e = 0; e < A.width; ++e)
    {
        CValue += A.element[row * A.width + e] * B.element[e * B.width + col];
    }
    C.element[row * C.width + col] = CValue;
}
6、带shared memory的矩阵相乘
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
// Row-major matrix descriptor with an explicit row stride: GetElement and
// SetElement address the value at (row, col) as elements[row * stride + col].
typedef struct
{
// Number of columns.
int width;
// Number of rows.
int height;
// Distance, in elements, between the starts of consecutive rows.
int stride;
// Pointer to the first element of the (sub-)matrix.
float *elements;
}Matrix;
// Returns the element of A at (row, col), honouring A's row stride.
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}
// Stores `value` at position (row, col) of A, honouring A's row stride.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    float *slot = A.elements + (row * A.stride + col);
    *slot = value;
}
// Returns a descriptor for the BLOCK_SIZE x BLOCK_SIZE tile of A located
// `row` tiles down and `col` tiles to the right of A's upper-left corner.
// The tile shares A's storage and stride; no data is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    const int firstElement = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;
    Matrix tile;
    tile.elements = A.elements + firstElement;
    tile.stride = A.stride;
    tile.height = BLOCK_SIZE;
    tile.width = BLOCK_SIZE;
    return tile;
}
// Forward declaration of the tiled multiplication kernel defined after MatMul.
__global__ void MatMulKernel(const Matrix,const Matrix,Matrix);
// Host wrapper computing C = A * B on the GPU with the shared-memory kernel.
//   A, B - host input matrices
//   C    - host output matrix; width, height and elements must be set by the caller
// The kernel processes whole BLOCK_SIZE x BLOCK_SIZE tiles, so the matrix
// dimensions are expected to be multiples of BLOCK_SIZE.
// The grid has one block per tile of C: B.width / BLOCK_SIZE blocks across and
// A.height / BLOCK_SIZE blocks down, because C has A's row count (B.height
// equals the shared dimension and is only correct for square inputs).
// Every CUDA call is checked and device buffers are released on all paths.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.stride = A.width;
    d_A.elements = NULL;
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.stride = B.width;
    d_B.elements = NULL;
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.stride = C.width;
    d_C.elements = NULL;

    const size_t sizeA = A.width * A.height * sizeof(float);
    const size_t sizeB = B.width * B.height * sizeof(float);
    const size_t sizeC = C.width * C.height * sizeof(float);

    cudaError_t status = cudaMalloc(&d_A.elements, sizeA);
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(d_A.elements, A.elements, sizeA, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&d_B.elements, sizeB);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(d_B.elements, B.elements, sizeB, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&d_C.elements, sizeC);
    }
    if (status == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / BLOCK_SIZE, A.height / BLOCK_SIZE);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        // Blocking copy: also waits for the kernel to finish.
        status = cudaMemcpy(C.elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "MatMul failed: %s\n", cudaGetErrorString(status));
    }

    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Tiled matrix multiplication using shared memory.
// Each thread block computes one BLOCK_SIZE x BLOCK_SIZE tile of C
// (blockIdx.y selects the tile row, blockIdx.x the tile column) and each
// thread computes one element of that tile. Must be launched with
// BLOCK_SIZE x BLOCK_SIZE threads per block.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    float Cvalue = 0;
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Walk over the tiles of A and B that contribute to Csub.
    for (int m = 0; m < A.width / BLOCK_SIZE; ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Each thread loads one element of each tile.
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Both tiles must be completely loaded before anyone reads them.
        __syncthreads();
        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }
        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }
    SetElement(Csub, row, col, Cvalue);
}
// Assigns `value` (converted to float) to every element of the host matrix A.
// A is passed by value, but the copy shares the same elements buffer, so the
// caller's data is modified.
void SetMatValue(Matrix A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.elements[i * A.width + j] = value;
        }
    }
}
// Prints a host matrix to stdout, one row per line, with every value followed
// by a single space. Rows are addressed with A.width, i.e. the matrix is
// treated as densely packed.
void PrintMat(const Matrix A)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            printf("%f ", A.elements[i * A.width + j]);
        }
        printf("\n");
    }
}
// Builds two 128 x 128 matrices filled with 2, multiplies them with the
// shared-memory MatMul and prints the product.
// Declared as `int main` (standard C++ requires an int return type); host
// allocations are checked and released before returning.
int main()
{
    Matrix A, B, C;
    A.width = 128;
    A.height = 128;
    A.stride = A.width;
    A.elements = (float*)malloc(A.width * A.height * sizeof(float));
    B.width = 128;
    B.height = 128;
    B.stride = B.width;
    B.elements = (float*)malloc(B.width * B.height * sizeof(float));
    C.width = 128;
    C.height = 128;
    C.stride = C.width;
    C.elements = (float*)malloc(C.width * C.height * sizeof(float));
    if (A.elements == NULL || B.elements == NULL || C.elements == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return EXIT_FAILURE;
    }

    SetMatValue(A, 2);
    SetMatValue(B, 2);
    MatMul(A, B, C);
    PrintMat(C);

    free(A.elements);
    free(B.elements);
    free(C.elements);
    system("pause");
    return 0;
}
7、页锁定主机存储器 Page-locked Host memory
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define N 256
// Writes `value` into the first `size` elements of pData.
// Does nothing when size is zero or negative.
void SetArrayValue(float *pData, int size, float value)
{
    for (int i = 0; i < size; ++i)
    {
        pData[i] = value;
    }
}
// Element-wise vector addition C = A + B.
// Every thread handles the element selected by threadIdx.x alone, so the
// kernel only covers the elements reachable by the threads of one block.
// There is no length parameter and no bounds check.
__global__ void VecAdd(float *A, float *B, float *C)
{
    const int lane = threadIdx.x;
    const float sum = A[lane] + B[lane];
    C[lane] = sum;
}
// Prints the first `size` floats of `data` to stdout, separated by spaces.
// A line break is emitted BEFORE every element whose 1-based position is a
// multiple of 10.
void PrintArray(float *data, int size)
{
    for (int i = 0; i < size; ++i)
    {
        if ((i + 1) % 10 == 0)
        {
            printf("\n");
        }
        printf("%f ", data[i]);
    }
}
// Zero-copy vector addition with mapped page-locked host memory.
// Three pinned host buffers are allocated with cudaHostAllocMapped, their
// device-side aliases are obtained with cudaHostGetDevicePointer, and VecAdd
// then operates directly on host memory through those aliases.
// Declared as `int main`; every CUDA call is checked and the pinned buffers
// are released with cudaFreeHost before returning.
int main()
{
    //cudaSetDeviceFlags(cudaDeviceMapHost); -- marked optional by the original author
    cudaDeviceProp deviceProp;
    cudaError_t status = cudaGetDeviceProperties(&deviceProp, 0);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }
    if (deviceProp.integrated)
    {
        printf("GPU is integrated\n");
        return 0;
    }
    if (!deviceProp.canMapHostMemory)
    {
        printf("can't map host memory\n");
        return EXIT_FAILURE;
    }

    float *hostPtrA = NULL, *hostPtrB = NULL, *hostPtrC = NULL;
    float *devPtrA = NULL, *devPtrB = NULL, *devPtrC = NULL;
    const unsigned int allocFlags = cudaHostAllocDefault | cudaHostAllocMapped;

    status = cudaHostAlloc(&hostPtrA, sizeof(float) * N, allocFlags);
    if (status == cudaSuccess)
    {
        SetArrayValue(hostPtrA, N, 29);
        PrintArray(hostPtrA, N);
        system("pause");
        status = cudaHostAlloc(&hostPtrB, sizeof(float) * N, allocFlags);
    }
    if (status == cudaSuccess)
    {
        SetArrayValue(hostPtrB, N, 31);
        status = cudaHostGetDevicePointer(&devPtrA, hostPtrA, 0);
    }
    if (status == cudaSuccess)
    {
        status = cudaHostGetDevicePointer(&devPtrB, hostPtrB, 0);
    }
    if (status == cudaSuccess)
    {
        status = cudaHostAlloc(&hostPtrC, sizeof(float) * N, allocFlags);
    }
    if (status == cudaSuccess)
    {
        status = cudaHostGetDevicePointer(&devPtrC, hostPtrC, 0);
    }
    if (status == cudaSuccess)
    {
        VecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        // The host reads hostPtrC directly, so the kernel must have finished.
        status = cudaDeviceSynchronize();
    }

    if (status == cudaSuccess)
    {
        for (int i = 0; i < N; i++)
        {
            if ((i + 1) % 10 == 0)
            {
                printf("\n");
            }
            printf("%f ", hostPtrC[i]);
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    if (hostPtrA != NULL) cudaFreeHost(hostPtrA);
    if (hostPtrB != NULL) cudaFreeHost(hostPtrB);
    if (hostPtrC != NULL) cudaFreeHost(hostPtrC);
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
8、纹理存储(texture memory)的使用
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// NOTE(review): `size` is not referenced anywhere in the code of this sample
// visible here. Being a lowercase macro, it would silently replace any
// identifier named `size` that appears after this line.
#define size 256
// 2D texture reference returning float elements unchanged (element read mode).
// main() configures its addressing/filter modes and binds it to a CUDA array;
// transformKernel samples it with tex2D.
texture<float,cudaTextureType2D,cudaReadModeElementType>texRef;
// Samples the texture texRef at coordinates rotated by `theta` around the
// texture centre and writes the result into a width x height output image.
//   output - device buffer of width * height floats, row-major
//   theta  - rotation angle passed to cosf/sinf (radians)
// Expects a 2D launch. The grid is usually rounded up to whole blocks, so
// threads outside the image return early instead of writing past the end of
// `output`.
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= (unsigned int)width || y >= (unsigned int)height)
    {
        return;
    }
    // Normalised coordinates, shifted so the rotation happens around (0.5, 0.5).
    float u = x / (float)width;
    float v = y / (float)height;
    u -= 0.5f;
    v -= 0.5f;
    // sinf/cosf keep the arithmetic in single precision.
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    output[y * width + x] = tex2D(texRef, tu, tv);
}
// Texture memory demo: uploads a 25 x 25 float image into a CUDA array, binds
// it to texRef (wrap addressing, linear filtering, normalised coordinates),
// runs transformKernel and prints the transformed image.
// Declared as `int main`; every CUDA call is checked, the texture is unbound
// and all host/device memory is released before returning.
// This sample uses the legacy texture-reference API throughout.
int main()
{
    int width = 25, height = 25;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray = NULL;
    float *output = NULL;
    bool textureBound = false;

    float *h_data = (float*)malloc(width * height * sizeof(float));
    float *hostPtr = (float*)malloc(sizeof(float) * width * height);
    if (h_data == NULL || hostPtr == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_data);
        free(hostPtr);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }

    cudaError_t status = cudaMallocArray(&cuArray, &channelDesc, width, height);
    if (status == cudaSuccess)
    {
        status = cudaMemcpyToArray(cuArray, 0, 0, h_data, width * height * sizeof(float), cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        texRef.addressMode[0] = cudaAddressModeWrap;
        texRef.addressMode[1] = cudaAddressModeWrap;
        texRef.filterMode = cudaFilterModeLinear;
        texRef.normalized = true;
        status = cudaBindTextureToArray(texRef, cuArray, channelDesc);
        textureBound = (status == cudaSuccess);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc(&output, width * height * sizeof(float));
    }
    if (status == cudaSuccess)
    {
        dim3 dimBlock(16, 16);
        // Round up so that the whole image is covered.
        dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
        float angle = 30;
        transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(hostPtr, output, sizeof(float) * width * height, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess)
    {
        for (int i = 0; i < height; ++i)
        {
            for (int j = 0; j < width; ++j)
            {
                printf("%f ", hostPtr[i * width + j]);
            }
            printf("\n");
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    if (textureBound)
    {
        cudaUnbindTexture(texRef);
    }
    free(hostPtr);
    free(h_data);
    if (cuArray != NULL)
    {
        cudaFreeArray(cuArray);
    }
    cudaFree(output);
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
9、surface Memory的使用方法
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// 2D surface references. main() binds each one to a CUDA array;
// copyKernel reads from inputSurfRef and writes to outputSurfRef.
surface<void,2> inputSurfRef;
surface<void,2>outputSurfRef;
// Copies a width x height image of uchar4 texels from inputSurfRef to
// outputSurfRef, one texel per thread.
// Expects a 2D launch; threads outside the image do nothing.
__global__ void copyKernel(int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Explicit casts avoid a signed/unsigned comparison.
    if (x < (unsigned int)width && y < (unsigned int)height)
    {
        uchar4 data;
        // The x coordinate of a surface access is a byte offset, hence the
        // multiplication by the 4-byte texel size.
        surf2Dread(&data, inputSurfRef, x * 4, y);
        surf2Dwrite(data, outputSurfRef, x * 4, y);
    }
}
// Surface memory demo: uploads a 256 x 256 image of 32-bit texels (all set to
// 3) into a CUDA array, copies it to a second CUDA array through surface
// references with copyKernel, downloads the result and prints it.
// Declared as `int main`; every CUDA call is checked and all resources are
// released before returning.
int main()
{
    int width = 256, height = 256;
    int size = width * height * sizeof(unsigned int);
    cudaArray *cuInputArray = NULL;
    cudaArray *cuOutputArray = NULL;

    unsigned int *h_data = (unsigned int*)malloc(width * height * sizeof(unsigned int));
    unsigned int *host_output = (unsigned int*)malloc(sizeof(unsigned int) * width * height);
    if (h_data == NULL || host_output == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_data);
        free(host_output);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = 3;
        }
    }

    // Four 8-bit unsigned channels per texel, matching the uchar4 used by copyKernel.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaError_t status = cudaMallocArray(&cuInputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore);
    if (status == cudaSuccess)
    {
        status = cudaMallocArray(&cuOutputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpyToArray(cuInputArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaBindSurfaceToArray(inputSurfRef, cuInputArray);
    }
    if (status == cudaSuccess)
    {
        status = cudaBindSurfaceToArray(outputSurfRef, cuOutputArray);
    }
    if (status == cudaSuccess)
    {
        dim3 dimBlock(16, 16);
        dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
        copyKernel<<<dimGrid, dimBlock>>>(width, height);
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpyFromArray(host_output, cuOutputArray, 0, 0, size, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess)
    {
        for (int i = 0; i < height; ++i)
        {
            for (int j = 0; j < width; ++j)
            {
                printf("%u ", host_output[i * width + j]);
            }
            printf("\n");
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    system("pause");
    free(host_output);
    free(h_data);
    if (cuInputArray != NULL)
    {
        cudaFreeArray(cuInputArray);
    }
    if (cuOutputArray != NULL)
    {
        cudaFreeArray(cuOutputArray);
    }
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
10、opengl和cuda的交互
代码:https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <gl/glew.h>//要放在下面这一句的前面
#include "cuda_gl_interop.h"
#include <stdio.h>
#include <stdlib.h>
//#include <gl/GL.h>
#include <gl/glut.h>
// OpenGL vertex buffer object holding one float4 position per grid point.
// The value is overwritten by glGenBuffers in main().
GLuint postionsVBO = 1;
// CUDA-side handle of the same buffer, set by cudaGraphicsGLRegisterBuffer in main().
struct cudaGraphicsResource *postionsVBO_CUDA;
// Dimensions of the vertex grid.
int width = 256;
int height = 256;
// Animation time stored in device global memory. As a __device__ variable it
// must not be read or modified directly from host code.
__device__ float dev_time = 1;
// Host-side animation time.
float host_time = 1;
// Forward declaration; the kernel is defined after display().
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height);
// One-time GL setup: selects opaque black as the clear colour and clears the
// colour and depth buffers.
void init()
{
    const GLbitfield buffersToClear = GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT;
    glClearColor(0.f, 0.f, 0.f, 1.f);
    glClear(buffersToClear);
}
// GLUT reshape callback: rebuilds the perspective projection for the new
// window size and resets the model-view matrix.
//   width, height - new window size in pixels
// The ratio passed to gluPerspective is always larger-side / smaller-side,
// as in the original sample.
void reshape(int width, int height)
{
    // A minimised window can report a zero dimension; avoid dividing by zero.
    if (width <= 0)
    {
        width = 1;
    }
    if (height <= 0)
    {
        height = 1;
    }
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    if (width > height)
    {
        gluPerspective(45, (GLfloat)width / height, 0.001, 1000);
    }
    else
    {
        gluPerspective(45, (GLfloat)height / width, 0.001, 1000);
    }
    // GL_MODELVIEW is the matrix stack to select here. GL_MATRIX_MODE is a
    // query constant and not a valid argument: glMatrixMode would reject it,
    // the projection stack would stay current, and the glLoadIdentity below
    // would erase the projection that was just set up.
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
}
// GLUT display callback: maps the shared vertex buffer into CUDA, lets
// createVertices regenerate all positions for the current animation time,
// unmaps the buffer and draws it as a point cloud.
// The animation time is kept on the host (host_time) and passed to the kernel
// by value. A __device__ variable cannot be read or incremented from host
// code, so no device-side time copy is involved.
void display()
{
    float4 *positions = NULL;
    size_t numb_bytes = 0;

    cudaError_t status = cudaGraphicsMapResources(1, &postionsVBO_CUDA, 0);
    if (status == cudaSuccess)
    {
        status = cudaGraphicsResourceGetMappedPointer((void**)&positions, &numb_bytes, postionsVBO_CUDA);
        if (status == cudaSuccess)
        {
            dim3 dimBlock(16, 16, 1);
            dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
            createVertices<<<dimGrid, dimBlock>>>(positions, host_time, width, height);
            status = cudaGetLastError();
            host_time++;
        }
        // The buffer must be unmapped before OpenGL uses it again.
        cudaGraphicsUnmapResources(1, &postionsVBO_CUDA, 0);
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "display: CUDA error: %s\n", cudaGetErrorString(status));
    }

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, width * height);
    glDisableClientState(GL_VERTEX_ARRAY);
    glutSwapBuffers();
    glutPostRedisplay(); // request the next frame immediately
}
// Generates a width x height grid of vertices forming an animated sine/cosine
// wave.
//   positions - output buffer of width * height float4 values, row-major
//   time      - phase offset added to both wave arguments
// Expects a 2D launch with one thread per vertex; threads outside the grid
// return without writing.
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }
    // Map the grid coordinates to [-1, 1).
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.f - 1.f;
    v = v * 2.f - 1.f;
    float freq = 4.f;
    float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;
    positions[y * width + x] = make_float4(u, w, v, 1.f);
}
// OpenGL/CUDA interop demo entry point: creates the GLUT window, allocates a
// vertex buffer with room for width * height float4 positions, registers it
// with CUDA and enters the GLUT main loop.
// glewInit() is called once the GL context exists. Without it the buffer
// entry points (glGenBuffers, glBindBuffer, glBufferData) are unresolved
// function pointers.
int main(int argc, char *argv[])
{
    cudaGLSetGLDevice(0);
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowPosition(0, 0);
    glutInitWindowSize(100, 100);
    glutCreateWindow("opengl-cuda");
    if (glewInit() != GLEW_OK)
    {
        fprintf(stderr, "glewInit failed\n");
        return EXIT_FAILURE;
    }
    init();
    glutDisplayFunc(display);
    glutReshapeFunc(reshape);

    glGenBuffers(1, &postionsVBO);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    unsigned int size = width * height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // CUDA overwrites the whole buffer every frame, hence the write-discard flag.
    cudaError_t status = cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA, postionsVBO, cudaGraphicsMapFlagsWriteDiscard);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGraphicsGLRegisterBuffer failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }
    glutMainLoop();
    return 0;
}
11、Formatted output---在device函数中使用printf函数,要求设备的compute capability至少为2.0
代码:
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Device-side printf demo: every thread prints its threadIdx.x on a line of its own.
__global__ void bcast()
{
    const int lane = threadIdx.x;
    printf("%d\n", lane);
}
// Launches bcast with one block of 32 threads and waits for completion.
// Device-side printf output is only flushed at synchronisation points, so the
// cudaDeviceSynchronize call is required to see it. Its status is checked so
// that a failed launch or kernel fault is reported.
// Declared as `int main` (standard C++ requires an int return type).
int main()
{
    bcast<<<1, 32>>>();
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0
代码:
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <stdlib.h>
// Device-side assert demo: the first assertion holds, the second one is
// written to fail (its operand is 0) so that the assertion mechanism triggers.
__global__ void testAssert(void)
{
    int should_be_one = 0;
    int is_one = 1;
    assert(is_one);
    assert(should_be_one);
}
// Launches testAssert and waits for it. The kernel contains an assertion that
// is written to fail, so an error status from cudaDeviceSynchronize is the
// expected outcome here; it is printed rather than treated as a program failure.
// Declared as `int main` (standard C++ requires an int return type).
int main()
{
    testAssert<<<1, 1>>>();
    cudaError_t status = cudaDeviceSynchronize();
    printf("cudaDeviceSynchronize: %s\n", cudaGetErrorString(status));
    // Reset the device after the kernel has trapped.
    cudaDeviceReset();
    system("pause");
    return 0;
}
13、Per Thread Allocation On heap 每个线程在堆上分配
代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Per-thread heap allocation demo: every thread allocates 100 bytes from the
// device heap, prints the pointer it received together with its threadIdx.x,
// and frees the block again.
__global__ void mallocTestPerThread()
{
    const size_t bytes = 100;
    char *block = (char *)malloc(bytes);
    printf("Thread %d got pointer:%p\n", threadIdx.x, block);
    free(block);
}
// Sets the device malloc heap to 128 MB, runs mallocTestPerThread with five
// threads and waits for completion.
// The heap limit, the launch and the synchronisation are all checked.
// Returns 0 on success, EXIT_FAILURE on any CUDA error.
int main()
{
    cudaError_t status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (status == cudaSuccess)
    {
        mallocTestPerThread<<<1, 5>>>();
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
14、Per Thread Block Allocation 每个线程块在堆上分配空间
代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <helper_cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Per-block heap allocation demo: thread 0 of each block allocates one buffer
// from the device heap, every thread of the block writes 64 ints into it, and
// thread 0 frees it again.
// The buffer holds blockDim.x * 64 ints, so its byte size includes
// sizeof(int). Omitting that factor makes the buffer four times too small
// for the writes below.
__global__ void mallocTestPerThreadBlock()
{
    __shared__ int *data;
    // Only one thread allocates; the pointer is shared with the block.
    if (threadIdx.x == 0)
    {
        data = (int*)malloc(blockDim.x * 64 * sizeof(int));
    }
    __syncthreads();
    // `data` is shared, so either every thread of the block leaves here or none does.
    if (data == NULL)
    {
        return;
    }
    int *ptr = data;
    for (int i = 0; i < 64; ++i)
    {
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    }
    // All writes must be finished before the buffer is released.
    __syncthreads();
    if (threadIdx.x == 0)
    {
        free(data);
    }
}
// Sets the device malloc heap to 128 MB, runs mallocTestPerThreadBlock with 10
// blocks of 128 threads and waits for completion.
// The synchronisation status is checked because that is where a fault inside
// the kernel (for example an out-of-bounds heap write) becomes visible.
// Returns 0 on success, EXIT_FAILURE on any CUDA error.
int main()
{
    cudaError_t status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (status == cudaSuccess)
    {
        mallocTestPerThreadBlock<<<10, 128>>>();
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}
15、Allocation Persisting Between Kernel Launches 在堆上分配
代码:
#include <cuda.h>
#include <helper_cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define NUM_BLOCKS 20
// One device-heap pointer slot per block, indexed by blockIdx.x:
// allocmem stores the allocation, usemem updates the buffer, freemem releases it.
__device__ int *dataptr[NUM_BLOCKS];
// Allocates one device-heap buffer per block (4 bytes per thread), publishes
// the pointer in dataptr[blockIdx.x], and has every thread zero its own slot.
__global__ void allocmem()
{
    const unsigned int blockSlot = blockIdx.x;
    // A single thread performs the allocation for the whole block.
    if (threadIdx.x == 0)
    {
        dataptr[blockSlot] = (int*)malloc(blockDim.x * 4);
    }
    __syncthreads();
    // Zero this thread's element only if the allocation succeeded.
    if (dataptr[blockSlot] != NULL)
    {
        dataptr[blockSlot][threadIdx.x] = 0;
    }
}
// Adds threadIdx.x to this thread's element of the block's buffer, provided
// the buffer pointer stored in dataptr[blockIdx.x] is not NULL.
__global__ void usemem()
{
    int *blockBuffer = dataptr[blockIdx.x];
    if (blockBuffer == NULL)
    {
        return;
    }
    blockBuffer[threadIdx.x] = blockBuffer[threadIdx.x] + threadIdx.x;
}
// Prints the final value of every element of the block's buffer and then
// releases the buffer.
// A barrier separates the reads from the free, so thread 0 cannot release the
// memory while other threads of the block are still reading it. The slot in
// dataptr is cleared afterwards so that no stale pointer remains.
__global__ void freemem()
{
    int *ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
        printf("Block %d,Thread=%d:final value=%d\n", blockIdx.x, threadIdx.x, ptr[threadIdx.x]);
    // Reached by every thread of the block: there is no early return above.
    __syncthreads();
    // Free from one thread only.
    if (threadIdx.x == 0)
    {
        free(ptr);
        dataptr[blockIdx.x] = NULL;
    }
}
// Demonstrates that device-heap allocations persist across kernel launches:
// allocmem allocates one buffer per block, usemem runs three times on those
// buffers, and freemem prints and releases them.
// All kernels are issued to the default stream and therefore run in order.
// The heap limit, the launches and the final synchronisation are checked.
// Returns 0 on success, EXIT_FAILURE on any CUDA error.
int main()
{
    cudaError_t status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (status == cudaSuccess)
    {
        allocmem<<<NUM_BLOCKS, 10>>>();
        usemem<<<NUM_BLOCKS, 10>>>();
        usemem<<<NUM_BLOCKS, 10>>>();
        usemem<<<NUM_BLOCKS, 10>>>();
        freemem<<<NUM_BLOCKS, 10>>>();
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    system("pause");
    return status == cudaSuccess ? 0 : EXIT_FAILURE;
}