1.使用线程实现小型矢量加法add
#include <stdio.h>
#define N 10
__global__ void add( int *a, int *b, int *c ) {
int tid = threadIdx.x;
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main( void ) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_b, N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_c, N * sizeof(int) ) ;
// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i * i;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_a, a, N * sizeof(int),
cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_b, b, N * sizeof(int),
cudaMemcpyHostToDevice ) ;
add<<<1,N>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy( c, dev_c, N * sizeof(int),
cudaMemcpyDeviceToHost ) ;
// display the results
for (int i=0; i<N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
// free the memory allocated on the GPU
cudaFree( dev_a ) ;
cudaFree( dev_b ) ;
cudaFree( dev_c ) ;
return 0;
}
输出:
2.使用线程实现大型矢量加法add
note:
1.线程块不能超过66635
2.单个线程块包含的线程数不能超过512
#include <stdio.h>
#define N (33 * 1024)
__global__ void add( int *a, int *b, int *c ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main( void ) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the CPU
a = (int*)malloc( N * sizeof(int) );
b = (int*)malloc( N * sizeof(int) );
c = (int*)malloc( N * sizeof(int) );
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_b, N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_c, N * sizeof(int) ) ;
// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = 2 * i;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_a, a, N * sizeof(int),
cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * sizeof(int),
cudaMemcpyHostToDevice );
add<<<128,128>>>( dev_a, dev_b, dev_c );
//or add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy( c, dev_c, N * sizeof(int),
cudaMemcpyDeviceToHost );
// verify that the GPU did the work we requested
bool success = true;
for (int i=0; i<N; i++) {
if ((a[i] + b[i]) != c[i]) {
printf( "Error: %d + %d != %d\n", a[i], b[i], c[i] );
success = false;
}
}
if (success) printf( "We did it!\n" );
// free the memory we allocated on the GPU
cudaFree( dev_a ) ;
cudaFree( dev_b ) ;
cudaFree( dev_c ) ;
// free the memory we allocated on the CPU
free( a );
free( b );
free( c );
return 0;
}
输出:
We did it!
Note:
当线程数量足够时,调用GPU时使用以下调用方式可减少资源浪费
add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );
当线程数量不够时,可以让每个线程执行多次,如
__global__ void add( int *a, int *b, int *c ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
//由于只有blockDim.x * gridDim.x个CPU,因此每次递增blockDim.x * gridDim.x
}
}
类似于CPU中:
void add( int *a, int *b, int *c ) {
int tid = 0;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid ++;//由于只有一个CPU,因此每次递增1
}
}
3.在GPU上使用线程实现纹波效果
#include <GL/glut.h>
struct CPUAnimBitmap {
unsigned char *pixels;
int width, height;
void *dataBlock;
void (*fAnim)(void*,int);
void (*animExit)(void*);
void (*clickDrag)(void*,int,int,int,int);
int dragStartX, dragStartY;
CPUAnimBitmap( int w, int h, void *d = NULL ) {
width = w;
height = h;
pixels = new unsigned char[width * height * 4];
dataBlock = d;
clickDrag = NULL;
}
~CPUAnimBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return width * height * 4; }
void click_drag( void (*f)(void*,int,int,int,int)) {
clickDrag = f;
}
void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
CPUAnimBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
fAnim = f;
animExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( width, height );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
if (clickDrag != NULL)
glutMouseFunc( mouse_func );
glutIdleFunc( idle_func );
glutMainLoop();
}
// static method used for glut callbacks
static CPUAnimBitmap** get_bitmap_ptr( void ) {
static CPUAnimBitmap* gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void mouse_func( int button, int state,
int mx, int my ) {
if (button == GLUT_LEFT_BUTTON) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
if (state == GLUT_DOWN) {
bitmap->dragStartX = mx;
bitmap->dragStartY = my;
} else if (state == GLUT_UP) {
bitmap->clickDrag( bitmap->dataBlock,
bitmap->dragStartX,
bitmap->dragStartY,
mx, my );
}
}
}
// static method used for glut callbacks
static void idle_func( void ) {
static int ticks = 1;
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->fAnim( bitmap->dataBlock, ticks++ );
glutPostRedisplay();
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->animExit( bitmap->dataBlock );
//delete bitmap;
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glutSwapBuffers();
}
};
#define DIM 1024
#define PI 3.1415926535897932f
__global__ void kernel( unsigned char *ptr, int ticks ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
// now calculate the value at that position
float fx = x - DIM/2;
float fy = y - DIM/2;
float d = sqrtf( fx * fx + fy * fy );
unsigned char grey = (unsigned char)(128.0f + 127.0f *
cos(d/10.0f - ticks/7.0f) /
(d/10.0f + 1.0f));
ptr[offset*4 + 0] = grey;
ptr[offset*4 + 1] = grey;
ptr[offset*4 + 2] = grey;
ptr[offset*4 + 3] = 255;
}
struct DataBlock {
unsigned char *dev_bitmap;
CPUAnimBitmap *bitmap;
};
void generate_frame( DataBlock *d, int ticks ) {
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>( d->dev_bitmap, ticks );
cudaMemcpy( d->bitmap->get_ptr(),
d->dev_bitmap,
d->bitmap->image_size(),
cudaMemcpyDeviceToHost ) ;
}
// clean up memory allocated on the GPU
void cleanup( DataBlock *d ) {
cudaFree( d->dev_bitmap );
}
int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
cudaMalloc( (void**)&data.dev_bitmap,
bitmap.image_size() ) ;
bitmap.anim_and_exit( (void (*)(void*,int))generate_frame,
(void (*)(void*))cleanup );
}
编译,输出:
4.共享内存和同步
关键字:__share__
用于单个线程块中多个线程之间的通信与协作,其他线程块上的线程无法读取和写入该线程块的共享内存中的变量。数据传输效率高。
4.1 点积运算
#include <stdio.h>
#define imin(a,b) (a<b?a:b)
const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
__global__ void dot( float *a, float *b, float *c ) {
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid < N) {
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
// set the cache values
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
// for reductions, threadsPerBlock must be a power of 2
// because of the following code
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
if (cacheIndex == 0)
c[blockIdx.x] = cache[0];
}
int main( void ) {
float *a, *b, c, *partial_c;
float *dev_a, *dev_b, *dev_partial_c;
// allocate memory on the cpu side
a = (float*)malloc( N*sizeof(float) );
b = (float*)malloc( N*sizeof(float) );
partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_b,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_partial_c,
blocksPerGrid*sizeof(float) ) ;
// fill in the host memory with data
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i*2;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_a, a, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_b, b, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
dev_partial_c );
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy( partial_c, dev_partial_c,
blocksPerGrid*sizeof(float),
cudaMemcpyDeviceToHost ) ;
// finish up on the CPU side
c = 0;
for (int i=0; i<blocksPerGrid; i++) {
c += partial_c[i];
}
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
printf( "Does GPU value %.6g = %.6g?\n", c,
2 * sum_squares( (float)(N - 1) ) );
// free memory on the gpu side
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_partial_c );
// free memory on the cpu side
free( a );
free( b );
free( partial_c );
}
NOTE:__syncthreads()
不能放在if语句下面,他必须被所有线程执行,否则会出事情。当if语句不被执行时,线程将会一直被等待。
输出:
Does GPU value 2.57236e+13 = 2.57236e+13?
4.2 位图
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
#include <stdio.h>
#include <GL/glut.h>
struct CPUBitmap {
unsigned char *pixels;
int x, y;
void *dataBlock;
void (*bitmapExit)(void*);
CPUBitmap( int width, int height, void *d = NULL ) {
pixels = new unsigned char[width * height * 4];
x = width;
y = height;
dataBlock = d;
}
~CPUBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return x * y * 4; }
void display_and_exit( void(*e)(void*) = NULL ) {
CPUBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
bitmapExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
glutInitWindowSize( x, y );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
glutMainLoop();
}
// static method used for glut callbacks
static CPUBitmap** get_bitmap_ptr( void ) {
static CPUBitmap *gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUBitmap* bitmap = *(get_bitmap_ptr());
if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
bitmap->bitmapExit( bitmap->dataBlock );
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glFlush();
}
};
#define DIM 1024
#define PI 3.1415926535897932f
__global__ void kernel( unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
__shared__ float shared[16][16];
// now calculate the value at that position
const float period = 128.0f;
shared[threadIdx.x][threadIdx.y] =
255 * (sinf(x*2.0f*PI/ period) + 1.0f) *
(sinf(y*2.0f*PI/ period) + 1.0f) / 4.0f;
// removing this syncthreads shows graphically what happens
// when it doesn't exist. this is an example of why we need it.
__syncthreads();
ptr[offset*4 + 0] = 0;
ptr[offset*4 + 1] = shared[15-threadIdx.x][15-threadIdx.y];
ptr[offset*4 + 2] = 0;
ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
};
int main( void ) {
DataBlock data;
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() );
data.dev_bitmap = dev_bitmap;
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( dev_bitmap );
cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost ) ;
cudaFree( dev_bitmap );
bitmap.display_and_exit();
}
删除其中的__syncthreads
,得到以下输出:
添加其中的__syncthreads
,得到以下输出: