这里是书上的一个例子,Julia集,实现并行的部分主要是计算每个像素点的值。
#ifndef __BITMAP_H__
#define __BITMAP_H__
#include <windows.h>
#include <GL/glut.h>
/**
 * Minimal RGBA bitmap that owns a host pixel buffer and can display itself
 * in a GLUT window.
 *
 * The pixel buffer holds width * height * 4 bytes (one byte per R, G, B, A
 * channel) and is released in the destructor.
 *
 * NOTE(review): the class owns `pixels` but defines no copy constructor or
 * copy assignment, so copying a Bitmap would lead to a double delete[].
 * Do not copy instances.
 */
class Bitmap {
private:
    unsigned char *pixels;       // width * height * 4 bytes, allocated with new[]
    int x, y;                    // image width and height in pixels
    void *dataBlock;             // opaque pointer passed to bitmapExit when ESC is pressed
    void (*bitmapExit)(void*);   // optional clean-up callback, set by display_and_exit()

    // Returns the address of a function-local static pointer. The static lives
    // for the whole program, so the GLUT callbacks (which receive no user
    // argument) can find the bitmap that is currently being displayed.
    static Bitmap** get_bitmap_ptr( void ) {
        static Bitmap *gBitmap;
        return &gBitmap;
    }

    // GLUT display callback: clears the window to black and draws the pixels.
    static void Draw( void ) {
        Bitmap* bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
        glFlush();
    }

    // GLUT keyboard callback: on ESC, runs the optional clean-up callback on
    // dataBlock (only when both are non-NULL) and terminates the process.
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {  // ESC key
                // Braces give the local its own scope so that adding further
                // case labels later cannot jump over its initialization.
                Bitmap* bitmap = *(get_bitmap_ptr());
                if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
                    bitmap->bitmapExit( bitmap->dataBlock );
                exit(0);
            }
            default:
                break;
        }
    }

public:
    /**
     * Allocates an uninitialized width x height RGBA buffer.
     *
     * @param width   image width in pixels
     * @param height  image height in pixels
     * @param d       optional opaque pointer handed to the exit callback
     */
    Bitmap( int width, int height, void *d = NULL )
        : pixels( new unsigned char[width * height * 4] ),
          x( width ),
          y( height ),
          dataBlock( d ),
          bitmapExit( NULL ) {  // was left uninitialized before display_and_exit()
    }

    ~Bitmap() {
        delete[] pixels;
    }

    /** Returns the start of the RGBA pixel buffer. */
    unsigned char* get_ptr( void ) const { return pixels; }

    /** Returns the buffer size in bytes (width * height * 4). */
    long image_size( void ) const {
        // Widen before multiplying so the product is computed in long.
        return static_cast<long>( x ) * y * 4;
    }

    /**
     * Opens a GLUT window showing this bitmap and enters the GLUT main loop.
     *
     * @param e  optional callback invoked with dataBlock when ESC is pressed,
     *           right before the process exits
     */
    void display_and_exit( void(*e)(void*) = NULL ) {
        // Publish this object so the static callbacks can reach it.
        Bitmap** bitmap = get_bitmap_ptr();
        *bitmap = this;
        bitmapExit = e;

        // glutInit wants a mutable argc/argv pair. A string literal must not
        // be bound to a non-const char*, so use a writable array instead.
        int argc = 1;
        char progName[] = "";
        char* argv[] = { progName, NULL };
        glutInit( &argc, argv );

        glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
        glutInitWindowSize( x, y );
        glutCreateWindow( "bitmap" );
        // The callbacks must be static: a non-static member would have type
        // void (Bitmap::*)() rather than the plain void (*)() GLUT expects.
        glutDisplayFunc( Draw );
        glutKeyboardFunc( Key );
        glutMainLoop();
    }
};
#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "bitmap.h"
#define DIM 1000
/**
 * Minimal single-precision complex number.
 *
 * All members are usable from both host and device code, and the arithmetic
 * members are const so they can be applied to const operands and temporaries.
 */
struct cuComplex {
    float r;  // real part
    float i;  // imaginary part

    __host__ __device__ cuComplex( float a, float b ) : r(a), i(b) {}

    /** Returns the squared magnitude r*r + i*i (no square root is taken). */
    __host__ __device__ float magnitude2( void ) const { return r * r + i * i; }

    /** Complex product: (r + i*j)(a.r + a.i*j). */
    __host__ __device__ cuComplex operator*( const cuComplex& a ) const {
        return cuComplex( r*a.r - i*a.i, i*a.r + r*a.i );
    }

    /** Component-wise complex sum. */
    __host__ __device__ cuComplex operator+( const cuComplex& a ) const {
        return cuComplex( r+a.r, i+a.i );
    }
};
/**
 * Decides whether pixel (x, y) of a DIM x DIM image belongs to the Julia set
 * of z -> z*z + c with c = -0.8 + 0.156i.
 *
 * The pixel is mapped to the complex plane with the image centre at the
 * origin and the image edge at +/- 1.5. The map is iterated at most 200
 * times; the point is treated as escaping as soon as its squared magnitude
 * exceeds 1000.
 *
 * @return 1 if the point never escaped within 200 iterations, 0 otherwise
 */
__device__ int julia( int x, int y ){
    const float scale = 1.5f;
    const int half = DIM / 2;

    // Pixel coordinates -> complex-plane coordinates.
    const float re = scale * (float)(half - x) / half;
    const float im = scale * (float)(half - y) / half;

    cuComplex c( -0.8f, 0.156f );
    cuComplex z( re, im );

    int remaining = 200;
    while (remaining-- > 0) {
        z = z * z + c;
        if (z.magnitude2() > 1000)
            return 0;  // diverged: not in the set
    }
    return 1;
}
/**
 * Renders one pixel of the Julia-set image per thread block.
 *
 * Launch layout: a 2D grid whose (blockIdx.x, blockIdx.y) is the pixel
 * coordinate; thread indices are not used. The image row length is taken
 * from gridDim.x.
 *
 * @param ptr  device buffer of gridDim.x * gridDim.y RGBA pixels (4 bytes each)
 */
__global__ void kernel( unsigned char * ptr ){
    const int col = blockIdx.x;
    const int row = blockIdx.y;
    const int pixelIndex = col + row * gridDim.x;

    // Each pixel occupies four consecutive bytes: R, G, B, A.
    unsigned char *rgba = ptr + pixelIndex * 4;

    const int inSet = julia( col, row );
    rgba[0] = 255 * inSet;  // red where the point is in the set, black elsewhere
    rgba[1] = 0;
    rgba[2] = 0;
    rgba[3] = 255;          // fully opaque
}
// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk( cudaError_t status, const char *what ) {
    if (status == cudaSuccess)
        return true;
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

/**
 * Renders the Julia set on the GPU into a DIM x DIM bitmap and displays it.
 *
 * @return 0 on success, 1 if any CUDA call failed
 */
int main(){
    Bitmap bitmap( DIM, DIM );
    unsigned char * dev_bitmap = NULL;

    // Allocate the image buffer on the GPU.
    if (!cudaOk( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ), "cudaMalloc" ))
        return 1;

    // Two-dimensional grid: one block (with one thread) per pixel.
    dim3 grid( DIM, DIM );
    kernel<<<grid, 1>>>( dev_bitmap );

    // A kernel launch returns no status; a bad launch configuration is
    // reported through cudaGetLastError().
    bool ok = cudaOk( cudaGetLastError(), "kernel launch" );

    // The blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk( cudaMemcpy( bitmap.get_ptr(), dev_bitmap, bitmap.image_size(),
                                   cudaMemcpyDeviceToHost ),
                       "cudaMemcpy" );

    // Release the device buffer before showing the window: display_and_exit()
    // enters glutMainLoop() and the program leaves through exit(0) in the key
    // handler, so code placed after it is not expected to run.
    ok = cudaOk( cudaFree( dev_bitmap ), "cudaFree" ) && ok;

    if (!ok)
        return 1;

    bitmap.display_and_exit();
    return 0;
}
整个程序比较简单，不太熟悉的部分是与OpenGL的配合使用。运行结果如下：
在接下来的实例中，有没有__syncthreads()对结果有直接的影响。
#include "bitmap.h"
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define DIM 1024
#define PI 3.1415926535897932
/**
 * Fills an RGBA image with a green sine pattern, exchanging values between
 * the threads of a block through shared memory.
 *
 * Launch layout: 2D grid of 2D blocks; each block must be exactly 16 x 16
 * threads (the shared tile is sized for that). One thread per pixel; the image
 * row length is blockDim.x * gridDim.x. Uses 16*16*sizeof(float) bytes of
 * static shared memory.
 *
 * @param ptr  device buffer with 4 bytes (R, G, B, A) per launched thread
 */
__global__ void kernel(unsigned char * ptr){
    const int TILE = 16;  // must equal blockDim.x and blockDim.y

    int x = threadIdx.x + blockIdx.x * blockDim.x;  // pixel column
    int y = threadIdx.y + blockIdx.y * blockDim.y;  // pixel row
    int offset = x + y * blockDim.x * gridDim.x;    // linear pixel index

    // Per-block scratch holding one intensity value per thread. The tile is
    // indexed [y][x]: threadIdx.x varies fastest between neighbouring threads,
    // so using it for the inner dimension makes them touch consecutive
    // addresses and avoids shared-memory bank conflicts.
    __shared__ float shared[TILE][TILE];

    const float period = 128.0f;
    // PI is a double literal; narrow it once so the sine arguments stay in
    // single precision instead of being promoted to double.
    const float pi = static_cast<float>(PI);

    shared[threadIdx.y][threadIdx.x] =
        255 * (sinf(x * 2.0f * pi / period) + 1.0f) *
              (sinf(y * 2.0f * pi / period) + 1.0f) / 4.0f;

    // Barrier: every thread reads a value written by a different thread of
    // the block. Without it a thread may read its mirrored slot before the
    // owning thread has written it.
    __syncthreads();

    ptr[offset*4 + 0] = 0;
    // Read the slot mirrored through the tile centre.
    ptr[offset*4 + 1] = shared[TILE - 1 - threadIdx.y][TILE - 1 - threadIdx.x];
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}
// Reports a failed CUDA runtime call on std::cerr; returns true on success.
static bool cudaOk(cudaError_t status, const char* what){
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Runs the shared-memory demo kernel over a DIM x DIM image and displays it.
 *
 * @return 0 on success, 1 if any CUDA call failed
 */
int main(){
    Bitmap bitmap(DIM,DIM);
    unsigned char* dev_bitmap = NULL;

    if (!cudaOk(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()), "cudaMalloc"))
        return 1;

    // Both the grid of blocks and the threads inside each block are 2D:
    // (DIM/16) x (DIM/16) blocks of 16 x 16 threads, one thread per pixel.
    dim3 grids(DIM/16,DIM/16);
    dim3 blocks(16,16);
    kernel<<<grids,blocks>>>(dev_bitmap);

    // A kernel launch returns no status; a bad launch configuration is
    // reported through cudaGetLastError().
    bool ok = cudaOk(cudaGetLastError(), "kernel launch");

    // Copy the device image into the host buffer returned by get_ptr(). The
    // blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(),
                                 cudaMemcpyDeviceToHost),
                      "cudaMemcpy");

    // Free the device buffer before showing the window: display_and_exit()
    // enters glutMainLoop() and the program leaves through exit(0) in the key
    // handler, so a cudaFree placed after it is not expected to run.
    ok = cudaOk(cudaFree(dev_bitmap), "cudaFree") && ok;

    if (!ok)
        return 1;

    bitmap.display_and_exit();
    return 0;
}
运行结果:
如果没有同步函数，那么当某个线程读取shared中的值时，负责写入该位置的线程可能还没有完成写入操作。这种情况下，运行结果为：