#include"cuda.h"
#include"book.h"
#include"cpu_bitmap.h"
// Edge length of the square image in pixels; used for both the bitmap size
// and (divided by the 16x16 block size) the launch grid.
#define DIM 1024
// Approximation of pi used by the kernel's trigonometric pattern.
// Note: this is a double literal (no `f` suffix).
#define PI 3.1415926
/*
 * Renders one RGBA pixel per thread into `ptr`.
 *
 * Each thread computes a cosine/sine product for its own (x, y) pixel,
 * stores it in a per-block shared-memory tile, and after a block-wide
 * barrier writes to its pixel's green channel the value computed by the
 * thread at the mirrored position (15 - tx, 15 - ty) of the same block.
 * Red is fixed at 0, blue at 192 and alpha at 255.
 *
 * Launch requirements:
 *   - blockDim must be 16x16: the shared tile is sized for exactly that
 *     block shape and is indexed directly by threadIdx.
 *   - ptr must point to at least
 *     4 * (blockDim.x * gridDim.x) * (blockDim.y * gridDim.y) bytes;
 *     there is no bounds check, so the grid must cover the image exactly.
 */
__global__ void kernel(unsigned char *ptr)
{
    // Edge length of the thread block / shared tile.
    const int TILE = 16;

    // Map threadIdx/blockIdx to a pixel position and a linear pixel index.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // Shared buffer holding one result per thread of the 16x16 block.
    // The inner dimension is padded by one element so that threads of a
    // warp, which walk the first index, do not stride by a multiple of 16
    // floats (reduces shared-memory bank conflicts).
    __shared__ float shared[TILE][TILE + 1];

    // Period T of the sinusoids in pixels (angular frequency w = 2*pi/T).
    const float period = 128.0f;
    // PI is a double literal; convert once so all math below stays in
    // single precision.
    const float pi = static_cast<float>(PI);

    // Alternative patterns kept from the original for reference:
    //   255*(sinf(x*2.0f*PI/period)+1.0f)*(sinf(y*2.0f*PI/period)+1.0f)/4.0f
    //   255*(cosf(x*2.0f*PI/period)+3.0f)*(cosf(y*2.0f*PI/period)+3.0f)/8.0f
    float value = 255.0f * (cosf((x + pi / 2.0f) * 2.0f * pi / period) *
                            sinf(y * 2.0f * pi / period));

    // The product lies in [-255, 255]. Clamp to [0, 255] so the later
    // conversion to unsigned char is well defined.
    shared[threadIdx.x][threadIdx.y] = fminf(fmaxf(value, 0.0f), 255.0f);

    // Barrier: every thread reads a tile element written by a different
    // thread, so all writes must be complete before any read.
    __syncthreads();

    // Store the pixel as four consecutive bytes (R, G, B, A).
    const size_t base = static_cast<size_t>(offset) * 4;
    ptr[base + 0] = 0;
    ptr[base + 1] = static_cast<unsigned char>(
        shared[TILE - 1 - threadIdx.x][TILE - 1 - threadIdx.y]);
    ptr[base + 2] = 192;
    ptr[base + 3] = 255;
}
/*
 * Allocates a DIM x DIM device image, fills it with `kernel` using 16x16
 * thread blocks, copies the result into a host CPUBitmap and displays it.
 */
int main(void)
{
    CPUBitmap bitmap(DIM, DIM);
    unsigned char *dev_bitmap;
    HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap, bitmap.image_size()));

    // One thread per pixel: DIM/16 blocks of 16x16 threads in each dimension.
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap);
    // A kernel launch returns no status; query it explicitly so a bad launch
    // configuration is reported here instead of at a later call.
    HANDLE_ERROR(cudaGetLastError());

    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(),
                            cudaMemcpyDeviceToHost));

    // The device buffer is no longer needed once the image is on the host.
    // Release it before displaying: display_and_exit() presumably does not
    // return (going by its name), which would leave a later cudaFree unreached.
    HANDLE_ERROR(cudaFree(dev_bitmap));

    bitmap.display_and_exit();
    return 0;
}