Recording events (timing GPU code with CUDA events)
Workflow:
1. Declare the start/stop cudaEvent_t variables:
cudaEvent_t start, stop;
2. Declare a variable for the elapsed time:
float elapsedTime;
3. Create the events:
CHECK( cudaEventCreate( &start ) );
CHECK( cudaEventCreate( &stop ) );
4. Record the start event to begin timing:
CHECK( cudaEventRecord( start, 0 ) );
5. ... the code to be timed ...
6. Record the stop event, wait for it, then read out the elapsed time:
CHECK( cudaEventRecord( stop, 0 ) );
CHECK( cudaEventSynchronize( stop ) );
CHECK( cudaEventElapsedTime( &elapsedTime, start, stop ) );
printf( "Time taken: %3.1f ms\n", elapsedTime );
Using CUDA streams
Purpose: each stream is an independent queue of GPU work (loosely analogous to a CPU process); with multiple streams, kernels and copies issued to different streams can execute concurrently and overlap.
cudaStream_t stream0, stream1; //declare the streams
CHECK( cudaStreamCreate( &stream0 ) ); //create (initialize) the streams
CHECK( cudaStreamCreate( &stream1 ) );
// enqueue copies of a in stream0 and stream1 (this queues work into each stream; it does not allocate memory)
CHECK( cudaMemcpyAsync( dev_a0, host_a+i, N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) );
CHECK( cudaMemcpyAsync( dev_a1, host_a+i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) );
// enqueue copies of b in stream0 and stream1; note: unlike cudaMemcpy, cudaMemcpyAsync returns immediately
CHECK( cudaMemcpyAsync( dev_b0, host_b+i, N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) );
CHECK( cudaMemcpyAsync( dev_b1, host_b+i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) );
//launch one kernel per stream (the fourth launch parameter selects the stream)
kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );
//finally, synchronize both streams
CHECK( cudaStreamSynchronize( stream0 ) );
CHECK( cudaStreamSynchronize( stream1 ) );
//destroy the streams
CHECK( cudaStreamDestroy( stream0 ) );
CHECK( cudaStreamDestroy( stream1 ) );
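One caveat: cudaMemcpyAsync is only truly asynchronous when the host buffers are page-locked (pinned); with ordinary pageable memory the copy degrades to synchronous behavior. A minimal sketch of allocating a pinned buffer (FULL_SIZE is a placeholder for the total element count of the example above):
int *host_a;
CHECK( cudaHostAlloc( (void**)&host_a, FULL_SIZE * sizeof(int), cudaHostAllocDefault ) ); //pinned allocation
//... use host_a with cudaMemcpyAsync ...
CHECK( cudaFreeHost( host_a ) ); //pinned memory must be freed with cudaFreeHost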
Using the cuBLAS acceleration library
First, a handle must be declared and created.
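A minimal sketch of the handle lifecycle (n and the device vector d_x of n floats are assumed to exist already; link with -lcublas):
#include <cublas_v2.h>

cublasHandle_t handle;
cublasCreate( &handle );                   //create the handle before any cuBLAS call
float result = 0.0f;
cublasSasum( handle, n, d_x, 1, &result ); //example call: sum of |x[i]| over the n floats at d_x
cublasDestroy( handle );                   //release the handle when done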
Example 1: vector sum
Idea: assign one thread per element. Within each block (slots past the end of the vector are padded with 0), split the elements at the middle index and add the two halves pairwise, writing each sum back; the active length halves every pass until it cannot be split further, leaving the block's partial sum in the first element of its shared memory.
#include<stdio.h>
#include<stdint.h>
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#define KEN_CHECK(r) \
{\
cudaError_t rr = r; \
if (rr != cudaSuccess)\
{\
fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n", \
cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
exit(-1);\
}\
}
#define N 901
#define BLOCK_SIZE 1024
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE) //try next line if you can
//#define BLOCKS 666
__managed__ int source[N]; //input data
__managed__ int final_result[1] = {0}; //scalar output
__global__ void _hawk_sum_gpu(int *input, int count, int *output)
{
__shared__ int shared_mem[BLOCK_SIZE];
//**********register summation stage***********
int idx=threadIdx.x+blockIdx.x*BLOCK_SIZE;
if(idx<count)
{
shared_mem[threadIdx.x] = input[idx]; //copy this thread's element into the block's shared memory
}
else
shared_mem[threadIdx.x]=0; //pad positions past the end of the vector with 0
__syncthreads();
//**********shared memory summation stage***********
for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2) //length = 512, 256, ..., 1
{
int pair_sum = -1;
if (threadIdx.x < length)
{
pair_sum = shared_mem[threadIdx.x] + shared_mem[threadIdx.x + length]; //add the two halves pairwise
}
__syncthreads(); //barrier 1: every thread finishes reading its pair before shared_mem is overwritten...
if (threadIdx.x < length)
{
shared_mem[threadIdx.x] = pair_sum;
}
__syncthreads(); //barrier 2: ...and every write lands before the next iteration reads
} //the per-block partial sum is shared_mem[0]
if (blockDim.x * blockIdx.x < count) //skip blocks that lie entirely past the end of the data (possible if BLOCKS is overridden)
{
//the final reduction performed by atomicAdd()
if (threadIdx.x == 0) atomicAdd(output, shared_mem[0]);
}
}
int _hawk_sum_cpu(int *ptr, int count)
{
int sum = 0;
for (int i = 0; i < count; i++)
{
sum += ptr[i];
}
return sum;
}
void _nanana_init(int *ptr, int count)
{
uint32_t seed = (uint32_t)time(NULL); //make huan happy
srand(seed); //reseeding the random generator
//filling the buffer with random data
for (int i = 0; i < count; i++) ptr[i] = rand();
}
double get_time()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}
int main()
{
//**********************************
fprintf(stderr, "nanana is filling the buffer with %d elements...\n", N);
_nanana_init(source, N);
//**********************************
//Now we are going to kick start your kernel.
cudaDeviceSynchronize(); //steady! ready! go!
//Good luck & have fun!
fprintf(stderr, "Running on GPU...\n");
double t0 = get_time();
_hawk_sum_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
KEN_CHECK(cudaGetLastError()); //checking for launch failures
KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();
int A = final_result[0];
fprintf(stderr, "GPU sum: %u\n", A);
//**********************************
//Now we are going to exercise your CPU...
fprintf(stderr, "Running on CPU...\n");
double t2 = get_time();
int B = _hawk_sum_cpu(source, N);
double t3 = get_time();
fprintf(stderr, "CPU sum: %u\n", B);
//******The last judgement**********
if (A == B)
{
fprintf(stderr, "Test Passed!\n");
}
else
{
fprintf(stderr, "Test failed!\n");
exit(-1);
}
//****and some timing details*******
fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);
return 0;
}
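To build and run this listing (assuming it is saved as sum.cu):
nvcc -O2 -o sum sum.cu
./sum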
Example 2: finding the min and max elements
Same approach as the previous example, except elements are compared pairwise instead of added. Note that each pass of the comparison loop performs only length comparisons. When the loop finishes, the first element of shared memory holds the block's min (or max); but since every block has its own independent shared memory, the per-block results must still be reduced across all blocks, which is done here with atomicMin/atomicMax.
#include<stdio.h>
#include<stdint.h>
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#include<math.h>
#include<limits.h> //for INT_MAX/INT_MIN
#define KEN_CHECK(r) \
{\
cudaError_t rr = r; \
if (rr != cudaSuccess)\
{\
fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n", \
cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
exit(-1);\
}\
}
#define N 900000000
#define BLOCK_SIZE 1024
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE) //try next line if you can
//#define BLOCKS 666
__managed__ int source[N]; //input data
__managed__ int final_result[2 * 1] = {INT_MAX, INT_MIN}; //output
__global__ void minmax_gpu(int *input, int count,int *output)
{
__shared__ int find_min[BLOCK_SIZE];
__shared__ int find_max[BLOCK_SIZE];
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if(idx<count)
{
find_min[threadIdx.x]=input[idx];
find_max[threadIdx.x]=input[idx];
}
else
{
    //pad out-of-range slots with a real element (input[0]) so they cannot affect the result
    find_min[threadIdx.x]=input[0];
    find_max[threadIdx.x]=input[0];
}
__syncthreads();
for(int length=BLOCK_SIZE/2;length>=1;length=length/2)
{
    if(threadIdx.x<length) //important: only length threads compare per pass
    {
        find_min[threadIdx.x]=min(find_min[threadIdx.x],find_min[threadIdx.x+length]);
        find_max[threadIdx.x]=max(find_max[threadIdx.x],find_max[threadIdx.x+length]);
    }
    __syncthreads();
}
//combine the per-block results (slot 0 of each block's shared memory) across all blocks
if (threadIdx.x == 0) atomicMin(output, find_min[0]);
if (threadIdx.x == 0) atomicMax(output+1, find_max[0]);
//a plain store here (output[0] = find_min[0]) would race between blocks, hence the atomics
}
typedef struct
{
int min;
int max;
}cpu_result_t;
cpu_result_t _hawk_minmax_cpu(int *ptr, int count)
{
int YZP_min = INT_MAX;
int YZP_max = INT_MIN;
for (int i = 0; i < count; i++)
{
YZP_min = min(YZP_min, ptr[i]);
YZP_max = max(YZP_max, ptr[i]);
}
cpu_result_t r;
{
r.min = YZP_min;
r.max = YZP_max;
}
return r;
}
void _nanana_init(int *ptr, int count)
{
uint32_t seed = (uint32_t)time(NULL); //make huan happy
srand(seed); //reseeding the random generator
//filling the buffer with random data
//for (int i = 0; i < count; i++) ptr[i] = (rand() << 3) ^ rand();
for (int i = 0; i < count; i++) ptr[i] = rand();
}
double get_time()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}
int main()
{
//**********************************
fprintf(stderr, "nanana is filling the buffer with %d elements...\n", N);
_nanana_init(source, N);
//**********************************
//Now we are going to kick start your kernel.
cudaDeviceSynchronize(); //steady! ready! go!
//Good luck & have fun!
fprintf(stderr, "Running on GPU...\n");
double t0 = get_time();
minmax_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
KEN_CHECK(cudaGetLastError()); //checking for launch failures
KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();
int A0 = final_result[0];
int A1 = final_result[1];
fprintf(stderr, "GPU min: %d, max: %d\n", A0, A1);
//**********************************
//Now we are going to exercise your CPU...
fprintf(stderr, "Running on CPU...\n");
double t2 = get_time();
cpu_result_t B = _hawk_minmax_cpu(source, N);
double t3 = get_time();
fprintf(stderr, "CPU min: %d, max: %d\n", B.min, B.max);
//******The last judgement**********
if (A0 == B.min && A1 == B.max)
{
fprintf(stderr, "Test Passed!\n");
}
else
{
fprintf(stderr, "Test failed!\n");
exit(-1);
}
//****and some timing details*******
fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);
return 0;
}
Matrix transpose
One principle: global memory accesses should be as contiguous as possible, i.e. the x coordinate should vary fastest, because a 2D matrix is stored as a flat row-major 1D array. For shared memory the access order matters far less; in my tests coalescing the global accesses made a difference of more than 2x.
However, swapping the shared-memory access order (share[threadIdx.x][threadIdx.y]) does not change how the blocks themselves are ordered; it only changes the element order within each tile.
Also, the shared-memory tile merely needs dimensions greater than or equal to the block size, since blockDim.x is not affected by the tile's declared dimensions.
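A side note not reflected in the listing below: reading the tile column-wise (transpose[threadIdx.x][threadIdx.y]) makes the threads of a warp hit the same shared-memory bank. The standard remedy, used in NVIDIA's transpose sample, is to pad the tile by one column so consecutive rows fall in different banks:
__shared__ int transpose[BLOCK_SIZE][BLOCK_SIZE + 1]; //+1 column avoids shared-memory bank conflicts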
#include<stdio.h>
#include<stdint.h>
#include<time.h> //for time()
#include<stdlib.h> //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#include<assert.h>
#define KEN_CHECK(r) \
{\
cudaError_t rr = r; \
if (rr != cudaSuccess)\
{\
fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n", \
cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
exit(-1);\
}\
}
#define M 3000 //three thousand and one nights
#define BLOCK_SIZE 16
__managed__ int shark[M][M]; //input matrix
__managed__ int gpu_shark_T[M][M];//GPU result
__managed__ int cpu_shark_T[M][M];//CPU result
__global__ void _ZHI_transpose(int A[M][M], int B[M][M])
{
__shared__ int transpose[BLOCK_SIZE][BLOCK_SIZE]; //staging tile: written row-wise here, read back transposed below
int thread_x = threadIdx.x + blockDim.x * blockIdx.x;
int thread_y = threadIdx.y + blockDim.y * blockIdx.y;
if (thread_x < M && thread_y < M)
{
transpose[threadIdx.y][threadIdx.x] = A[thread_y][thread_x]; //coalesced read from A into the tile
//B[thread_x][thread_y] = A[thread_y][thread_x]; //naive alternative: the write to B would be uncoalesced
}
__syncthreads();
int y2 = threadIdx.y + blockDim.x * blockIdx.x; //block indices are swapped for the output tile
int x2 = threadIdx.x + blockDim.y * blockIdx.y;
if (y2 < M && x2 < M)
{
    B[y2][x2] = transpose[threadIdx.x][threadIdx.y]; //read the tile transposed, write B coalesced
}
}
void _sparks_transpose_cpu(int A[M][M], int B[M][M])
{
for (int j = 0; j < M; j++)
{
for (int i = 0; i < M; i++)
{
B[i][j] = A[j][i];
}
}
}
void DDBDDH_init(int A[M][M])
{
uint32_t seed = (uint32_t)time(NULL); //make huan happy
srand(seed); //reseeding the random generator
//filling the matrix with random data
for (int j = 0; j < M; j++)
{
for (int i = 0; i < M; i++)
{
A[j][i] = rand();
}
}
}
double get_time()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}
int main()
{
//**********************************
fprintf(stderr, "DDBDDH is filling the %dx%d maxtrix with random data\n",
M, M);
DDBDDH_init(shark);
//**********************************
//Now we are going to kick start your kernel.
cudaDeviceSynchronize(); //steady! ready! go!
//Good luck & have fun!
fprintf(stderr, "Running on GPU...\n");
double t0 = get_time();
int n = (M + BLOCK_SIZE - 1) / BLOCK_SIZE; //number of tiles per dimension, rounded up
dim3 grid_shape(n, n);
dim3 block_shape(BLOCK_SIZE, BLOCK_SIZE);
_ZHI_transpose<<<grid_shape, block_shape>>>(shark, gpu_shark_T);
KEN_CHECK(cudaGetLastError()); //checking for launch failures
KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();
//**********************************
//Now we are going to exercise your CPU...
fprintf(stderr, "Running on CPU...\n");
double t2 = get_time();
_sparks_transpose_cpu(shark, cpu_shark_T);
double t3 = get_time();
//******The last judgement**********
for (int j = 0; j < M; j++)
{
for (int i = 0; i < M; i++)
{
if (gpu_shark_T[j][i] != cpu_shark_T[j][i])
{
fprintf(stderr, "Test failed!\n");
exit(-1);
}
}
}
fprintf(stderr, "Test Passed!\n");
//****and some timing details*******
fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);
return 0;
}