CUDA C Programming Notes

Recording events (timing with CUDA events)

Workflow:
1. cudaEvent_t start, stop;  // declare the start and stop cudaEvent_t variables
2. float elapsedTime;        // variable that will hold the elapsed time
3. Create the two events:

    CHECK( cudaEventCreate( &start ) );
    CHECK( cudaEventCreate( &stop ) );

4. CHECK( cudaEventRecord( start, 0 ) );  // start timing
5. ... the code to be timed ...
6. Stop the timer and report the elapsed time:

    CHECK( cudaEventRecord( stop, 0 ) );
    CHECK( cudaEventSynchronize( stop ) );
    CHECK( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

Using CUDA streams

Purpose: each stream is an independent, in-order queue of work on the GPU (loosely analogous to a CPU thread). With multiple streams, kernels and memory copies issued to different streams can be in flight at the same time and overlap with each other.

cudaStream_t stream0, stream1;             //declare the streams
CHECK( cudaStreamCreate( &stream0 ) );     //create (initialize) the streams
CHECK( cudaStreamCreate( &stream1 ) );

// enqueue async copies of a into stream0 and stream1
// (note: unlike cudaMemcpy, cudaMemcpyAsync returns immediately; the host buffers should be page-locked for the copies to really overlap)
CHECK( cudaMemcpyAsync( dev_a0, host_a+i,   N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) );
CHECK( cudaMemcpyAsync( dev_a1, host_a+i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) );
// enqueue async copies of b into stream0 and stream1
CHECK( cudaMemcpyAsync( dev_b0, host_b+i,   N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) );
CHECK( cudaMemcpyAsync( dev_b1, host_b+i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) );

// launch the kernel in each stream
kernel<<<N/256, 256, 0, stream0>>>( dev_a0, dev_b0, dev_c0 );
kernel<<<N/256, 256, 0, stream1>>>( dev_a1, dev_b1, dev_c1 );

// finally, synchronize the streams
CHECK( cudaStreamSynchronize( stream0 ) );
CHECK( cudaStreamSynchronize( stream1 ) );

// destroy them
CHECK( cudaStreamDestroy( stream0 ) );
CHECK( cudaStreamDestroy( stream1 ) );
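
The snippets above assume the buffers already exist. A minimal sketch of the missing setup, assuming a total of FULL_SIZE elements processed in chunks of N (all names here are hypothetical):

    // page-locked (pinned) host memory so cudaMemcpyAsync can overlap with kernels
    int *host_a;
    CHECK( cudaHostAlloc( (void**)&host_a, FULL_SIZE * sizeof(int), cudaHostAllocDefault ) );

    // per-stream device buffers, each holding one chunk of N elements
    int *dev_a0, *dev_a1;
    CHECK( cudaMalloc( (void**)&dev_a0, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_a1, N * sizeof(int) ) );

    // walk over the full buffer two chunks at a time,
    // giving one chunk to stream0 and the next to stream1
    for (int i = 0; i < FULL_SIZE; i += 2 * N)
    {
        CHECK( cudaMemcpyAsync( dev_a0, host_a + i,     N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) );
        CHECK( cudaMemcpyAsync( dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) );
        // ... enqueue the b copies, the kernel launches and the device-to-host copies here, as shown above ...
    }

    CHECK( cudaFreeHost( host_a ) );
    CHECK( cudaFree( dev_a0 ) );
    CHECK( cudaFree( dev_a1 ) );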

Using the cuBLAS library

The first step is to declare and create a cuBLAS handle; the library calls then take it as their first argument.
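
A minimal sketch of that boilerplate using the cuBLAS v2 API (the device vectors d_x and d_y of length n are hypothetical and assumed to be allocated already):

    #include <cublas_v2.h>

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)   // create the handle once
    {
        fprintf(stderr, "cublasCreate failed\n");
        exit(-1);
    }

    const float alpha = 1.0f;
    // SAXPY on the GPU: d_y = alpha * d_x + d_y
    cublasSaxpy(handle, n, &alpha, d_x, 1, d_y, 1);

    cublasDestroy(handle);                                // release the handle when done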

Example 1: vector sum (a hand-written reduction kernel)

The idea: assign one thread to every element. Each block copies its slice of the vector into shared memory (positions beyond the end of the vector are padded with 0), then the block's data is repeatedly cut in half at the middle index and the two halves are added element by element, until only one value is left; that per-block partial sum ends up in the first element of the block's shared memory. The per-block sums are finally combined into the scalar result with atomicAdd.

#include<stdio.h>
#include<stdint.h>
#include<time.h>     //for time()
#include<stdlib.h>   //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval


#define KEN_CHECK(r) \
{\
    cudaError_t rr = r;   \
    if (rr != cudaSuccess)\
    {\
        fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n",       \
		        cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
        exit(-1);\
    }\
}

#define N 901
#define BLOCK_SIZE 1024
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE) //try next line if you can
//#define BLOCKS 666

__managed__ int source[N];               //input data
__managed__ int final_result[1] = {0};   //scalar output

__global__ void _hawk_sum_gpu(int *input, int count, int *output)
{
    __shared__ int shared_mem[BLOCK_SIZE];

    //**********load stage: copy this block's slice of the input into shared memory***********
    int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE;
    if (idx < count)
    {
        shared_mem[threadIdx.x] = input[idx]; //copy the input value into this block's shared memory
    }
    else
    {
        shared_mem[threadIdx.x] = 0;          //pad the positions beyond the end of the vector with 0
    }

    __syncthreads();

    //**********shared memory summation stage***********
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)  //length = 512, 256, ..., 1
    {
        int pair_sum = -1;
        if (threadIdx.x < length)
        {
            pair_sum = shared_mem[threadIdx.x] + shared_mem[threadIdx.x + length]; //add the two halves pairwise
        }
        __syncthreads();  //first barrier: every thread must finish reading its pair
                          //before any element of shared_mem is overwritten...

        if (threadIdx.x < length)
        {
            shared_mem[threadIdx.x] = pair_sum;
        }
        __syncthreads();  //...second barrier: all writes must be visible before the next iteration reads them

    } //the per-block partial sum is shared_mem[0]

    if (blockDim.x * blockIdx.x < count) //in case that our users are naughty
    {
        //the final reduction performed by atomicAdd()
        if (threadIdx.x == 0) atomicAdd(output, shared_mem[0]);
    }
}

int _hawk_sum_cpu(int *ptr, int count)
{
    int sum = 0;
    for (int i = 0; i < count; i++)
    {
        sum += ptr[i];
    }
    return sum;
}

void _nanana_init(int *ptr, int count)
{
    uint32_t seed = (uint32_t)time(NULL); //make huan happy
    srand(seed);  //reseeding the random generator

    //filling the buffer with random data
    for (int i = 0; i < count; i++) ptr[i] = rand();
}

double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}

int main()
{
    //**********************************
    fprintf(stderr, "nanana is filling the buffer with %d elements...\n", N);
    _nanana_init(source, N);

    //**********************************
    //Now we are going to kick start your kernel.
    cudaDeviceSynchronize(); //steady! ready! go!
    //Good luck & have fun!
    
    fprintf(stderr, "Running on GPU...\n");
    
double t0 = get_time();
    _hawk_sum_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
    KEN_CHECK(cudaGetLastError());      //checking for launch failures
    KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();

    int A = final_result[0];
    fprintf(stderr, "GPU sum: %u\n", A);


    //**********************************
    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

double t2 = get_time();
    int B = _hawk_sum_cpu(source, N);
double t3 = get_time();
    fprintf(stderr, "CPU sum: %u\n", B);

    //******The last judgement**********
    if (A == B)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
	exit(-1);
    }
    
    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}	
	

Example 2: finding the minimum and maximum element

Same scheme as the previous example, but with pairwise comparisons instead of additions. Note that in each round of the for loop only the first length threads perform a comparison. At the end, the first element of each block's shared memory holds that block's minimum (or maximum); but every block has its own independent shared memory, so the block-level results still have to be combined across all blocks (here with atomicMin/atomicMax) to obtain the global minimum and maximum.

#include<stdio.h>
#include<stdint.h>
#include<time.h>     //for time()
#include<stdlib.h>   //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#include<math.h>
#include<limits.h>   //for INT_MAX/INT_MIN


#define KEN_CHECK(r) \
{\
    cudaError_t rr = r;   \
    if (rr != cudaSuccess)\
    {\
        fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n",       \
		        cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
        exit(-1);\
    }\
}

#define N 900000000
#define BLOCK_SIZE 1024
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE) //try next line if you can
//#define BLOCKS 666

__managed__ int source[N];                                //input data
__managed__ int final_result[2 * 1] = {INT_MAX, INT_MIN}; //output


__global__ void minmax_gpu(int *input, int count,int *output)
{
    __shared__ int find_min[BLOCK_SIZE];
    __shared__ int find_max[BLOCK_SIZE];
 
    int idx=threadIdx.x+blockDim.x*blockIdx.x;
    if(idx<count)
    {
        find_min[threadIdx.x]=input[idx];
        find_max[threadIdx.x]=input[idx];
    }
    else
    {
        //threads past the end of the array just duplicate input[0],
        //which can never change the minimum or the maximum
        find_min[threadIdx.x]=input[0];
        find_max[threadIdx.x]=input[0];
    }
    __syncthreads();
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        if (threadIdx.x < length) //important: only the first 'length' threads compare in this round
        {
            if (find_min[threadIdx.x + length] < find_min[threadIdx.x])
                find_min[threadIdx.x] = find_min[threadIdx.x + length];
            if (find_max[threadIdx.x + length] > find_max[threadIdx.x])
                find_max[threadIdx.x] = find_max[threadIdx.x + length];
        }
        __syncthreads();
    }
    __syncthreads();
    //combine the per-block results (each block's find_min[0]/find_max[0]) into the global output
    if (threadIdx.x == 0) atomicMin(output, find_min[0]);
    if (threadIdx.x == 0) atomicMax(output + 1, find_max[0]);
}

typedef struct
{
    int min;
    int max;
}cpu_result_t;	
	
cpu_result_t _hawk_minmax_cpu(int *ptr, int count)
{
    int YZP_min = INT_MAX;
    int YZP_max = INT_MIN;
    for (int i = 0; i < count; i++)
    {
	YZP_min = min(YZP_min, ptr[i]);
	YZP_max = max(YZP_max, ptr[i]);
    }

    cpu_result_t r;
    {
	r.min = YZP_min;
	r.max = YZP_max;
    }		
    return r;
}

void _nanana_init(int *ptr, int count)
{
    uint32_t seed = (uint32_t)time(NULL); //make huan happy
    srand(seed);  //reseeding the random generator

    //filling the buffer with random data
    //for (int i = 0; i < count; i++) ptr[i] = (rand() << 3) ^ rand();
    for (int i = 0; i < count; i++) ptr[i] = rand();
}

double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}

int main()
{
    //**********************************
    fprintf(stderr, "nanana is filling the buffer with %d elements...\n", N);
    _nanana_init(source, N);

    //**********************************
    //Now we are going to kick start your kernel.
    cudaDeviceSynchronize(); //steady! ready! go!
    //Good luck & have fun!
    
    fprintf(stderr, "Running on GPU...\n");
    
double t0 = get_time();
    minmax_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
    KEN_CHECK(cudaGetLastError());      //checking for launch failures
    KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();

    int A0 = final_result[0];
    int A1 = final_result[1];
    fprintf(stderr, "GPU min: %d, max: %d\n", A0, A1);


    //**********************************
    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

double t2 = get_time();
    cpu_result_t B = _hawk_minmax_cpu(source, N);
double t3 = get_time();
    fprintf(stderr, "CPU min: %d, max: %d\n", B.min, B.max);

    //******The last judgement**********
    if (A0 == B.min && A1 == B.max)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
	exit(-1);
    }
    
    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}	
	


Matrix transpose

One principle to keep in mind: make the global-memory access pattern as contiguous as possible, i.e. let the x coordinate vary fastest, because a 2-D matrix is stored in memory as a 1-D row-major array (this is what makes the accesses coalesced). For shared memory the access order matters far less; in my tests the difference was more than 2x.
However, swapping the x/y order of the shared-memory accesses (share[threadIdx.x][threadIdx.y]) does not change how the blocks themselves are arranged; it only changes the element order inside each tile.
Also, the shared-memory tile only has to be at least as large as the block size, since blockDim.x is not affected by the dimensions of the shared-memory array.

#include<stdio.h>
#include<stdint.h>
#include<time.h>     //for time()
#include<stdlib.h>   //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#include<assert.h>
	
#define KEN_CHECK(r) \
{\
    cudaError_t rr = r;   \
    if (rr != cudaSuccess)\
    {\
        fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n",       \
		        cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
        exit(-1);\
    }\
}

#define M 3001 //three thousand and one nights
#define BLOCK_SIZE 16
__managed__ int shark[M][M];      //input matrix
__managed__ int gpu_shark_T[M][M];//GPU result
__managed__ int cpu_shark_T[M][M];//CPU result


__global__ void _ZHI_transpose(int A[M][M], int B[M][M])
{
    __shared__ int transpose[BLOCK_SIZE][BLOCK_SIZE]; //staging tile: lets both the read from A and the write to B be coalesced
	
    int thread_x = threadIdx.x + blockDim.x * blockIdx.x;
    int thread_y = threadIdx.y + blockDim.y * blockIdx.y;
    if (thread_x < M && thread_y < M)
    {
        transpose[threadIdx.y][threadIdx.x] = A[thread_y][thread_x]; //coalesced read: thread_x varies fastest along a row of A
        //B[thread_x][thread_y] = A[thread_y][thread_x];  //naive one-line version: the write to B would not be coalesced
    }

    __syncthreads();
	
    //swap the roles of blockIdx.x and blockIdx.y so that the write to B is coalesced as well
    int y2 = threadIdx.y + blockDim.x * blockIdx.x;
    int x2 = threadIdx.x + blockDim.y * blockIdx.y;
    if (y2 < M && x2 < M)
    {
        B[y2][x2] = transpose[threadIdx.x][threadIdx.y]; //consecutive threads write consecutive columns of one row of B
    }
    __syncthreads();
}

void _sparks_transpose_cpu(int A[M][M], int B[M][M])
{
    for (int j = 0; j < M; j++)
    {
	for (int i = 0; i < M; i++)
	{
	    B[i][j] = A[j][i];
	}
    }
}

void DDBDDH_init(int A[M][M])
{
    uint32_t seed = (uint32_t)time(NULL); //make huan happy
    srand(seed);  //reseeding the random generator

    //filling the matrix with random data
    for (int j = 0; j < M; j++)
    {
	for (int i = 0; i < M; i++)
	{
	    A[j][i] = rand();
	}
    }
}

double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}

int main()
{
    //**********************************
    fprintf(stderr, "DDBDDH is filling the %dx%d maxtrix with random data\n",
	            M, M);
    DDBDDH_init(shark);

    //**********************************
    //Now we are going to kick start your kernel.
    cudaDeviceSynchronize(); //steady! ready! go!
    //Good luck & have fun!
    
    fprintf(stderr, "Running on GPU...\n");
    
double t0 = get_time();
    int n = (M + BLOCK_SIZE - 1) / BLOCK_SIZE; //what the hell is this!
    dim3 grid_shape(n, n);
    dim3 block_shape(BLOCK_SIZE, BLOCK_SIZE);
    _ZHI_transpose<<<grid_shape, block_shape>>>(shark, gpu_shark_T);
    KEN_CHECK(cudaGetLastError());      //checking for launch failures
    KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
double t1 = get_time();

    //**********************************
    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

double t2 = get_time();
    _sparks_transpose_cpu(shark, cpu_shark_T);
double t3 = get_time();

    //******The last judgement**********
    for (int j = 0; j < M; j++)
    {
	for (int i = 0; i < M; i++)
	{
	    if (gpu_shark_T[j][i] != cpu_shark_T[j][i])
	    {
	        fprintf(stderr, "Test failed!\n");
	   	exit(-1);
	    }
	}
    }	
    fprintf(stderr, "Test Passed!\n");
    
    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}	
	
