1. 实践点
-
CUDA 核函数调用方式:kernel_name<<<grid, block>>>(参数列表);
-
Grid和Block的内部三维索引
blockIdx.x, blockIdx.y, blockIdx.z;
threadIdx.x, threadIdx.y, threadIdx.z;
-
核函数的定义:使用 __global__ 限定符声明,返回类型必须为 void,例如 __global__ void helloFromGPU() { ... }
2. 代码
-
common.h
#include <stddef.h>
#include <sys/time.h>
#ifndef _COMMON_H
#define _COMMON_H
/*
 * CHECK(call): evaluate a CUDA runtime call exactly once. If the result is
 * not cudaSuccess, print the source file, line, numeric error code and the
 * cudaGetErrorString() text to stderr, then terminate with exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` expands to
 * a single statement and stays correct inside an unbraced if/else. Call sites
 * must end with a semicolon. The local uses a macro-private name so that an
 * argument expression mentioning a variable called `error` is not shadowed.
 */
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t check_status_ = (call);                            \
        if (check_status_ != cudaSuccess)                                    \
        {                                                                    \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code: %d, reason: %s\n", (int)check_status_,    \
                    cudaGetErrorString(check_status_));                      \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CHECK_CUBLAS(call): evaluate a cuBLAS call exactly once. If the result is
 * not CUBLAS_STATUS_SUCCESS, print the numeric status with the source file
 * and line to stderr, then terminate with exit(1).
 *
 * Wrapped in do { ... } while (0) so `CHECK_CUBLAS(x);` is one statement
 * (safe in an unbraced if/else); call sites must end with a semicolon. The
 * local has a macro-private name so a caller variable named `err` used in
 * the argument is not shadowed.
 */
#define CHECK_CUBLAS(call)                                                   \
    do {                                                                     \
        const cublasStatus_t cublas_status_ = (call);                        \
        if (cublas_status_ != CUBLAS_STATUS_SUCCESS)                         \
        {                                                                    \
            fprintf(stderr, "Got CUBLAS error %d at %s:%d\n",                \
                    (int)cublas_status_, __FILE__, __LINE__);                \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CHECK_CURAND(call): evaluate a cuRAND call exactly once. If the result is
 * not CURAND_STATUS_SUCCESS, print the numeric status with the source file
 * and line to stderr, then terminate with exit(1).
 *
 * Wrapped in do { ... } while (0) so `CHECK_CURAND(x);` is one statement
 * (safe in an unbraced if/else); call sites must end with a semicolon. The
 * local has a macro-private name so a caller variable named `err` used in
 * the argument is not shadowed.
 */
#define CHECK_CURAND(call)                                                   \
    do {                                                                     \
        const curandStatus_t curand_status_ = (call);                        \
        if (curand_status_ != CURAND_STATUS_SUCCESS)                         \
        {                                                                    \
            fprintf(stderr, "Got CURAND error %d at %s:%d\n",                \
                    (int)curand_status_, __FILE__, __LINE__);                \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CHECK_CUFFT(call): evaluate a cuFFT call exactly once. If the result is
 * not CUFFT_SUCCESS, print the numeric result code with the source file and
 * line to stderr, then terminate with exit(1).
 *
 * Wrapped in do { ... } while (0) so `CHECK_CUFFT(x);` is one statement
 * (safe in an unbraced if/else); call sites must end with a semicolon. The
 * local has a macro-private name so a caller variable named `err` used in
 * the argument is not shadowed.
 */
#define CHECK_CUFFT(call)                                                    \
    do {                                                                     \
        const cufftResult cufft_status_ = (call);                            \
        if (cufft_status_ != CUFFT_SUCCESS)                                  \
        {                                                                    \
            fprintf(stderr, "Got CUFFT error %d at %s:%d\n",                 \
                    (int)cufft_status_, __FILE__, __LINE__);                 \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CHECK_CUSPARSE(call): evaluate a cuSPARSE call exactly once. If the result
 * is not CUSPARSE_STATUS_SUCCESS, print the numeric status with the source
 * file and line to stderr. It then also queries cudaGetLastError() and, if a
 * CUDA runtime error is pending, prints its description before exit(1).
 *
 * Wrapped in do { ... } while (0) so `CHECK_CUSPARSE(x);` is one statement
 * (safe in an unbraced if/else); call sites must end with a semicolon. The
 * locals have macro-private names so caller variables named `err` or
 * `cuda_err` used in the argument are not shadowed.
 */
#define CHECK_CUSPARSE(call)                                                 \
    do {                                                                     \
        const cusparseStatus_t cusparse_status_ = (call);                    \
        if (cusparse_status_ != CUSPARSE_STATUS_SUCCESS)                     \
        {                                                                    \
            fprintf(stderr, "Got error %d at %s:%d\n",                       \
                    (int)cusparse_status_, __FILE__, __LINE__);              \
            /* Report an accompanying CUDA runtime error, if any. */         \
            const cudaError_t cusparse_cuda_err_ = cudaGetLastError();       \
            if (cusparse_cuda_err_ != cudaSuccess)                           \
            {                                                                \
                fprintf(stderr, "  CUDA error \"%s\" also detected\n",       \
                        cudaGetErrorString(cusparse_cuda_err_));             \
            }                                                                \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * Return the current time of day, as reported by gettimeofday(), expressed
 * in seconds as a double (whole seconds plus the microsecond fraction).
 */
inline double seconds()
{
    struct timeval tp;
    /* The timezone argument is obsolete, so pass NULL rather than a struct
     * that is never read. */
    gettimeofday(&tp, NULL);
    /* tv_sec is whole seconds; tv_usec is the microsecond remainder. */
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
-
hello.cu
#include "../common/common.h"
#include <stdio.h>

/*
 * A simple introduction to programming in CUDA. This program prints a
 * greeting from the CPU and then "Hello World from GPU ..." once from each
 * of the 20 CUDA threads it launches on the GPU.
 */

/*
 * Kernel: each thread prints its own block index and thread index.
 * It takes no arguments and touches no memory, so any grid/block shape is
 * valid. Device-side printf is supplied by the CUDA runtime and is intended
 * for debugging; its output is buffered and shown after the host synchronizes.
 */
__global__ void helloFromGPU()
{
    printf("Hello World from GPU block(%d, %d, %d) thread (%d, %d, %d)!\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z);
}

int main(int argc, char **argv)
{
    printf("Hello World from CPU!\n");

    // One block of 20 threads: grid (1, 1, 1), block (20, 1, 1).
    helloFromGPU<<<1, 20>>>();
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    CHECK(cudaGetLastError());

    // Block until the kernel has finished so its printf output is flushed
    // before the process exits. (cudaDeviceReset() was used here previously.)
    CHECK(cudaDeviceSynchronize());
    return 0;
}
-
Makefile
# Build every CUDA sample listed in APPS with nvcc.
# NVCC and ARCH may be overridden on the command line, e.g. `make ARCH=sm_70`.
NVCC ?= nvcc
ARCH ?= sm_60
APPS=hello

# `all` and `clean` name actions, not files.
.PHONY: all clean

all: ${APPS}

# Pattern rule: build executable `foo` from `foo.cu`.
# Recipe lines must start with a TAB character.
%: %.cu
	${NVCC} -O2 -arch=${ARCH} -o $@ $<

clean:
	rm -f ${APPS}
3. QA:
-
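核函数是在GPU设备上执行的,为什么能执行printf函数,其实现原理是什么样的?
答:CUDA 运行时提供了设备端的 printf 实现——各线程把格式串和参数写入设备内存中一个固定大小的环形缓冲区,主机端在同步点(如 cudaDeviceSynchronize()、阻塞式内存拷贝、下一次核函数启动等)取回缓冲区内容并完成格式化输出;因此主机若不同步就退出,GPU 端的输出可能丢失。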