运行检测函数
常用的运行检验函数
__host__ __device__ const char* cudaGetErrorName ( cudaError_t error );
__host__ __device__ const char* cudaGetErrorString ( cudaError_t error );
错误代码解释
// cuBLAS API status codes, as declared in cublas_api.h. Every cublas* call
// returns one of these; meanings below follow the official cuBLAS reference.
typedef enum{
CUBLAS_STATUS_SUCCESS =0,          // operation completed successfully
CUBLAS_STATUS_NOT_INITIALIZED =1,  // library handle not initialized (cublasCreate missing/failed)
CUBLAS_STATUS_ALLOC_FAILED =3,     // internal resource allocation failed
CUBLAS_STATUS_INVALID_VALUE =7,    // an unsupported parameter value was passed
CUBLAS_STATUS_ARCH_MISMATCH =8,    // feature absent from this device architecture
CUBLAS_STATUS_MAPPING_ERROR =11,   // access to GPU memory space failed
CUBLAS_STATUS_EXECUTION_FAILED=13, // GPU program failed to launch or execute
CUBLAS_STATUS_INTERNAL_ERROR =14,  // an internal cuBLAS operation failed
CUBLAS_STATUS_NOT_SUPPORTED =15,   // requested functionality is not supported
CUBLAS_STATUS_LICENSE_ERROR =16    // licensing requirement not satisfied
} cublasStatus_t;
错误检测 可以将如下代码写入.cuh,方便在编程时候直接调用。
// Preprocessor directive: guarantees this header is included at most once
// per translation unit.
#pragma once
#include <stdio.h>
// CHECK(call): wrap a CUDA runtime API call and abort with a diagnostic
// (file, line, numeric code, readable text) when it does not return
// cudaSuccess. Diagnostics go to stderr so they are not lost or reordered
// when stdout is redirected. Usage: CHECK(cudaMalloc(&p, bytes));
#define CHECK(call)                                      \
do                                                       \
{                                                        \
    const cudaError_t error_code = call;                 \
    if (error_code != cudaSuccess)                       \
    {                                                    \
        fprintf(stderr, "CUDA Error:\n");                \
        fprintf(stderr, "    File:       %s\n",          \
                __FILE__);                               \
        fprintf(stderr, "    Line:       %d\n",          \
                __LINE__);                               \
        fprintf(stderr, "    Error code: %d\n",          \
                error_code);                             \
        fprintf(stderr, "    Error text: %s\n",          \
                cudaGetErrorString(error_code));         \
        exit(1);                                         \
    }                                                    \
} while (0)
事件
本质是一个GPU时间戳,这个时间戳是在用户指定的时间点上记录的。由于GPU本身支持记录时间戳,因此就避免了当使用CPU定时器来统计GPU执行时间时可能遇到的诸多问题。
使用事件的流程一般如下。
// create two events (GPU timestamps recorded on the default stream)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// record start event on the default stream
cudaEventRecord(start);
// execute kernel
kernel<<<grid, block>>>(arguments);
// record stop event on the default stream
cudaEventRecord(stop);
// wait until the stop event completes — cudaEventRecord and the kernel
// launch are asynchronous with respect to the host, so the host must
// block here before the elapsed time can be read reliably
cudaEventSynchronize(stop);
// calculate the elapsed time between two events (in milliseconds)
float time;
cudaEventElapsedTime(&time, start, stop);
// clean up the two events
cudaEventDestroy(start);
cudaEventDestroy(stop);
实验
用事件去记录程序运行的时间 对gpu并行加速运算有更深的理解
实验demo如下。
#include <stdio.h>
#include <math.h>
#include "error.cuh"
#define BLOCK_SIZE 16
// Naive GEMM kernel: c = a * b, where a is (m x n), b is (n x k) and
// c is (m x k); all matrices are row-major int arrays in global memory.
// One thread computes one element of c. Expected launch: a 2-D grid of
// 2-D blocks covering a k-wide by m-tall index space (x -> column of c,
// y -> row of c); threads falling outside the matrix exit early.
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;  // row of c
    const int q = blockIdx.x * blockDim.x + threadIdx.x;  // column of c
    if (r >= m || q >= k)
        return;  // guard the grid tail: grid rarely divides data evenly
    int acc = 0;
    for (int t = 0; t < n; ++t)
        acc += a[r * n + t] * b[t * k + q];
    c[r * k + q] = acc;
}
// CPU reference implementation of C = A * B for verifying the GPU kernel.
// h_a is (m x n), h_b is (n x k), h_result is (m x k); all row-major int
// arrays. Triple loop, O(m*n*k).
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            // accumulator is an int; was initialized with the double
            // literal 0.0 — harmless here, but the wrong literal type
            int tmp = 0;
            for (int h = 0; h < n; ++h)
            {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
// Demo driver: times a naive GPU matrix multiply with CUDA events and
// verifies the result against a CPU reference. C(m x k) = A(m x n) * B(n x k).
int main(int argc, char const *argv[])
{
    int m = 100;
    int n = 100;
    int k = 100;

    // Pinned host buffers: required for fast (and async-capable) transfers.
    int *h_a, *h_b, *h_c, *h_cc;
    CHECK(cudaMallocHost((void **) &h_a, sizeof(int)*m*n));
    CHECK(cudaMallocHost((void **) &h_b, sizeof(int)*n*k));
    CHECK(cudaMallocHost((void **) &h_c, sizeof(int)*m*k));
    CHECK(cudaMallocHost((void **) &h_cc, sizeof(int)*m*k));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Fill A and B with pseudo-random values (unseeded rand: same
    // sequence every run, which keeps the check deterministic).
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    int *d_a, *d_b, *d_c;
    CHECK(cudaMalloc((void **) &d_a, sizeof(int)*m*n));
    CHECK(cudaMalloc((void **) &d_b, sizeof(int)*n*k));
    CHECK(cudaMalloc((void **) &d_c, sizeof(int)*m*k));

    // copy matrix A and B from host to device memory
    CHECK(cudaMemcpy(d_a, h_a, sizeof(int)*m*n, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-div so the grid covers the
    // whole matrix even when BLOCK_SIZE does not divide m or k.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);  // flush the queue (helps on WDDM); return value intentionally unchecked
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // Kernel launches return no status themselves: catch bad launch
    // configurations here instead of letting them surface mysteriously later.
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Time = %g ms.\n", elapsed_time);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // Blocking copy: also synchronizes with the kernel on the default stream.
    CHECK(cudaMemcpy(h_c, d_c, (sizeof(int)*m*k), cudaMemcpyDeviceToHost));

    // CPU reference for verification.
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
    int ok = 1;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            // Integer arithmetic is exact on both host and device:
            // compare exactly rather than with a float tolerance.
            if (h_cc[i*k + j] != h_c[i*k + j])
            {
                ok = 0;
            }
        }
    }
    if(ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // free memory (including h_cc, which the original version leaked)
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFreeHost(h_b));
    CHECK(cudaFreeHost(h_c));
    CHECK(cudaFreeHost(h_cc));
    return 0;
}
实验结果:
==17463== NVPROF is profiling process 17463, command: ./d.out
==17463== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvs
Time = 0.88768 ms.
Pass!!!
==17463== Profiling application: ./d.out
==17463== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 90.25% 715.12us 1 715.12us 715.12us 715.12us gpu_matrix_mult(int*, int*, int*, int, int, int)
7.09% 56.150us 2 28.075us 23.657us 32.493us [CUDA memcpy HtoD]
2.67% 21.128us 1 21.128us 21.128us 21.128us [CUDA memcpy DtoH]
API calls: 98.54% 308.39ms 4 77.097ms 19.392us 308.32ms cudaMallocHost
0.46% 1.4286ms 3 476.19us 130.14us 1.1153ms cudaMemcpy
0.29% 920.02us 3 306.67us 41.184us 781.82us cudaFree
0.23% 709.91us 1 709.91us 709.91us 709.91us cudaEventSynchronize
0.19% 585.02us 4 146.25us 19.456us 449.05us cudaFreeHost
0.16% 486.62us 3 162.21us 16.543us 446.30us cudaMalloc
0.05% 151.65us 97 1.5630us 608ns 62.239us cuDeviceGetAttribute
0.04% 124.13us 1 124.13us 124.13us 124.13us cudaLaunchKernel
0.02% 58.527us 2 29.263us 24.832us 33.695us cudaEventRecord
0.01% 39.071us 2 19.535us 4.1600us 34.911us cudaEventCreate
0.01% 29.024us 1 29.024us 29.024us 29.024us cudaEventQuery
0.00% 12.064us 1 12.064us 12.064us 12.064us cuDeviceTotalMem
0.00% 10.079us 2 5.0390us 3.3280us 6.7510us cudaEventDestroy
0.00% 6.8800us 3 2.2930us 1.3120us 3.7760us cuDeviceGetCount
0.00% 6.4000us 1 6.4000us 6.4000us 6.4000us cudaEventElapsedTime
0.00% 3.4240us 2 1.7120us 1.1200us 2.3040us cuDeviceGet
0.00% 2.4320us 1 2.4320us 2.4320us 2.4320us cuDeviceGetName
0.00% 928ns 1 928ns 928ns 928ns cuDeviceGetUuid