#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
/* Prints the CUDA error message with its source location and aborts.
 * Invoked through the HANDLE_ERROR macro, which supplies file/line.
 * Fix: report to stderr (the file's own convention in InitCUDA) instead
 * of stdout, so errors are not lost when stdout is redirected. */
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Wraps a CUDA runtime call; aborts with file/line context on any failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Aborts with file/line context when a host allocation returns NULL.
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
printf("Host memory failed in %s at line %d\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); }}
/* Selects the first CUDA device with compute capability >= 1.0 and makes it
 * current. Prints the chosen device's name. Returns true on success, false
 * if the device query fails or no suitable device exists.
 * Fix: the cudaGetDeviceCount result was previously ignored — on failure
 * `count` is indeterminate and the subsequent loop is undefined. */
static bool InitCUDA()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
    {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    cudaDeviceProp prop = {0};
    for (i = 0; i < count; i++)
    {
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
        {
            if (prop.major >= 1)
            {
                printf("%s\n", prop.name);
                break;
            }
        }
    }
    if (i >= count)
    {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    // cudaSetDevice can fail (e.g. prohibited compute mode); report it.
    HANDLE_ERROR(cudaSetDevice(i));
    return true;
}
/* Fills an n-by-n matrix `a` (leading dimension lda, row-major) with random
 * values in roughly [0, 1]: a coarse term plus a tiny second term so entries
 * are not exact multiples of 1/RAND_MAX.
 * Fix: the original divisor `RAND_MAX * RAND_MAX` overflows int — undefined
 * behavior on platforms where RAND_MAX == INT_MAX. Compute it in double. */
static void matgen(float* a, int lda, int n)
{
    int i, j;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            a[i * lda + j] = (float)rand() / RAND_MAX
                           + (float)rand() / ((double)RAND_MAX * RAND_MAX);
        }
    }
}
/* Reference CPU matrix multiply: c = a * b for n-by-n row-major matrices.
 * lda/ldb/ldc are the leading dimensions (row strides in elements) of
 * a, b and c. Accumulates each dot product in double so the result serves
 * as a higher-precision reference for the GPU kernel. */
static void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < n; col++)
        {
            double acc = 0;
            for (int k = 0; k < n; k++)
            {
                acc += a[row * lda + k] * b[k * ldb + col];
            }
            c[row * ldc + col] = acc;
        }
    }
}
/* Compares matrix `a` against reference matrix `b` (both n-by-n, leading
 * dimensions lda/ldb) and prints the maximum absolute error, maximum
 * relative error, and average relative error.
 * Entries whose reference value is exactly 0 are skipped (relative error
 * is undefined there); note the average is still divided by n*n.
 * Fix: use fabsf for float operands — the original called fabs without
 * including <math.h>, and fabs forces a double round-trip. */
static void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
    float max_err = 0;
    float average_err = 0;
    float max_absolute_err = 0;
    int i, j;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            if (b[i * ldb + j] != 0)
            {
                float tmp = fabsf(a[i * lda + j] - b[i * ldb + j]);
                if (max_absolute_err < tmp)
                {
                    max_absolute_err = tmp;
                }
                float err = fabsf(tmp / b[i * ldb + j]);
                if (max_err < err)
                {
                    max_err = err;
                }
                average_err += err;
            }
        }
    }
    printf("Max absolute error: %g\nMax error: %g\nAverage error: %g\n",
           max_absolute_err, max_err, average_err / (n * n));
}
// Tiled matrix-multiply kernel using Kahan's summation formula for extra
// accuracy; the matrices are processed in BLOCK_SIZE x BLOCK_SIZE tiles.
// Contract: blockDim must be (BLOCK_SIZE, BLOCK_SIZE) and the grid must
// cover the n-by-n output in BLOCK_SIZE tiles. lda/ldb/ldc are element
// strides (pitch / sizeof(float) when allocated with cudaMallocPitch).
__global__ static void matMultCUDA(const float* dev_a, size_t lda, const float* dev_b, size_t ldb, float* dev_c, size_t ldc, int n)
{
__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
const int tidc = threadIdx.x;
const int tidr = threadIdx.y;
const int bidc = blockIdx.x * BLOCK_SIZE;
const int bidr = blockIdx.y * BLOCK_SIZE;
float results = 0;
float comp = 0;
int i, j;
// March along the shared dimension one BLOCK_SIZE-wide tile at a time.
for (j = 0; j < n; j += BLOCK_SIZE)
{
// Stage one tile of A into shared memory; out-of-range elements are
// zero-filled so the inner product below needs no bounds checks.
if (tidr + bidr < n && tidc + j < n)
{
matA[tidr][tidc] = dev_a[(tidr + bidr) * lda + tidc + j];
}
else
{
matA[tidr][tidc] = 0;
}
// Same for the corresponding tile of B.
if (tidr + j < n && tidc + bidc < n)
{
matB[tidr][tidc] = dev_b[(tidr + j) * ldb + tidc + bidc];
}
else
{
matB[tidr][tidc] = 0;
}
__syncthreads(); // both tiles fully loaded before any thread reads them
// Kahan (compensated) summation with the compensation kept negated:
// `comp` carries the accumulated rounding error of `results`.
for (i = 0; i < BLOCK_SIZE; i++)
{
float t;
comp -= matA[tidr][i] * matB[i][tidc];
t = results - comp;
comp = (t - results) + comp;
results = t;
}
__syncthreads(); // finish reading this tile before the next load overwrites it
}
// Guard the store: edge blocks have threads outside the n-by-n output.
if (tidr + bidr < n && tidc + bidc < n)
{
dev_c[(tidr + bidr) * ldc + tidc + bidc] = results;
}
}
// Host wrapper: device rows are allocated with cudaMallocPitch so they are
// automatically aligned, improving global-memory access efficiency.
// Uploads a and b, runs the tiled kernel, and copies the product into c.
// Returns total wall time in clock() ticks (allocations + copies + kernel);
// the kernel-only time is measured with CUDA events and printed.
// Fixes: query cudaGetLastError() after the launch (launches do not return
// errors), and destroy the events / check the frees (previously leaked).
static clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
    const int thread_num = BLOCK_SIZE;
    dim3 threads(thread_num, thread_num);
    // Ceil-divide so a partial tile at the edge is still covered.
    dim3 blocks((n + thread_num - 1) / thread_num, (n + thread_num - 1) / thread_num);
    float *dev_a, *dev_b, *dev_c;
    clock_t time;
    size_t pitch_a, pitch_b, pitch_c;
    cudaEvent_t start, stop;
    float elapsedTime;
    time = clock();
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_a, &pitch_a, sizeof(float)* n, n));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_b, &pitch_b, sizeof(float)* n, n));
    HANDLE_ERROR(cudaMallocPitch((void**)&dev_c, &pitch_c, sizeof(float)* n, n));
    HANDLE_ERROR(cudaMemcpy2D(dev_a, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy2D(dev_b, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaEventRecord(start, 0));
    // Pitches are converted from bytes to element strides for the kernel.
    matMultCUDA << <blocks, threads >> >(dev_a, pitch_a / sizeof(float), dev_b, pitch_b / sizeof(float), dev_c, pitch_c / sizeof(float), n);
    HANDLE_ERROR(cudaGetLastError()); // catch launch-configuration errors
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("核函数执行时间:%lfms\n", elapsedTime);
    HANDLE_ERROR(cudaMemcpy2D(c, sizeof(float)* ldc, dev_c, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));
    return clock() - time;
}
// Entry point: generates two random n-by-n matrices, multiplies them on the
// GPU (pinned host buffers, pitched device memory, Kahan-compensated tiled
// kernel), validates against a double-precision CPU reference, and reports
// total time and effective GFLOPS.
int main(int argc, char *argv[])
{
    const int n = 64 * 4 * 2;
    const size_t bytes = sizeof(float) * n * n;
    float *mat_a, *mat_b, *gpu_result, *cpu_result;
    // Pinned (page-locked) host memory speeds up host<->device transfers.
    HANDLE_ERROR(cudaHostAlloc((void**)&mat_a, bytes, cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&mat_b, bytes, cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&gpu_result, bytes, cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&cpu_result, bytes, cudaHostAllocDefault));
    srand(0); // fixed seed: reproducible matrices across runs
    matgen(mat_a, n, n);
    matgen(mat_b, n, n);
    printf("利用 Kahan's Summation Formula 来提高精确度\n");
    printf("显存地址自动对齐,可以提高访问显存的效率\n");
    printf("对矩阵进行分块\n");
    clock_t time = matmultCUDA(mat_a, n, mat_b, n, gpu_result, n, n);
    matmult(mat_a, n, mat_b, n, cpu_result, n, n);
    compare_mat(gpu_result, n, cpu_result, n, n);
    double sec = (double)time / CLOCKS_PER_SEC;
    // 2*n^3 floating-point operations in an n-by-n matrix multiply.
    printf("Time used: %.6fs (%.6lf GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
    cudaFreeHost(mat_a);
    cudaFreeHost(mat_b);
    cudaFreeHost(gpu_result);
    cudaFreeHost(cpu_result);
    // Remember to release the device before exiting.
    cudaDeviceReset();
    return 0;
}
// CUDA programming introductory example 24.