1. Use a single thread block with 1024 threads to add two integer vectors of length 512.
Then make the following change:
- use a single thread block with 1025 threads to add two arrays of length 1025, and observe the output (a sketch of that launch follows; the full listing below instead uses multiple 256-thread blocks, because a block may hold at most 1024 threads).
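A minimal sketch of the single-block experiment, reusing the add kernel and the d_a/d_b/d_c buffers from the listing below: current GPUs allow at most 1024 threads per block, so the 1025-thread launch fails, and cudaGetLastError makes the failure visible.
    // hypothetical single-block launch, to observe the failure mode
    add<<<1, 1025>>>(d_a, d_b, d_c);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("kernel launch failed: %s\n", cudaGetErrorString(err)); // expect "invalid configuration argument"
    // since the kernel never ran, the result array keeps whatever the device memory held before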
#include <stdio.h>
#define N 1025
__global__ void add(int *a, int *b, int *c) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
int main() {
int *a, *b, *c;
int *d_a, *d_b, *d_c;
int size = N * sizeof(int);
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize; // number of thread blocks needed
// allocate memory
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
// initialize the arrays
for (int i = 0; i < N; ++i) {
a[i] = i;
b[i] = i;
}
// copy data from host memory to device memory
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// launch the CUDA kernel
add<<<numBlocks, blockSize>>>(d_a, d_b, d_c);
// copy the result from device memory back to host memory
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
// print the results
for (int i = 0; i < N; ++i) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
// free memory
free(a);
free(b);
free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
2. Using a 1D grid of 1D thread blocks with 256 threads per block, add two 1D vectors of length 1 million and of length 10 million, and compare the serial and parallel computation times (ignoring the time spent copying data between CPU and GPU). A vector-add sketch is given right below; the full listing after it is the 1D convolution program.
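For the vector-add task itself, a minimal sketch (vecAdd and the buffer names are placeholders; CUDA events time only the kernel, as the exercise requires; pass 10000000 on the command line for the 10M case):

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

__global__ void vecAdd(const float *a, const float *b, float *c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}

double seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}

int main(int argc, char **argv)
{
    int n = (argc > 1) ? atoi(argv[1]) : 1000000;
    size_t bytes = n * sizeof(float);
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c = (float *)malloc(bytes);
    for (int i = 0; i < n; i++) { h_a[i] = i; h_b[i] = 2.0f * i; }
    // serial reference, timed on the CPU
    double t0 = seconds();
    for (int i = 0; i < n; i++) h_c[i] = h_a[i] + h_b[i];
    printf("CPU serial time: %f ms\n", (seconds() - t0) * 1000.0);
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, bytes);
    cudaMalloc((void **)&d_b, bytes);
    cudaMalloc((void **)&d_c, bytes);
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
    // time only the kernel with events, ignoring the copies
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU kernel time: %f ms\n", ms);
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(h_a); free(h_b); free(h_c);
    return 0;
}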
/*
CUDA implementation of a 1D convolution smoothing filter, with result
verification and performance comparison against the serial version
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
//Read the pre-filter (noisy) data from a file as the input array
void readInputFile(float *in, int size)
{
FILE *fp = fopen("noise.txt", "r");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fscanf(fp, "%f", &in[i]);
}
fclose(fp);
printf("读取输入文件noise.txt成功!\n");
}
//Write the filtered data to a file
void writeOutputFile(float *out, int size)
{
FILE *fp = fopen("conv_result.txt" , "w");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fprintf(fp, "%f\n", out[i]);
}
fclose(fp);
printf("写入输出文件conv_result.txt成功!\n");
}
//CPU wall-clock time, for timing the serial implementation
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
//Serial 1D convolution
void convolution1D_host(float *N, float *P, int width, float *M, int filterSize)
{
for (int i = 0; i < width; i++)
{
float tmpP = 0;
int startPos = i - (filterSize / 2);
for (int j = 0; j < filterSize; j++)
{
int offset = startPos + j;
if (offset >= 0 && offset < width)
{
tmpP += N[offset] * M[j];
}
}
P[i] = tmpP;
}
}
//Key part: CUDA 1D convolution kernel
__global__ void convolution1D_kernel(float *N , float *P , int width , float *M , int filterSize)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
float tmpP = 0;
int startPos = id - (filterSize / 2);
for(int i = 0; i < filterSize; i++)
{
int offset = startPos + i;
if(offset >= 0 && offset < width)
{
tmpP += N[offset] * M[i];
}
}
//guard the write: the last block may have threads with id >= width
if(id < width)
P[id] = tmpP;
}
int main()
{
//1.Set the input/output array size and the filter size; allocate host memory for the input array, the filter, and the output array
int width = 1000000;
int filterSize = 5;
float *hostN = (float *)malloc(sizeof(float) * width);
float *hostM = (float *)malloc(sizeof(float) * filterSize);
float *hostP = (float *)malloc(sizeof(float) * width);
float *hostCPUTest = (float *)malloc(sizeof(float) * width);
//2.Prepare the input data on the CPU: read the input array from the noise file, and initialize the filter as a smoothing filter
readInputFile(hostN, width);
for(int i = 0; i < filterSize; i++)
{
hostM[i] = 1.0 / (float)filterSize;
}
//3.Serial 1D convolution, with timing
double cpuStart = cpuSecond();
convolution1D_host(hostN, hostCPUTest, width, hostM, filterSize);
double cpuTime = (cpuSecond() - cpuStart) * 1000;
printf("CPU serial convolution time: %f ms\n", cpuTime);
//4.Allocate device memory for the input array, the filter, and the output array
float *deviceN , *deviceM , *deviceP;
cudaMalloc((void **)&deviceN , sizeof(float) * width);
cudaMalloc((void **)&deviceM , sizeof(float) * filterSize);
cudaMalloc((void **)&deviceP , sizeof(float) * width);
//5.Copy the input array and the filter from host to device
cudaMemcpy(deviceN , hostN , sizeof(float) * width , cudaMemcpyHostToDevice);
cudaMemcpy(deviceM , hostM , sizeof(float) * filterSize , cudaMemcpyHostToDevice);
//6.Set the grid and block dimensions
int blockSize = 512;
dim3 dimBlock(blockSize , 1 , 1);
dim3 dimGrid((width + blockSize - 1)/blockSize , 1 , 1);
//7.Start CUDA timing
cudaEvent_t start, stop;
float gpuTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//8.Launch the kernel
convolution1D_kernel<<<dimGrid , dimBlock>>>(deviceN , deviceP , width , deviceM , filterSize);
//9.Stop timing and compute the speedup
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
printf("GPU parallel convolution time: %f ms\n" , gpuTime);
printf("Speedup: %f x\n" , cpuTime / gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//10.Copy the output from device to host
cudaMemcpy(hostP , deviceP , sizeof(float) * width , cudaMemcpyDeviceToHost);
//11.Free device memory
cudaFree(deviceN);
cudaFree(deviceM);
cudaFree(deviceP);
//12.Verify that the serial and parallel results match
for(int i = 0; i < width; i++)
{
if(fabs(hostCPUTest[i] - hostP[i]) > 1e-5)
{
printf("Mismatch at %d: CPU %f , GPU %f\n", i , hostCPUTest[i] , hostP[i]);
exit(1);
}
}
//13.Write the result to the output file
writeOutputFile(hostP, width);
//14.Free host memory
free(hostN);
free(hostM);
free(hostP);
free(hostCPUTest);
return 0;
}
3. Given a convolution kernel of length 7 (a smoothing filter: every element is 1/7), use a 1D grid of 1D thread blocks to convolve a 1D array of length 1 million. Also complete:
- verify correctness by comparing the parallel result against the serial result;
- compare the serial and parallel computation times (for the parallel version, ignore the host-device copy time and count only the kernel execution time).
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#define N 1000000 // array length: 1 million
__global__ void convolution(float *input, float *output, int n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
float sum = 0.0f;
// smoothing filter with kernel length 7
for (int i = -3; i <= 3; ++i) {
int index = tid + i;
if (index >= 0 && index < n) {
sum += input[index];
}
}
output[tid] = sum / 7.0f; // average
}
}
void convolutionCPU(float *input, float *output, int n) {
// smoothing filter with kernel length 7
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = -3; j <= 3; ++j) {
int index = i + j;
if (index >= 0 && index < n) {
sum += input[index];
}
}
output[i] = sum / 7.0f; // average
}
}
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec / 1000000.0;
}
int main() {
float *input, *output_cpu, *output_gpu;
float *d_input, *d_output;
int size = N * sizeof(float);
// allocate memory
cudaMalloc((void **)&d_input, size);
cudaMalloc((void **)&d_output, size);
input = (float *)malloc(size);
output_cpu = (float *)malloc(size);
output_gpu = (float *)malloc(size);
// initialize the array
for (int i = 0; i < N; ++i) {
input[i] = i;
}
// copy data from host memory to device memory
cudaMemcpy(d_input, input, size, cudaMemcpyHostToDevice);
// launch the parallel computation
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
double startTime = getCurrentTime();
convolution<<<numBlocks, blockSize>>>(d_input, d_output, N);
cudaDeviceSynchronize();
double endTime = getCurrentTime();
printf("Parallel execution time: %.6f seconds\n", endTime - startTime);
// copy the result from device memory back to host memory
cudaMemcpy(output_gpu, d_output, size, cudaMemcpyDeviceToHost);
// serial computation
startTime = getCurrentTime();
convolutionCPU(input, output_cpu, N);
endTime = getCurrentTime();
printf("Serial execution time: %.6f seconds\n", endTime - startTime);
// verify the results (relative tolerance: with inputs up to 1e6, float rounding makes exact equality too strict)
int mismatch = 0;
for (int i = 0; i < N; ++i) {
if (fabsf(output_cpu[i] - output_gpu[i]) > 1e-4f * fabsf(output_cpu[i]) + 1e-5f) {
printf("Mismatch at %d: CPU %.6f, GPU %.6f\n", i, output_cpu[i], output_gpu[i]);
mismatch = 1;
break;
}
}
if (!mismatch) printf("CPU and GPU results match.\n");
// free memory
free(input);
free(output_cpu);
free(output_gpu);
cudaFree(d_input);
cudaFree(d_output);
return 0;
}
4. Given a convolution kernel of length 7 (a smoothing filter: every element is 1/7), use a 1D grid of 1D thread blocks to convolve a 1D array of length 10 million. Run the following experiments:
1. optimization with constant memory;
2. optimization with shared memory (a sketch of this variant is given right after this list);
3. optimization with constant memory plus shared memory.
Also complete:
- verify correctness by comparing the parallel result against the serial result;
- compare the serial and parallel computation times.
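The two listings below cover experiments 1 and 3. For experiment 2, a minimal sketch of a shared-memory-only kernel (convolution1D_sharedOnly is a hypothetical name; tileSize and filterSize are the compile-time constants from the combined listing, the block size must equal tileSize, and the filter M stays in ordinary global memory, copied there with cudaMemcpy instead of cudaMemcpyToSymbol):

__global__ void convolution1D_sharedOnly(float *N, float *P, int width, float *M)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    // shared tile: blockDim.x center elements plus a halo of radius on each side
    __shared__ float N_ds[tileSize + filterSize - 1];
    int radius = filterSize / 2;
    // left halo: the last `radius` threads load the elements just before this tile
    if (threadIdx.x >= blockDim.x - radius) {
        int left = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
        N_ds[threadIdx.x - (blockDim.x - radius)] = (left < 0) ? 0 : N[left];
    }
    // right halo: the first `radius` threads load the elements just after this tile
    if (threadIdx.x < radius) {
        int right = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
        N_ds[threadIdx.x + blockDim.x + radius] = (right >= width) ? 0 : N[right];
    }
    // center of the tile (zero-padded past the end of the array)
    N_ds[threadIdx.x + radius] = (id < width) ? N[id] : 0;
    __syncthreads();
    float tmpP = 0;
    for (int i = 0; i < filterSize; i++)
        tmpP += N_ds[threadIdx.x + i] * M[i];   // filter read from global memory
    if (id < width)
        P[id] = tmpP;
}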
/*
CUDA implementation: accelerate the 1D convolution smoothing filter with
constant memory, and compare performance against the baseline version
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
//Define the filter length and declare constant memory for the filter
const int filterSize = 7;
__constant__ float deviceM[filterSize];
//Read the pre-filter (noisy) data from a file as the input array
void readInputFile(float *in, int size)
{
FILE *fp = fopen("noise.txt", "r");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fscanf(fp, "%f", &in[i]);
}
fclose(fp);
printf("读取输入文件noise.txt成功!\n");
}
//Write the filtered data to a file
void writeOutputFile(float *out, int size)
{
FILE *fp = fopen("conv_result.txt" , "w");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fprintf(fp, "%f\n", out[i]);
}
fclose(fp);
printf("写入输出文件conv_result.txt成功!\n");
}
//CPU wall-clock time, for timing the serial implementation
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
//Serial 1D convolution
void convolution1D_host(float *N, float *P, int width, float *M)
{
for (int i = 0; i < width; i++)
{
float tmpP = 0;
int startPos = i - (filterSize / 2);
for (int j = 0; j < filterSize; j++)
{
int offset = startPos + j;
if (offset >= 0 && offset < width)
{
tmpP += N[offset] * M[j];
}
}
P[i] = tmpP;
}
}
//Key part: CUDA 1D convolution kernel, filter in constant memory
__global__ void convolution1D_kernel(float *N , float *P , int width)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
float tmpP = 0;
int startPos = id - (filterSize / 2);
for(int i = 0; i < filterSize; i++)
{
int offset = startPos + i;
if(offset >= 0 && offset < width)
{
tmpP += N[offset] * deviceM[i];
}
}
//guard the write: the last block may have threads with id >= width
if(id < width)
P[id] = tmpP;
}
int main()
{
//1.Set the input/output array size; allocate host memory for the input array, the filter, and the output array
int width = 10000000;
float *hostN = (float *)malloc(sizeof(float) * width);
float *hostM = (float *)malloc(sizeof(float) * filterSize);
float *hostP = (float *)malloc(sizeof(float) * width);
float *hostCPUTest = (float *)malloc(sizeof(float) * width);
//2.Prepare the input data on the CPU: read the input array from the noise file, and initialize the filter as a smoothing filter
readInputFile(hostN, width);
for(int i = 0; i < filterSize; i++)
{
hostM[i] = 1.0 / (float)filterSize;
}
//3.Serial 1D convolution, with timing
double cpuStart = cpuSecond();
convolution1D_host(hostN, hostCPUTest, width, hostM);
double cpuTime = (cpuSecond() - cpuStart) * 1000;
printf("CPU serial convolution time: %f ms\n", cpuTime);
//4.Allocate device memory for the input and output arrays (the filter lives in constant memory)
float *deviceN , *deviceP;
cudaMalloc((void **)&deviceN , sizeof(float) * width);
cudaMalloc((void **)&deviceP , sizeof(float) * width);
//5.Copy the input array to the device and the filter to constant memory
cudaMemcpy(deviceN , hostN , sizeof(float) * width , cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deviceM , hostM , sizeof(float) * filterSize);
//6.Set the grid and block dimensions
int blockSize = 512;
dim3 dimBlock(blockSize , 1 , 1);
dim3 dimGrid((width + blockSize - 1)/blockSize , 1 , 1);
//7.Start CUDA timing
cudaEvent_t start, stop;
float gpuTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//8.Launch the kernel
convolution1D_kernel<<<dimGrid , dimBlock>>>(deviceN , deviceP , width);
//9.Stop timing and compute the speedup
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
printf("GPU parallel convolution time: %f ms\n" , gpuTime);
printf("Speedup: %f x\n" , cpuTime / gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//10.Copy the output from device to host
cudaMemcpy(hostP , deviceP , sizeof(float) * width , cudaMemcpyDeviceToHost);
//11.Free device memory (deviceM is a __constant__ symbol and must not be cudaFree'd)
cudaFree(deviceN);
cudaFree(deviceP);
//12.Verify that the serial and parallel results match
for(int i = 0; i < width; i++)
{
if(fabs(hostCPUTest[i] - hostP[i]) > 1e-5)
{
printf("Mismatch at %d: CPU %f , GPU %f\n", i , hostCPUTest[i] , hostP[i]);
exit(1);
}
}
//13.Write the result to the output file
writeOutputFile(hostP, width);
//14.Free host memory
free(hostN);
free(hostM);
free(hostP);
free(hostCPUTest);
return 0;
}
/*
CUDA implementation: accelerate the 1D convolution smoothing filter with
constant memory plus shared memory, and compare performance against the
baseline version
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
//Define the filter length and declare constant memory for the filter
const int filterSize = 7;
__constant__ float deviceM[filterSize];
//Define the tile size, equal to the thread-block size
const int tileSize = 512;
//Read the pre-filter (noisy) data from a file as the input array
void readInputFile(float *in, int size)
{
FILE *fp = fopen("noise.txt", "r");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fscanf(fp, "%f", &in[i]);
}
fclose(fp);
printf("读取输入文件noise.txt成功!\n");
}
//Write the filtered data to a file
void writeOutputFile(float *out, int size)
{
FILE *fp = fopen("conv_result.txt" , "w");
if (fp == NULL)
{
printf("open file failed!\n");
exit(0);
}
for (int i = 0; i < size; i++)
{
fprintf(fp, "%f\n", out[i]);
}
fclose(fp);
printf("写入输出文件conv_result.txt成功!\n");
}
//CPU wall-clock time, for timing the serial implementation
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
//Serial 1D convolution
void convolution1D_host(float *N, float *P, int width, float *M)
{
for (int i = 0; i < width; i++)
{
float tmpP = 0;
int startPos = i - (filterSize / 2);
for (int j = 0; j < filterSize; j++)
{
int offset = startPos + j;
if (offset >= 0 && offset < width)
{
tmpP += N[offset] * M[j];
}
}
P[i] = tmpP;
}
}
//Key part: CUDA 1D convolution kernel, filter in constant memory and
//the input staged through a shared-memory tile
__global__ void convolution1D_kernel(float *N , float *P , int width)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//shared tile: blockDim.x center elements plus a halo of radius on each side
__shared__ float N_ds[tileSize + filterSize - 1];
int radius = filterSize / 2;
//left halo: the last `radius` threads load the elements just before this tile
if(threadIdx.x >= (blockDim.x - radius))
{
int halo_index_left = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
if(halo_index_left < 0)
N_ds[threadIdx.x - (blockDim.x - radius)] = 0;
else
N_ds[threadIdx.x - (blockDim.x - radius)] = N[halo_index_left];
}
//right halo: the first `radius` threads load the elements just after this tile
if(threadIdx.x < radius)
{
int halo_index_right = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
if(halo_index_right >= width)
N_ds[threadIdx.x + blockDim.x + radius] = 0;
else
N_ds[threadIdx.x + blockDim.x + radius] = N[halo_index_right];
}
//center of the tile (zero-padded past the end of the array)
if(id < width)
N_ds[threadIdx.x + radius] = N[id];
else
N_ds[threadIdx.x + radius] = 0;
__syncthreads();
float tmpP = 0;
for(int i = 0; i < filterSize; i++)
{
tmpP += N_ds[threadIdx.x + i] * deviceM[i];
}
//guard the write: the last block may have threads with id >= width
if(id < width)
P[id] = tmpP;
}
int main()
{
//1.Set the input/output array size; allocate host memory for the input array, the filter, and the output array
int width = 10000000;
float *hostN = (float *)malloc(sizeof(float) * width);
float *hostM = (float *)malloc(sizeof(float) * filterSize);
float *hostP = (float *)malloc(sizeof(float) * width);
float *hostCPUTest = (float *)malloc(sizeof(float) * width);
//2.Prepare the input data on the CPU: read the input array from the noise file, and initialize the filter as a smoothing filter
readInputFile(hostN, width);
for(int i = 0; i < filterSize; i++)
{
hostM[i] = 1.0 / (float)filterSize;
}
//3.Serial 1D convolution, with timing
double cpuStart = cpuSecond();
convolution1D_host(hostN, hostCPUTest, width, hostM);
double cpuTime = (cpuSecond() - cpuStart) * 1000;
printf("CPU serial convolution time: %f ms\n", cpuTime);
//4.Allocate device memory for the input and output arrays (the filter lives in constant memory)
float *deviceN , *deviceP;
cudaMalloc((void **)&deviceN , sizeof(float) * width);
cudaMalloc((void **)&deviceP , sizeof(float) * width);
//5.Copy the input array to the device and the filter to constant memory
cudaMemcpy(deviceN , hostN , sizeof(float) * width , cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deviceM , hostM , sizeof(float) * filterSize);
//6.Set the grid and block dimensions
int blockSize = 512;
dim3 dimBlock(blockSize , 1 , 1);
dim3 dimGrid((width + blockSize - 1)/blockSize , 1 , 1);
//7.Start CUDA timing
cudaEvent_t start, stop;
float gpuTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//8.Launch the kernel
convolution1D_kernel<<<dimGrid , dimBlock>>>(deviceN , deviceP , width);
//9.Stop timing and compute the speedup
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
printf("GPU parallel convolution time: %f ms\n" , gpuTime);
printf("Speedup: %f x\n" , cpuTime / gpuTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//10.Copy the output from device to host
cudaMemcpy(hostP , deviceP , sizeof(float) * width , cudaMemcpyDeviceToHost);
//11.Free device memory (deviceM is a __constant__ symbol and must not be cudaFree'd)
cudaFree(deviceN);
cudaFree(deviceP);
//12.Verify that the serial and parallel results match
for(int i = 0; i < width; i++)
{
if(fabs(hostCPUTest[i] - hostP[i]) > 1e-5)
{
printf("Mismatch at %d: CPU %f , GPU %f\n", i , hostCPUTest[i] , hostP[i]);
exit(1);
}
}
//13.Write the result to the output file
writeOutputFile(hostP, width);
//14.Free host memory
free(hostN);
free(hostM);
free(hostP);
free(hostCPUTest);
return 0;
}
5. Compare the performance of parallel 2D matrix addition under different thread-block sizes (a sketch that sweeps several block sizes in one run is given before the listing). Also complete:
- verify correctness by comparing the parallel result against the serial result;
- compare the serial and parallel computation times (for the parallel version, ignore the host-device copy time and count only the kernel execution time).
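The listing below times a single block configuration per run, taken from argv. A hedged alternative that sweeps several candidate block sizes in one run, reusing sumMatrix_kernel, the device buffers, and the start/stop events from that listing:

    int dims[][2] = {{32, 32}, {32, 16}, {16, 16}, {16, 32}, {64, 8}};
    for (int k = 0; k < 5; k++) {
        dim3 block(dims[k][0], dims[k][1]);
        dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
        cudaEventRecord(start);
        sumMatrix_kernel<<<grid, block>>>(deviceA, deviceB, deviceC, nx, ny);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms;
        cudaEventElapsedTime(&ms, start, stop);
        printf("block (%2d,%2d): %f ms\n", block.x, block.y, ms);
    }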
/*
CUDA implementation: compare the performance of parallel 2D matrix addition
under different thread-block sizes
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
//Initialize matrix data with random numbers (note: seeding inside this function means two calls within the same second produce identical sequences)
void initialData(float* in,int size)
{
time_t t;
srand((unsigned)time(&t));
for(int i = 0; i < size; i++)
{
in[i] = (float)(rand() & 0xffff) / 1000.0f;
}
}
//CPU wall-clock time, for timing the serial implementation
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}
//CPU implementation of 2D matrix addition
void sumMatrix2D_CPU(float *a,float *b,float *c,int nx,int ny)
{
for(int j = 0; j < ny; j++)
{
for(int i = 0; i < nx; i++)
{
int idx = i + j * nx;
c[idx] = a[idx]+b[idx];
}
}
}
//Key part: CUDA 2D matrix addition kernel
__global__ void sumMatrix_kernel(float *a , float *b , float *c , int nx , int ny)
{
int ix = threadIdx.x + blockDim.x * blockIdx.x;
int iy = threadIdx.y + blockDim.y * blockIdx.y;
int idx = ix + iy * nx;
if(ix < nx && iy < ny)
{
c[idx] = a[idx] + b[idx];
}
}
int main(int argc,char** argv)
{
//1.Set the matrix size; allocate host memory for the input matrices and the result matrices
int nx = 1<<14, ny = 1<<14;
int width = nx * ny;
float *hostA = (float *)malloc(sizeof(float) * width);
float *hostB = (float *)malloc(sizeof(float) * width);
float *hostC = (float *)malloc(sizeof(float) * width);
float *hostCPUTest = (float *)malloc(sizeof(float) * width);
//2.Initialize the input data on the CPU with random numbers
initialData(hostA , width);
initialData(hostB , width);
//3.Serial 2D matrix addition, with timing
double cpuStart = cpuSecond();
sumMatrix2D_CPU(hostA, hostB, hostCPUTest, nx, ny);
double cpuTime = (cpuSecond() - cpuStart) * 1000;
printf("CPU serial matrix addition time: %f ms\n", cpuTime);
//4.Allocate device memory for the input matrices and the result matrix
float *deviceA, *deviceB, *deviceC;
cudaMalloc((void **)&deviceA, sizeof(float) * width);
cudaMalloc((void **)&deviceB, sizeof(float) * width);
cudaMalloc((void **)&deviceC, sizeof(float) * width);
//5.Copy the input matrices from host to device
cudaMemcpy(deviceA, hostA, sizeof(float) * width, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, sizeof(float) * width, cudaMemcpyHostToDevice);
//6.Set the grid and block dimensions (the block size can be given on the command line)
int dimx = 32 , dimy = 32;
if(argc > 2)
{
dimx = atoi(argv[1]);
dimy = atoi(argv[2]);
}
dim3 dimBlock(dimx , dimy);
dim3 dimGrid((nx - 1) / dimx + 1 , (ny - 1) / dimy + 1);
//7.Start CUDA timing
cudaEvent_t start, stop;
float gpuTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
//8.Launch the kernel
sumMatrix_kernel<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , nx , ny);
//9.Stop timing and compute the speedup
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU: grid (%d,%d), block (%d,%d), matrix addition time %f ms\n",
dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, gpuTime);
printf("Speedup: %f x\n", cpuTime / gpuTime);
//10.Copy the output from device to host
cudaMemcpy(hostC, deviceC, sizeof(float) * width, cudaMemcpyDeviceToHost);
//11.Free device memory
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
//12.Verify that the serial and parallel results match
for(int i = 0; i < width; i++)
{
if(fabs(hostCPUTest[i] - hostC[i]) > 1e-5)
{
printf("Result mismatch: CPU(%f) != GPU(%f)\n", hostCPUTest[i], hostC[i]);
exit(1);
}
}
//13.Free host memory
free(hostA);
free(hostB);
free(hostC);
free(hostCPUTest);
printf("Done.\n");
return 0;
}
6. Compute trigonometric functions in parallel with multiple streams, compare against single-stream performance, and practice stream usage, error management, and event timing. Also complete:
- verify correctness by comparing the parallel result against the serial result (a verification sketch is given after the listing);
- compare the serial and parallel computation times (for the parallel version, ignore the host-device copy time and count only the kernel execution time).
/*
CUDA implementation: compute trigonometric functions with multiple streams,
compare against single-stream performance, and practice stream usage,
error management, and event timing
*/
#include <stdio.h>
#include <stdlib.h>
// Macro: check whether a CUDA API call returned an error
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
// Macro: retrieve the last error, mainly for checking kernel launches
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)
inline void __cudaCheckError(const char *file, const int line)
{
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
return;
}
//Kernel: compute sin(x)^2 + cos(x)^2 and add it to a[id]
__global__ void kernel(float *a, int offset)
{
int id = offset + threadIdx.x + blockIdx.x * blockDim.x;
float x = (float)id;
float s = sinf(x);
float c = cosf(x);
a[id] = a[id] + (s * s + c * c);
}
int main(int argc, char **argv)
{
// number of streams
int nStreams = (argc > 1) ? (atoi(argv[1])) : 4;
// thread block size
int blockSize = 256;
// total number of elements
int n = (1 << 28);
// total number of bytes
int bytes = n * sizeof(float);
// elements per stream
int streamSize = n / nStreams;
// bytes per stream
int streamBytes = streamSize * sizeof(float);
float *h_a, *d_a;
// allocate pinned host memory and device memory for the result data
CudaSafeCall(cudaMallocHost((void **)&h_a , bytes));
CudaSafeCall(cudaMalloc((void **)&d_a , bytes));
memset(h_a , 0 , bytes);
// start CUDA timing
cudaEvent_t start , stop;
float ms1 = 0.0 , ms2 = 0.0;
CudaSafeCall(cudaEventCreate(&start));
CudaSafeCall(cudaEventCreate(&stop));
CudaSafeCall(cudaEventRecord(start));
// single-stream (default stream) execution
CudaSafeCall(cudaMemcpy(d_a , h_a , bytes , cudaMemcpyHostToDevice));
kernel<<<(n / blockSize) , blockSize>>>(d_a , 0);
// check for kernel launch errors
CudaSafeCall(cudaDeviceSynchronize());
CudaCheckError();
CudaSafeCall(cudaMemcpy(h_a , d_a , bytes , cudaMemcpyDeviceToHost));
// stop timing
CudaSafeCall(cudaEventRecord(stop));
CudaSafeCall(cudaEventSynchronize(stop));
CudaSafeCall(cudaEventElapsedTime(&ms1 , start , stop));
printf("Single stream: transfer and compute time: %fms\n", ms1);
// multi-stream version
// create the streams
memset(h_a , 0 , bytes);
cudaStream_t stream[nStreams];
for(int i = 0; i < nStreams; i++)
{
CudaSafeCall(cudaStreamCreate(&stream[i]));
}
// overlap transfers and kernels across streams; start timing
CudaSafeCall(cudaEventRecord(start));
for(int i = 0; i < nStreams; i++)
{
int offset = i * streamSize;
CudaSafeCall(cudaMemcpyAsync(&d_a[offset] , &h_a[offset] , streamBytes , cudaMemcpyHostToDevice , stream[i]));
kernel<<<(streamSize / blockSize) , (blockSize) , 0 , stream[i]>>>(d_a , offset);
CudaSafeCall(cudaMemcpyAsync(&h_a[offset] , &d_a[offset] , streamBytes , cudaMemcpyDeviceToHost , stream[i]));
}
for(int i = 0; i < nStreams; i++)
{
CudaSafeCall(cudaStreamSynchronize(stream[i]));
}
// stop timing
CudaSafeCall(cudaEventRecord(stop));
CudaSafeCall(cudaEventSynchronize(stop));
CudaSafeCall(cudaEventElapsedTime(&ms2 , start , stop));
printf("With %d streams, asynchronous transfer and compute time: %fms, multi-stream speedup: %.2f\n", nStreams, ms2 , (ms1 / ms2));
CudaSafeCall(cudaEventDestroy(start));
CudaSafeCall(cudaEventDestroy(stop));
for(int i = 0; i < nStreams; i++)
{
CudaSafeCall(cudaStreamDestroy(stream[i]));
}
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFreeHost(h_a));
return 0;
}
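The listing above never verifies the computed values even though the exercise asks for a correctness check. A minimal sketch, assuming it is placed after the multi-stream synchronization loop and that <math.h> is included: each element of h_a started at zero and had sin(x)^2 + cos(x)^2 added exactly once, so every value should be very close to 1.0f.

    float maxError = 0.0f;
    for (int i = 0; i < n; i++)
        maxError = fmaxf(maxError, fabsf(h_a[i] - 1.0f));   // expected value is exactly 1.0
    printf("max deviation from the expected value 1.0: %e\n", maxError);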
7. Transfer a 64MB integer array one way between two GPUs 20 times, trying the following transfer methods:
- staging through the CPU;
- peer-to-peer (P2P) transfer (using cudaMemcpy or cudaMemcpyPeer).
Compare the transfer performance of the two methods, including transfer time and bandwidth (bandwidth = data volume / transfer time, in GB/s). A sketch of this measurement is given right below; the full listing after it is a separate multi-GPU vector-multiplication program.
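A minimal sketch of the measurement described above, assuming two visible devices with IDs 0 and 1 (buffer names are placeholders):

#include <stdio.h>

int main()
{
    const size_t bytes = 64 * 1024 * 1024;   // 64 MB
    const int reps = 20;
    int *d0, *d1, *h;
    cudaSetDevice(0);
    cudaMalloc((void **)&d0, bytes);
    cudaSetDevice(1);
    cudaMalloc((void **)&d1, bytes);
    cudaMallocHost((void **)&h, bytes);      // pinned staging buffer
    cudaSetDevice(0);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms;
    // 1) staged through the CPU: GPU0 -> host -> GPU1
    cudaEventRecord(start);
    for (int i = 0; i < reps; i++) {
        cudaMemcpy(h, d0, bytes, cudaMemcpyDeviceToHost);
        cudaMemcpy(d1, h, bytes, cudaMemcpyHostToDevice);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    // bandwidth = data volume / time; e.g. 20 transfers * 64 MB = 1.34 GB,
    // so 200 ms would mean roughly 6.7 GB/s
    printf("via CPU: %.3f ms, %.2f GB/s\n", ms, (double)bytes * reps / (ms * 1e-3) / 1e9);
    // 2) direct P2P, if the topology supports it
    int canAccess = 0;
    cudaDeviceCanAccessPeer(&canAccess, 0, 1);
    if (canAccess)
        cudaDeviceEnablePeerAccess(1, 0);    // current device 0 gains access to peer 1
    cudaEventRecord(start);
    for (int i = 0; i < reps; i++)
        cudaMemcpyPeer(d1, 1, d0, 0, bytes); // dst on GPU1, src on GPU0
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("P2P: %.3f ms, %.2f GB/s\n", ms, (double)bytes * reps / (ms * 1e-3) / 1e9);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d0);
    cudaSetDevice(1);
    cudaFree(d1);
    cudaFreeHost(h);
    return 0;
}

When peer access is not enabled, cudaMemcpyPeer falls back to staging through host memory, so enabling peer access first is what makes the second measurement a true P2P transfer.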
/*
CUDA implementation: large-scale element-wise vector multiplication across multiple GPUs
Run with: yhrun -n 1 -p TH_GPU ./multi_GPUs 4 30
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
// Macro: check whether a CUDA API call returned an error
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
// Macro: retrieve the last error, mainly for checking kernel launches
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)
inline void __cudaCheckError(const char *file, const int line)
{
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
printf("11111\n");
return;
}
//CPU wall-clock time, for timing the serial implementation
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}
//Initialize the array with random values
void initialData(float *A , int width)
{
for(int i = 0; i < width; i++)
{
A[i] = (float)rand() / (float)RAND_MAX;
}
}
//Serial element-wise vector multiplication
void sumOnHost(float *A, float *B, float *C, int width)
{
for(int i = 0; i < width; i++)
{
C[i] = A[i] * B[i];
}
}
//Compare the computed results
void checkResult(float *hostResult, float *gpuResult, int width)
{
double epsilon = 1.0E-8;
for(int i = 0; i < width; i++)
{
if(fabs(hostResult[i] - gpuResult[i]) > epsilon)
{
printf("第%d个计算结果不同:host %.2f , gpu %.2f\n", i , hostResult[i] , gpuResult[i]);
exit(1);
}
}
}
//Kernel: element-wise multiplication of two vectors
__global__ void kernel(float *A , float *B , float *C , int width)
{
int id = threadIdx.x + blockDim.x * blockIdx.x;
if(id < width)
C[id] = A[id] * B[id];
}
int main(int argc, char **argv)
{
//Query the number of CUDA-capable GPUs
int ngpus;
cudaGetDeviceCount(&ngpus);
printf("Number of CUDA-capable GPUs: %d\n" , ngpus);
//long long allows testing data sizes up to 2^31 elements
long long dataSize = (1LL << 30);
if(argc > 2)
{
//If the requested GPU count exceeds the number of devices, or the data size is not divisible by the GPU count, the input is invalid and the full device count is used instead
dataSize = (1LL << atoi(argv[2]));
ngpus = ((atoi(argv[1]) > ngpus) || (dataSize % ngpus != 0)) ? (ngpus) : atoi(argv[1]);
}
//Array length assigned to each GPU
int gpuSize = dataSize / ngpus;
//Bytes assigned to each GPU
size_t gpuBytes = gpuSize * sizeof(float);
printf("Single array size: %lld MB; the 3 arrays (2 inputs + 1 output) total %lld MB.\nUsing %d GPU(s); each GPU needs %lld MB of device memory.\n",
(long long)(dataSize * sizeof(float) / 1024 / 1024) ,
(long long)(dataSize * sizeof(float) / 1024 / 1024 * 3),
ngpus ,
(long long)(gpuBytes / 1024 / 1024 * 3));
//Check whether a single GPU has enough global memory (a K80 here, with 11.17 GB)
if((gpuBytes / 1024 / 1024 * 3) > 11170)
{
printf("Not enough global memory on a single GPU, exiting!\n");
exit(0);
}
//Declare the pointer arrays
float *d_A[ngpus] , *d_B[ngpus] , *d_C[ngpus];
float *h_A[ngpus] , *h_B[ngpus];
float *hostResult[ngpus] , *gpuResult[ngpus];
//Declare one stream per GPU
cudaStream_t stream[ngpus];
//Select each device, allocate its pinned host memory and device memory, create its stream, and initialize the host data
for(int i = 0; i < ngpus; i++)
{
cudaSetDevice(i);
cudaMalloc((void **)&d_A[i] , gpuBytes);
cudaMalloc((void **)&d_B[i] , gpuBytes);
cudaMalloc((void **)&d_C[i] , gpuBytes);
cudaMallocHost((void **)&h_A[i] , gpuBytes);
cudaMallocHost((void **)&h_B[i] , gpuBytes);
cudaMallocHost((void **)&hostResult[i] , gpuBytes);
cudaMallocHost((void **)&gpuResult[i] , gpuBytes);
cudaStreamCreate(&stream[i]);
initialData(h_A[i] , gpuSize);
initialData(h_B[i] , gpuSize);
}
//Start timing
dim3 blockSize = 512;
dim3 gridSize = (gpuSize + blockSize.x - 1) / blockSize.x;
cudaEvent_t start , stop;
float gpuTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start , 0);
//Drive the GPUs in parallel: asynchronous transfers and kernel launches
for(int i = 0; i < ngpus; i++)
{
cudaSetDevice(i);
cudaMemcpyAsync(d_A[i] , h_A[i] , gpuBytes , cudaMemcpyHostToDevice , stream[i]);
cudaMemcpyAsync(d_B[i] , h_B[i] , gpuBytes , cudaMemcpyHostToDevice , stream[i]);
kernel<<<gridSize , blockSize , 0 , stream[i]>>>(d_A[i] , d_B[i] , d_C[i] , gpuSize);
cudaMemcpyAsync(gpuResult[i] , d_C[i] , gpuBytes , cudaMemcpyDeviceToHost , stream[i]);
}
//Synchronize the streams and stop timing
cudaEventRecord(stop , 0);
for(int i = 0; i < ngpus; i++)
{
cudaSetDevice(i);
cudaStreamSynchronize(stream[i]);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start , stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//Serial element-wise multiplication on the CPU, with timing, then verify the GPU results
double cpuStart = cpuSecond();
for(int i = 0; i < ngpus; i++)
{
sumOnHost(h_A[i], h_B[i], hostResult[i] , gpuSize);
checkResult(hostResult[i] , gpuResult[i] , gpuSize);
}
double cpuTime = (cpuSecond() - cpuStart) * 1000.0;
printf("CPU serial computation time: %f ms\n", cpuTime);
printf("Using %d GPU(s), asynchronous transfer and compute time: %fms\nSpeedup of parallel over serial: %.2f\n",
ngpus, gpuTime , (cpuTime / gpuTime));
//Free the memory and the streams
for(int i = 0; i < ngpus; i++)
{
cudaSetDevice(i);
cudaFree(d_A[i]);
cudaFree(d_B[i]);
cudaFree(d_C[i]);
cudaFreeHost(h_A[i]);
cudaFreeHost(h_B[i]);
cudaFreeHost(hostResult[i]);
cudaFreeHost(gpuResult[i]);
cudaStreamDestroy(stream[i]);
}
return 0;
}