1D Convolution with CUDA

1D Convolution

Write a CUDA program to do the following, i.e. compute the full 1D convolution of two length-n arrays, which has 2*n-1 outputs:

    // n can be any integer > 0.
    void convolve(float* a, float* b, float* c, int n) {
        for (int i = 0; i < 2*n - 1; ++i)
            c[i] = 0.0f;
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j)
                c[i+j] += a[i] * b[j];
    }
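As a quick check of the definition: for n = 2 with a = {1, 2} and b = {3, 4}, the output is c = {1*3, 1*4 + 2*3, 2*4} = {3, 10, 8}; in general c[k] is the sum of a[i]*b[j] over all index pairs with i + j = k.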
In the latest attempt (2011, by Francis Fok) on xi03, the speedups of CUDA over the CPU were:
ratio = 38.82 when n = 100000,
ratio = 48.74 when n = 500000,
ratio = 153.24 when n = 1000000.

We use ICC with -O3.


(My reference code below is faster than the results above.)


Code (for reference only; it uses neither atomicAdd nor the Thrust API):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define THREAD_NUM 512

void init_arr(float *arr, int len)
{
    int i = 0;
    for (i = 0; i < len; i++) {
        arr[i] = 0.0f;
    }
}

void rand_arr(float *arr, int len)
{
    int i = 0;
    for (i = 0; i < len; i++) {
        arr[i] = (rand() % 1000) * 0.01f;
    }
}

/* CPU reference: full 1D convolution in O(n^2); returns elapsed clock ticks. */
clock_t convolve(const float *a, const float *b, float *d, int n)
{
    clock_t startCPU, endCPU;
    int i, j = 0;
    startCPU = clock();
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++) {
            d[i + j] += a[i] * b[j];
        }
    endCPU = clock();
    return endCPU - startCPU;
}

/* GPU kernel: thread idx computes output elements c[idx] and c[n+idx]. */
__global__ static void ConvolveCUDA(const float *a, const float *b, float *c, int n)
{
    int i = 0;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Method 3 (improved from Method 2)
    if (idx < n) {
        float t1 = 0;
        float t2 = 0;
        for (i = 0; i <= idx; i++) {
            t1 += a[i] * b[idx - i];
        }
        for (i = idx + 1; i < n; i++) {
            t2 += a[i] * b[n + idx - i];
        }
        c[idx] = t1;
        if (idx < n - 1)    // c has only 2*n-1 elements; for idx == n-1, t2 is 0 and there is no slot for it
            c[n + idx] = t2;
    }

    /* // Method 2
    for (i = 0; i < n; i++) {
        if (idx >= i) {
            t1 += a[i] * b[idx - i];
        } else {
            t2 += a[i] * b[n + idx - i];
        }
        c[idx] = t1;
        c[n + idx] = t2;
    }
    */

    /* // Method 1
    if (idx < (2*n - 1)) {
        if (idx <= (n - 1)) {
            float t = 0;
            for (i = 0; i <= idx; i++) {
                t += a[i] * b[idx - i];
            }
            c[idx] = t;
        }
        if (idx > (n - 1)) {
            idx = 2*n - 1 - idx - 1;
            float t = 0;
            for (i = 0; i <= idx; i++) {
                t += a[(n - 1) - (idx - i)] * b[(n - 1) - i];
            }
            c[2*n - 1 - idx - 1] = t;
        }
    }
    */
}

/* Host wrapper: allocates device buffers, copies data, launches the kernel,
 * copies the result back, and returns elapsed clock ticks (including transfers). */
clock_t convolveCUDA(const float *a, const float *b, float *c, int n)
{
    float *a_d, *b_d, *c_d;
    clock_t start, end;
    int BLOCK_NUM = n / THREAD_NUM + ((n % THREAD_NUM > 0) ? 1 : 0);
    //int BLOCK_NUM = (n + THREAD_NUM) / THREAD_NUM;

    cudaMalloc((void**) &a_d, sizeof(float) * n);
    cudaMalloc((void**) &b_d, sizeof(float) * n);
    cudaMalloc((void**) &c_d, sizeof(float) * (2*n - 1));

    start = clock();
    cudaMemcpy(a_d, a, sizeof(float) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, sizeof(float) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(c_d, c, sizeof(float) * (2*n - 1), cudaMemcpyHostToDevice);
    ConvolveCUDA<<<BLOCK_NUM, THREAD_NUM>>>(a_d, b_d, c_d, n);
    cudaMemcpy(c, c_d, sizeof(float) * (2*n - 1), cudaMemcpyDeviceToHost);
    end = clock();

    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
    return end - start;
}

/* Compares the GPU result a against the CPU result b element by element. */
void compare_arr(const float *a, const float *b, int len)
{
    float max_err = 0;
    float average_err = 0;
    int i = 0;
    for (i = 0; i < len; i++) {
        if (b[i] != 0) {
            float err = fabs((a[i] - b[i]) / b[i]);
            if (max_err < err)
                max_err = err;
            average_err += err;
        }
    }
    // The cast keeps len*len from overflowing int for large len.
    printf("Max error: %g\tAverage error: %g\n", max_err, average_err / ((double) len * len));
}

int main()
{
    float *a, *b, *c, *d;
    int m, n = 0;

    printf("\nPlease input length n:");
    scanf("%d", &n);
    m = 2*n - 1;

    a = (float*) malloc(sizeof(float) * n);
    b = (float*) malloc(sizeof(float) * n);
    c = (float*) malloc(sizeof(float) * m);
    d = (float*) malloc(sizeof(float) * m);

    srand((unsigned int)time(NULL) + rand());
    rand_arr(a, n);
    rand_arr(b, n);
    init_arr(c, m);
    init_arr(d, m);
    //for (int i = 0; i < n; i++) { printf("a[%d] = %.2f\t\tb[%d] = %.2f\n", i, a[i], i, b[i]); }
    //printf("\n");

    clock_t timeGPU = convolveCUDA(a, b, c, n);
    clock_t timeCPU = convolve(a, b, d, n);
    //clock_t timeCPU = 1;
    compare_arr(c, d, m);

    double secGPU = (double) timeGPU / CLOCKS_PER_SEC;
    double secCPU = (double) timeCPU / CLOCKS_PER_SEC;
    float ratio = secCPU / secGPU;
    printf("CPU vs GPU Time used: %.2f vs %.2f\n", secCPU, secGPU);
    printf("CPU vs GPU ratio: %.2f\n\n", ratio);
    //for (int j = 0; j < m; j++) { printf("c[%d] = %.2f\t\td[%d] = %.2f\n", j, c[j], j, d[j]); }
    //printf("\n");

    free(a);
    free(b);
    free(c);
    free(d);
}
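In Method 3, thread idx (0 <= idx < n) computes two disjoint outputs: c[idx] = a[0]*b[idx] + ... + a[idx]*b[0] for the first half of the result, and c[n+idx] = a[idx+1]*b[n-1] + ... + a[n-1]*b[idx+1] for the second half. Every output element is written by exactly one thread, so no atomic operations are needed. To build it, a command along the lines of "nvcc -O3 ex1.cu -o ex1_gpu" should do; the source file name ex1.cu is an assumption, while the binary name matches the transcript below.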

Result:

[michaelchen@xi03 ex1]$ ./ex1_gpu (Method 3)


Please input length n:100000

Max error: 1.72791e-06 Average error: 2.17172e-11

CPU vs GPU Time used: 12.43 vs 0.16

CPU vs GPU ratio: 77.69


[michaelchen@xi03 ex1]$ ./ex1_gpu


Please input length n:500000

Max error: 2.94493e-06 Average error: -5.19955e-10

CPU vs GPU Time used: 321.91 vs 3.87

CPU vs GPU ratio: 83.18



[michaelchen@xi03 ex1]$ ./ex1_gpu_3 (Method 1)


Please input length n:100000

Max error: 1.73582e-06 Average error: 2.174e-11

CPU vs GPU Time used: 12.43 vs 0.47

CPU vs GPU ratio: 26.45


[michaelchen@xi03 ex1]$ ./ex1_gpu_3


Please input length n:500000

Max error: 2.83556e-06 Average error: -5.20274e-10

CPU vs GPU Time used: 318.16 vs 4.50

CPU vs GPU ratio: 70.70



Notes:
  1. Measuring running time: http://www.cnitblog.com/mantou/archive/2005/08/24/2304.html
  2. Random strings: http://topic.csdn.net/u/20080324/03/1381e449-ed79-47e0-837e-64cada4439c2.html
  3. vim syntax highlighting:
    au BufNewFile,BufReadPost *.cl,*.cu,*.cuh, set ft=cpp
    set nu
  4. nvcc path setup: after editing ~/.bashrc there is no need to restart; run source ~/.bashrc to reload the configuration.
  5. The which command shows where a binary is located.
  6. Reference site and examples: https://sites.google.com/a/kimicat.com/www/cuda%E7%B0%A1%E4%BB%8B
  7. Editor: Notepad++ (for files with the .cu extension, choose C as the display language to get syntax highlighting).
  8. GPU (kernel) functions must be written outside main.
  9. Basic multithreaded programs are very simple to implement in CUDA.
  10. http://code.google.com/p/thrust/
  11. atomicAdd supports floating point; just add "-arch=sm_20" when compiling (see the sketch after this list).
  12. Background on -O3 and the other optimization levels (http://www.gentoo.org/doc/zh_cn/gcc-optimization.xml):
    -O0: This level (the letter "O" followed by a zero) turns off all optimizations and is the default when no -O level is set in CFLAGS or CXXFLAGS. The code is not optimized at all, which is usually not what we want.
    -O1: The most basic optimization level. The compiler tries to produce faster, smaller code without spending much compilation time. These optimizations are very basic, but they generally get the job done.
    -O2: A step up from -O1. This is the recommended level unless you have special requirements. -O2 enables a few more flags than -O1. With -O2, the compiler tries to improve performance without increasing code size or consuming excessive compilation time.
    -O3: The highest and most dangerous optimization level. It lengthens compile times and should not be enabled system-wide on gcc 4.x systems. gcc's behavior has changed considerably since 3.x: with 3.x, -O3 code was only slightly faster than -O2, and with gcc 4.x it is not necessarily faster at all. Compiling all packages with -O3 produces larger, more memory-hungry binaries and greatly increases the chance of compilation failures or unpredictable program behavior (including bugs). It is not worth it; remember that more is not always better. Using -O3 with gcc 4.x is not recommended.
    -Os: This level optimizes for code size. It enables the -O2 options that do not increase the size of the generated code. It is useful on machines with very limited disk space or small CPU caches, but it can also cause problems, so most ebuilds in the tree filter out this level. Using -Os is not recommended.
    As mentioned above, -O2 is the recommended optimization level. If software fails to compile, first check whether -O3 is enabled, then try dropping CFLAGS and CXXFLAGS back to a lower level such as -O1 or even -O0 -g2 -ggdb (for error reporting and debugging) and recompile.
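
Regarding note 11, here is a minimal sketch of an atomicAdd-based convolution kernel. It is not the reference code above, and the kernel name convolve_atomic is made up for illustration. Each thread handles one index i and scatters its partial products into c, so the adds must be atomic; float atomicAdd requires compute capability 2.0 or higher (hence -arch=sm_20), and c must be zeroed on the device (e.g. with cudaMemset) before the launch.

    __global__ void convolve_atomic(const float *a, const float *b, float *c, int n)
    {
        int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n) {
            float ai = a[i];
            for (int j = 0; j < n; ++j) {
                // Different threads can hit the same c[i+j], so the update must be atomic.
                atomicAdd(&c[i + j], ai * b[j]);
            }
        }
    }

This version is easier to get right than Method 3, but the contended atomic updates generally make it slower, which is presumably why the reference code above avoids atomicAdd.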


