1D Convolution with CUDA

1D Convolution

Write a CUDA program to do the following, i.e. compute the full 1D convolution of two length-n arrays, which has 2*n-1 outputs:

    // n can be any integer > 0.
    void convolve(float* a, float* b, float* c, int n) {
        for (int i = 0; i < 2*n - 1; ++i)
            c[i] = 0.0f;
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j)
                c[i+j] += a[i] * b[j];
    }
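As a quick check of the definition: for n = 2 with a = {1, 2} and b = {3, 4}, the output is c = {1*3, 1*4 + 2*3, 2*4} = {3, 10, 8}; in general c[k] is the sum of a[i]*b[j] over all index pairs with i + j = k.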
In the latest attempt (2011, by Francis Fok) on xi03, the speedups of CUDA over the CPU were:
ratio = 38.82 when n = 100000,
ratio = 48.74 when n = 500000,
ratio = 153.24 when n = 1000000.

We use ICC with -O3.


(My reference code below is faster than the results above.)


Code (for reference only; it uses neither atomicAdd nor the Thrust API):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define THREAD_NUM 512

void init_arr(float *arr, int len)
{
    int i = 0;
    for (i = 0; i < len; i++) {
        arr[i] = 0.0f;
    }
}

void rand_arr(float *arr, int len)
{
    int i = 0;
    for (i = 0; i < len; i++) {
        arr[i] = (rand() % 1000) * 0.01f;
    }
}

/* CPU reference: full 1D convolution in O(n^2); returns elapsed clock ticks. */
clock_t convolve(const float *a, const float *b, float *d, int n)
{
    clock_t startCPU, endCPU;
    int i, j = 0;
    startCPU = clock();
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++) {
            d[i + j] += a[i] * b[j];
        }
    endCPU = clock();
    return endCPU - startCPU;
}

/* GPU kernel: thread idx computes output elements c[idx] and c[n+idx]. */
__global__ static void ConvolveCUDA(const float *a, const float *b, float *c, int n)
{
    int i = 0;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Method 3 (improved from Method 2)
    if (idx < n) {
        float t1 = 0;
        float t2 = 0;
        for (i = 0; i <= idx; i++) {
            t1 += a[i] * b[idx - i];
        }
        for (i = idx + 1; i < n; i++) {
            t2 += a[i] * b[n + idx - i];
        }
        c[idx] = t1;
        if (idx < n - 1)    // c has only 2*n-1 elements; for idx == n-1, t2 is 0 and there is no slot for it
            c[n + idx] = t2;
    }

    /* // Method 2
    for (i = 0; i < n; i++) {
        if (idx >= i) {
            t1 += a[i] * b[idx - i];
        } else {
            t2 += a[i] * b[n + idx - i];
        }
        c[idx] = t1;
        c[n + idx] = t2;
    }
    */

    /* // Method 1
    if (idx < (2*n - 1)) {
        if (idx <= (n - 1)) {
            float t = 0;
            for (i = 0; i <= idx; i++) {
                t += a[i] * b[idx - i];
            }
            c[idx] = t;
        }
        if (idx > (n - 1)) {
            idx = 2*n - 1 - idx - 1;
            float t = 0;
            for (i = 0; i <= idx; i++) {
                t += a[(n - 1) - (idx - i)] * b[(n - 1) - i];
            }
            c[2*n - 1 - idx - 1] = t;
        }
    }
    */
}

/* Host wrapper: allocates device buffers, copies data, launches the kernel,
 * copies the result back, and returns elapsed clock ticks (including transfers). */
clock_t convolveCUDA(const float *a, const float *b, float *c, int n)
{
    float *a_d, *b_d, *c_d;
    clock_t start, end;
    int BLOCK_NUM = n / THREAD_NUM + ((n % THREAD_NUM > 0) ? 1 : 0);
    //int BLOCK_NUM = (n + THREAD_NUM) / THREAD_NUM;

    cudaMalloc((void**) &a_d, sizeof(float) * n);
    cudaMalloc((void**) &b_d, sizeof(float) * n);
    cudaMalloc((void**) &c_d, sizeof(float) * (2*n - 1));

    start = clock();
    cudaMemcpy(a_d, a, sizeof(float) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, sizeof(float) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(c_d, c, sizeof(float) * (2*n - 1), cudaMemcpyHostToDevice);
    ConvolveCUDA<<<BLOCK_NUM, THREAD_NUM>>>(a_d, b_d, c_d, n);
    cudaMemcpy(c, c_d, sizeof(float) * (2*n - 1), cudaMemcpyDeviceToHost);
    end = clock();

    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
    return end - start;
}

/* Compares the GPU result a against the CPU result b element by element. */
void compare_arr(const float *a, const float *b, int len)
{
    float max_err = 0;
    float average_err = 0;
    int i = 0;
    for (i = 0; i < len; i++) {
        if (b[i] != 0) {
            float err = fabs((a[i] - b[i]) / b[i]);
            if (max_err < err)
                max_err = err;
            average_err += err;
        }
    }
    // The cast keeps len*len from overflowing int for large len.
    printf("Max error: %g\tAverage error: %g\n", max_err, average_err / ((double) len * len));
}

int main()
{
    float *a, *b, *c, *d;
    int m, n = 0;

    printf("\nPlease input length n:");
    scanf("%d", &n);
    m = 2*n - 1;

    a = (float*) malloc(sizeof(float) * n);
    b = (float*) malloc(sizeof(float) * n);
    c = (float*) malloc(sizeof(float) * m);
    d = (float*) malloc(sizeof(float) * m);

    srand((unsigned int)time(NULL) + rand());
    rand_arr(a, n);
    rand_arr(b, n);
    init_arr(c, m);
    init_arr(d, m);
    //for (int i = 0; i < n; i++) { printf("a[%d] = %.2f\t\tb[%d] = %.2f\n", i, a[i], i, b[i]); }
    //printf("\n");

    clock_t timeGPU = convolveCUDA(a, b, c, n);
    clock_t timeCPU = convolve(a, b, d, n);
    //clock_t timeCPU = 1;
    compare_arr(c, d, m);

    double secGPU = (double) timeGPU / CLOCKS_PER_SEC;
    double secCPU = (double) timeCPU / CLOCKS_PER_SEC;
    float ratio = secCPU / secGPU;
    printf("CPU vs GPU Time used: %.2f vs %.2f\n", secCPU, secGPU);
    printf("CPU vs GPU ratio: %.2f\n\n", ratio);
    //for (int j = 0; j < m; j++) { printf("c[%d] = %.2f\t\td[%d] = %.2f\n", j, c[j], j, d[j]); }
    //printf("\n");

    free(a);
    free(b);
    free(c);
    free(d);
}
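In Method 3, thread idx (0 <= idx < n) computes two disjoint outputs: c[idx] = a[0]*b[idx] + ... + a[idx]*b[0] for the first half of the result, and c[n+idx] = a[idx+1]*b[n-1] + ... + a[n-1]*b[idx+1] for the second half. Every output element is written by exactly one thread, so no atomic operations are needed. To build it, a command along the lines of "nvcc -O3 ex1.cu -o ex1_gpu" should do; the source file name ex1.cu is an assumption, while the binary name matches the transcript below.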

Result:

[michaelchen@xi03 ex1]$ ./ex1_gpu (Method 3)


Please input length n:100000

Max error: 1.72791e-06 Average error: 2.17172e-11

CPU vs GPU Time used: 12.43 vs 0.16

CPU vs GPU ratio: 77.69


[michaelchen@xi03 ex1]$ ./ex1_gpu


Please input length n:500000

Max error: 2.94493e-06 Average error: -5.19955e-10

CPU vs GPU Time used: 321.91 vs 3.87

CPU vs GPU ratio: 83.18



[michaelchen@xi03 ex1]$ ./ex1_gpu_3 (Method 1)


Please input length n:100000

Max error: 1.73582e-06 Average error: 2.174e-11

CPU vs GPU Time used: 12.43 vs 0.47

CPU vs GPU ratio: 26.45


[michaelchen@xi03 ex1]$ ./ex1_gpu_3


Please input length n:500000

Max error: 2.83556e-06 Average error: -5.20274e-10

CPU vs GPU Time used: 318.16 vs 4.50

CPU vs GPU ratio: 70.70



Notes:
  1. Measuring running time: http://www.cnitblog.com/mantou/archive/2005/08/24/2304.html
  2. Random strings: http://topic.csdn.net/u/20080324/03/1381e449-ed79-47e0-837e-64cada4439c2.html
  3. vim syntax highlighting:
    au BufNewFile,BufReadPost *.cl,*.cu,*.cuh, set ft=cpp
    set nu
  4. nvcc path setup: after editing ~/.bashrc there is no need to restart; run source ~/.bashrc to reload the configuration.
  5. The which command shows where a binary is located.
  6. Reference site and examples: https://sites.google.com/a/kimicat.com/www/cuda%E7%B0%A1%E4%BB%8B
  7. Editor: Notepad++ (for files with the .cu extension, choose C as the display language to get syntax highlighting).
  8. GPU (kernel) functions must be written outside main.
  9. Basic multithreaded programs are very simple to implement in CUDA.
  10. http://code.google.com/p/thrust/
  11. atomicAdd supports floating point; just add "-arch=sm_20" when compiling (see the sketch after this list).
  12. Background on -O3 and the other optimization levels (http://www.gentoo.org/doc/zh_cn/gcc-optimization.xml):
    -O0: This level (the letter "O" followed by a zero) turns off all optimizations and is the default when no -O level is set in CFLAGS or CXXFLAGS. The code is not optimized at all, which is usually not what we want.
    -O1: The most basic optimization level. The compiler tries to produce faster, smaller code without spending much compilation time. These optimizations are very basic, but they generally get the job done.
    -O2: A step up from -O1. This is the recommended level unless you have special requirements. -O2 enables a few more flags than -O1. With -O2, the compiler tries to improve performance without increasing code size or consuming excessive compilation time.
    -O3: The highest and most dangerous optimization level. It lengthens compile times and should not be enabled system-wide on gcc 4.x systems. gcc's behavior has changed considerably since 3.x: with 3.x, -O3 code was only slightly faster than -O2, and with gcc 4.x it is not necessarily faster at all. Compiling all packages with -O3 produces larger, more memory-hungry binaries and greatly increases the chance of compilation failures or unpredictable program behavior (including bugs). It is not worth it; remember that more is not always better. Using -O3 with gcc 4.x is not recommended.
    -Os: This level optimizes for code size. It enables the -O2 options that do not increase the size of the generated code. It is useful on machines with very limited disk space or small CPU caches, but it can also cause problems, so most ebuilds in the tree filter out this level. Using -Os is not recommended.
    As mentioned above, -O2 is the recommended optimization level. If software fails to compile, first check whether -O3 is enabled, then try dropping CFLAGS and CXXFLAGS back to a lower level such as -O1 or even -O0 -g2 -ggdb (for error reporting and debugging) and recompile.
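
Regarding note 11, here is a minimal sketch of an atomicAdd-based convolution kernel. It is not the reference code above, and the kernel name convolve_atomic is made up for illustration. Each thread handles one index i and scatters its partial products into c, so the adds must be atomic; float atomicAdd requires compute capability 2.0 or higher (hence -arch=sm_20), and c must be zeroed on the device (e.g. with cudaMemset) before the launch.

    __global__ void convolve_atomic(const float *a, const float *b, float *c, int n)
    {
        int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n) {
            float ai = a[i];
            for (int j = 0; j < n; ++j) {
                // Different threads can hit the same c[i+j], so the update must be atomic.
                atomicAdd(&c[i + j], ai * b[j]);
            }
        }
    }

This version is easier to get right than Method 3, but the contended atomic updates generally make it slower, which is presumably why the reference code above avoids atomicAdd.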


