归约方法
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<device_functions.h>
using namespace std;
// Interleaved-addressing tree reduction: sums a[0..blockDim.x-1] into b[0].
// Expects a single-block launch: jia<<<1, N>>>(da, db).
__global__ void jia(int a[101], int b[101]) {
    __shared__ int sdata[101];
    int idx = threadIdx.x;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[idx] = a[x];
    __syncthreads();
    // Interleaved addressing: stride s doubles each pass; a thread whose
    // index is a multiple of 2*s accumulates its partner at idx + s.
    for (int s = 1; s < blockDim.x; s *= 2) {
        // Partner guard: blockDim.x (101 here) is not a power of two, so
        // idx + s can run past the valid range (e.g. idx=64, s=64 would
        // read sdata[128]); the original code read out of bounds here.
        if (idx % (s * 2) == 0 && idx + s < blockDim.x) {
            sdata[idx] += sdata[idx + s];
        }
        __syncthreads();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: ha[i] = i for i in [0,100]; expected reduction result = 5050.
    int ha[101];
    int hb[101];
    int *da, *db;
    for (int i = 0; i <= 100; i++)
        ha[i] = i;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 101));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 101));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 101, cudaMemcpyHostToDevice));
    dim3 Block(101);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
优化一
在上一方法中,由于线程是间隔使用的,会造成很大的资源浪费
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<device_functions.h>
using namespace std;
// Reduction with compacted thread indexing: the active threads form a
// contiguous prefix (thread idx owns slot 2*s*idx), which removes the
// expensive modulo test of the interleaved version and keeps full warps
// busy. Expects a single-block launch: jia<<<1, N>>>(da, db).
__global__ void jia(int a[101], int b[101]) {
    __shared__ int sdata[101];
    int idx = threadIdx.x;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[idx] = a[x];
    __syncthreads();
    for (int s = 1; s < blockDim.x; s *= 2) {
        int indx = 2 * s * idx;  // slot this thread accumulates into
        // Guard the partner index too: blockDim.x (101) is not a power of
        // two, so indx + s can pass the end of sdata (the original only
        // checked indx and read out of bounds).
        if (indx + s < blockDim.x) {
            sdata[indx] += sdata[indx + s];
        }
        __syncthreads();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: ha[i] = i for i in [0,100]; expected reduction result = 5050.
    int ha[101];
    int hb[101];
    int *da, *db;
    for (int i = 0; i <= 100; i++)
        ha[i] = i;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 101));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 101));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 101, cudaMemcpyHostToDevice));
    dim3 Block(101);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
优化二
提高内存访问效率
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<device_functions.h>
using namespace std;
// Sequential-addressing reduction: active threads form a halving contiguous
// prefix, giving coalesced, bank-conflict-free shared-memory access.
// Expects a single-block launch with blockDim.x a power of two (128 here).
__global__ void jia(int a[128], int b[128]) {
    // One slot per thread. The original declared sdata[100] while the
    // kernel runs with 128 threads, so threads 100..127 wrote past the
    // end of shared memory.
    __shared__ int sdata[128];
    int idx = threadIdx.x;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[idx] = a[x];
    __syncthreads();
    // Stride halves each pass; threads [0, s) add their partner at idx + s.
    for (int s = blockDim.x / 2; s > 0; s /= 2) {
        if (idx < s) {
            sdata[idx] += sdata[idx + s];
        }
        __syncthreads();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: a[0..99] = 1..100, zero-padded to 128 (power of two);
    // expected reduction result = 5050.
    int ha[128] = { 0 };
    int hb[128];
    int* da, * db;
    for (int i = 0; i < 100; i++)
        ha[i] = i + 1;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 128));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 128));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 128, cudaMemcpyHostToDevice));
    dim3 Block(128);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
优化三
每个线程在加载共享内存时可以同时加载两个数值并先进行一次加法归约,提高效率
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<math.h>
#include<device_functions.h>
using namespace std;
// First-add-during-load reduction: each thread pre-sums a PAIR of input
// elements while filling shared memory, halving the number of reduction
// passes. Intended launch: 64 threads for the 128-element input.
__global__ void jia(int a[128], int b[128]) {
    __shared__ int sdata[128];  // one slot per thread (was 100: OOB writes)
    int idx = threadIdx.x;
    // *2 on the block offset keeps indexing correct if multiple blocks are
    // ever used (identical to the original for the single-block launch).
    int x = blockIdx.x * (blockDim.x * 2) + threadIdx.x * 2;
    // Guard the paired load: with the file's historical <<<1,128>>> launch,
    // x reaches 254 while a[] has only 128 elements. Out-of-range threads
    // contribute the additive identity instead of reading past the input.
    sdata[idx] = (x + 1 < 128) ? a[x] + a[x + 1] : 0;
    __syncthreads();
    // Sequential-addressing tree reduction over the pre-summed pairs.
    for (int s = blockDim.x / 2; s > 0; s /= 2) {
        if (idx < s) {
            sdata[idx] += sdata[idx + s];
        }
        __syncthreads();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: a[0..99] = 1..100, zero-padded to 128; expected sum = 5050.
    int ha[128] = { 0 };
    int hb[128];
    int* da, * db;
    for (int i = 0; i < 100; i++)
        ha[i] = i + 1;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 128));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 128));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 128, cudaMemcpyHostToDevice));
    // The kernel pre-sums TWO elements per thread, so 128 inputs need only
    // 64 threads; launching 128 (as the original did) makes every extra
    // thread read past the end of the input array.
    dim3 Block(64);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
优化四
当执行到只剩32个线程在计算时,只有一个warp在工作而其他warp空闲,会造成资源浪费;我们可以单独将最后一个warp展开运行
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<math.h>
#include<device_functions.h>
using namespace std;
// Reduction with the last warp unrolled: the loop stops once 64 partial
// sums remain, and the final 6 steps run without __syncthreads() inside a
// single warp. Intended launch: 64 threads for the 128-element input.
__global__ void jia(int a[128], int b[128]) {
    __shared__ int sdata[128];  // one slot per thread (was 100: OOB writes)
    int idx = threadIdx.x;
    // Each thread pre-sums a pair; *2 on the block offset keeps indexing
    // correct if multiple blocks are ever used.
    int x = blockIdx.x * (blockDim.x * 2) + threadIdx.x * 2;
    // Guard: with the file's historical <<<1,128>>> launch x reaches 254,
    // past the 128-element input; out-of-range threads contribute 0.
    sdata[idx] = (x + 1 < 128) ? a[x] + a[x + 1] : 0;
    __syncthreads();
    // Tree-reduce until 64 partials remain; the last warp finishes below.
    for (int s = blockDim.x / 2; s > 32; s /= 2) {
        if (idx < s) {
            sdata[idx] += sdata[idx + s];
        }
        __syncthreads();
    }
    // Final 64 -> 1 inside one warp. 'volatile' stops the compiler from
    // caching sdata in registers, and __syncwarp() orders the lanes --
    // required on Volta+ where warp lanes no longer execute in lockstep,
    // so the original unguarded version raced.
    if (idx < 32) {
        volatile int* v = sdata;
        v[idx] += v[idx + 32]; __syncwarp();
        v[idx] += v[idx + 16]; __syncwarp();
        v[idx] += v[idx + 8];  __syncwarp();
        v[idx] += v[idx + 4];  __syncwarp();
        v[idx] += v[idx + 2];  __syncwarp();
        v[idx] += v[idx + 1];  __syncwarp();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: a[0..99] = 1..100, zero-padded to 128; expected sum = 5050.
    int ha[128] = { 0 };
    int hb[128];
    int* da, * db;
    for (int i = 0; i < 100; i++)
        ha[i] = i + 1;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 128));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 128));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 128, cudaMemcpyHostToDevice));
    // The kernel pre-sums TWO elements per thread, so 128 inputs need only
    // 64 threads; launching 128 (as the original did) makes every extra
    // thread read past the end of the input array.
    dim3 Block(64);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
优化五
由于线程数量必须是2的n次幂,我们可以在同一个block里面将for循环完全展开,从而加快运行速率
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<math.h>
#include<device_functions.h>
using namespace std;
// Fully unrolled reduction: for a 128-element input, 64 threads pre-sum
// pairs during the load, leaving 64 partials that a single unrolled warp
// finishes -- no reduction loop at all. Intended launch: <<<1, 64>>>.
__global__ void jia(int a[128], int b[128]) {
    __shared__ int sdata[128];  // one slot per thread (was 100: OOB writes)
    int idx = threadIdx.x;
    // Each thread pre-sums a pair; *2 on the block offset keeps indexing
    // correct if multiple blocks are ever used.
    int x = blockIdx.x * (blockDim.x * 2) + threadIdx.x * 2;
    // Only sdata[0..63] is summed below, so out-of-range threads (with the
    // file's historical <<<1,128>>> launch x reaches 254, past the input)
    // must write 0 rather than read past the end of a[].
    sdata[idx] = (x + 1 < 128) ? a[x] + a[x + 1] : 0;
    __syncthreads();
    // Final 64 -> 1 inside one warp. 'volatile' stops the compiler from
    // caching sdata in registers, and __syncwarp() orders the lanes --
    // required on Volta+ where warp lanes no longer execute in lockstep,
    // so the original unguarded version raced.
    if (idx < 32) {
        volatile int* v = sdata;
        v[idx] += v[idx + 32]; __syncwarp();
        v[idx] += v[idx + 16]; __syncwarp();
        v[idx] += v[idx + 8];  __syncwarp();
        v[idx] += v[idx + 4];  __syncwarp();
        v[idx] += v[idx + 2];  __syncwarp();
        v[idx] += v[idx + 1];  __syncwarp();
    }
    if (idx == 0) b[0] = sdata[0];
}
// Abort with a message if a CUDA API call fails; errors are silent otherwise.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl;  \
            return 1;                                                    \
        }                                                                \
    } while (0)

int main() {
    // Input: a[0..99] = 1..100, zero-padded to 128; expected sum = 5050.
    int ha[128] = { 0 };
    int hb[128];
    int* da, * db;
    for (int i = 0; i < 100; i++)
        ha[i] = i + 1;
    CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * 128));
    CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * 128));
    CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * 128, cudaMemcpyHostToDevice));
    // The kernel pre-sums TWO elements per thread, so 128 inputs need only
    // 64 threads; launching 128 (as the original did) makes every extra
    // thread read past the end of the input array.
    dim3 Block(64);
    jia<<<1, Block>>>(da, db);
    CUDA_CHECK(cudaGetLastError());   // catches bad launch configurations
    // Blocking copy: also surfaces any asynchronous in-kernel fault.
    CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
    cout << hb[0] << endl;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}