#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "device_functions.h"
using namespace std;
// Block-wise maximum reduction (interleaved addressing).
//
// Each block reduces its own slice of `input`, starting at element
// blockIdx.x * blockDim.x, and thread 0 stores that slice's maximum in
// output[blockIdx.x]. The caller combines the per-block results.
//
// input  : device array holding n floats. The reduction is performed in
//          place, so its contents are overwritten with partial results.
// output : device array with at least gridDim.x elements (one per block).
// n      : number of valid elements in `input`.
//
// Only threadIdx.x / blockIdx.x / blockDim.x are used (1D launch expected).
// A block whose slice starts at or beyond n leaves its output slot untouched.
__global__ void reduceMax(float* input, float* output, int n)
{
    const int tid = threadIdx.x;
    const int blockSize = blockDim.x;

    // Offset of this block's slice, computed in 64 bits so the product
    // blockIdx.x * blockDim.x cannot wrap.
    const size_t base = static_cast<size_t>(blockIdx.x) * blockDim.x;

    // The condition depends only on blockIdx.x and n, so every thread of the
    // block takes the same path and nobody is left waiting at __syncthreads().
    if (n <= 0 || base >= static_cast<size_t>(n)) return;

    // Number of in-range elements owned by this block: the tail block may
    // own fewer than blockDim.x elements.
    const size_t remaining = static_cast<size_t>(n) - base;
    const int valid = remaining < static_cast<size_t>(blockSize)
                          ? static_cast<int>(remaining)
                          : blockSize;

    float* data = input + base;
    for (int stride = 1; stride < blockSize; stride *= 2)
    {
        // The partner element must lie inside this block's valid range;
        // otherwise we would read past n or into a neighbouring block's slice.
        if ((tid % (2 * stride)) == 0 && tid + stride < valid)
        {
            if (data[tid] < data[tid + stride])
            {
                data[tid] = data[tid + stride];
            }
        }
        // Reached by every thread of the block on every iteration.
        __syncthreads();
    }
    if (tid == 0)
    {
        output[blockIdx.x] = data[0];
    }
}
// Block-wise sum reduction (interleaved addressing).
//
// Each block reduces its own slice of `input`, starting at element
// blockIdx.x * blockDim.x, and thread 0 stores that slice's sum in
// output[blockIdx.x]. The caller combines the per-block results.
//
// input  : device array holding n floats. The reduction is performed in
//          place, so its contents are overwritten with partial results.
// output : device array with at least gridDim.x elements (one per block).
// n      : number of valid elements in `input`.
//
// Only threadIdx.x / blockIdx.x / blockDim.x are used (1D launch expected).
// A block whose slice starts at or beyond n leaves its output slot untouched.
__global__ void reduceSum(float* input, float* output, int n)
{
    const int tid = threadIdx.x;
    const int blockSize = blockDim.x;

    // Offset of this block's slice, computed in 64 bits so the product
    // blockIdx.x * blockDim.x cannot wrap.
    const size_t base = static_cast<size_t>(blockIdx.x) * blockDim.x;

    // The condition depends only on blockIdx.x and n, so every thread of the
    // block takes the same path and nobody is left waiting at __syncthreads().
    if (n <= 0 || base >= static_cast<size_t>(n)) return;

    // Number of in-range elements owned by this block: the tail block may
    // own fewer than blockDim.x elements.
    const size_t remaining = static_cast<size_t>(n) - base;
    const int valid = remaining < static_cast<size_t>(blockSize)
                          ? static_cast<int>(remaining)
                          : blockSize;

    // Split the array into per-block slices.
    float* data = input + base;
    for (int stride = 1; stride < blockSize; stride *= 2)
    {
        // The partner element must lie inside this block's valid range;
        // otherwise we would add values past n or elements that belong to
        // a neighbouring block (counting them twice).
        if ((tid % (2 * stride)) == 0 && tid + stride < valid)
        {
            data[tid] += data[tid + stride];
        }
        // Reached by every thread of the block on every iteration.
        __syncthreads();
    }
    if (tid == 0)
    {
        output[blockIdx.x] = data[0];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills an array with 0..n-1, reduces it to per-block maxima with reduceMax,
// finishes the reduction on the host and prints the result.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // array initialization
    const int n = 1 << 15;
    float* A = new float[n];
    for (int i = 0; i < n; i++)
    {
        A[i] = static_cast<float>(i);
    }

    // set block and grid dims (ceil-div so every element is covered)
    dim3 block(1024, 1);
    dim3 grid((n - 1) / block.x + 1, 1);

    // One partial maximum per block.
    float* B = new float[grid.x];
    float* t_A = nullptr;
    float* t_B = nullptr;

    bool ok = cudaSucceeded(cudaMalloc(&t_A, sizeof(float) * n), "cudaMalloc(t_A)")
           && cudaSucceeded(cudaMalloc(&t_B, sizeof(float) * grid.x), "cudaMalloc(t_B)")
           && cudaSucceeded(cudaMemcpy(t_A, A, sizeof(float) * n, cudaMemcpyHostToDevice),
                            "host-to-device copy of A");

    if (ok)
    {
        reduceMax<<<grid, block>>>(t_A, t_B, n);
        // A launch does not return a status itself; configuration errors are
        // picked up with cudaGetLastError(), and the blocking copy below
        // reports errors raised while the kernel was running.
        ok = cudaSucceeded(cudaGetLastError(), "reduceMax launch")
          && cudaSucceeded(cudaMemcpy(B, t_B, sizeof(float) * grid.x, cudaMemcpyDeviceToHost),
                           "device-to-host copy of B");
    }

    if (ok)
    {
        // Seed with the first partial result rather than 0 so the answer is
        // also right when every input value is negative.
        float maxValue = B[0];
        for (unsigned int i = 1; i < grid.x; i++)
        {
            if (maxValue < B[i])
            {
                maxValue = B[i];
            }
        }
        std::cout << "MAX=" << maxValue << std::endl;
    }

    // Release device and host buffers on every path; cudaFree on a null
    // pointer performs no operation.
    cudaFree(t_A);
    cudaFree(t_B);
    delete[] A;
    delete[] B;
    return ok ? 0 : 1;
}
CUDA编程 | 规约算法求最值、求和
于 2022-07-03 09:55:19 首次发布