#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<stdio.h>
#include<memory>
#include<cuda.h>
#define arraySize 10
#define threadPerBlock 5
// Enumeration (rank) sort kernel.
//
// Each thread owns one input element d_a[ttid], counts how many elements of
// the whole array must precede it, and writes the element to d_b at that rank.
//
// Launch layout: 1-D grid, 1-D blocks. blockDim.x must equal threadPerBlock,
// because the shared tile below is sized and indexed with that constant.
// Shared memory: threadPerBlock ints (static).
// Parameters:
//   d_a - device pointer to arraySize input values (read only here).
//   d_b - device pointer to arraySize output slots; receives the sorted values.
__global__ void addKernel(int* d_a, int* d_b) {
    __shared__ int cache[threadPerBlock];

    const int tid = threadIdx.x;
    const int ttid = blockIdx.x * threadPerBlock + tid;

    // Threads past the end of the array still help load tiles and must still
    // reach every barrier, so they are masked instead of returning early.
    const bool active = ttid < arraySize;
    const int val = active ? d_a[ttid] : 0;
    int count = 0;

    // Walk the input one tile at a time. The loop bounds depend only on
    // compile-time constants, so every thread of the block executes the same
    // number of iterations and therefore the same number of barriers.
    for (int base = 0; base < arraySize; base += threadPerBlock) {
        const int src = base + tid;
        if (src < arraySize) {
            cache[tid] = d_a[src];
        }
        // Tile must be fully written before any thread reads it.
        __syncthreads();

        // The last tile may be only partially filled.
        int limit = arraySize - base;
        if (limit > threadPerBlock) {
            limit = threadPerBlock;
        }

        if (active) {
            for (int j = 0; j < limit; ++j) {
                const int other = cache[j];
                // Ties are broken by original index so that equal values get
                // distinct ranks instead of colliding on one output slot.
                if (other < val || (other == val && base + j < ttid)) {
                    ++count;
                }
            }
        }
        // All reads of this tile must finish before the next iteration
        // overwrites it. This barrier is unconditional on purpose: placing it
        // inside a data-dependent branch is undefined behaviour.
        __syncthreads();
    }

    if (active) {
        d_b[count] = val;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: copies a fixed test array to the device, runs addKernel to sort
// it, copies the result back and prints it.
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main() {
    // The launch below uses an exact division for the grid size, so a size that
    // is not a multiple of the block size would silently leave the tail
    // elements unprocessed. Fail the build instead of producing wrong output.
    static_assert(arraySize % threadPerBlock == 0,
                  "arraySize must be a multiple of threadPerBlock");

    int h_a[arraySize] = { 5, 9, 3, 4, 8, 10, 7, 1, 2, 6 };
    int h_b[arraySize];
    // Null-initialised so the shared cleanup path can free them unconditionally
    // (cudaFree on a null pointer is a no-op).
    int* d_a = nullptr;
    int* d_b = nullptr;

    const size_t bytes = arraySize * sizeof(int);
    const int blocks = arraySize / threadPerBlock;

    bool ok = cudaOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)") &&
              cudaOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)") &&
              cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice),
                     "host-to-device copy");

    if (ok) {
        addKernel<<<blocks, threadPerBlock>>>(d_a, d_b);
        // A launch does not return a status: configuration errors show up via
        // cudaGetLastError(), faults inside the kernel at the next sync.
        ok = cudaOk(cudaGetLastError(), "addKernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "addKernel execution") &&
             cudaOk(cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy");
    }

    if (ok) {
        printf("The Enumeration sorted Array is: \n");
        for (int i = 0; i < arraySize; i++)
        {
            printf("%d\t", h_b[i]);
        }
        // Terminate the output line so the shell prompt starts on a fresh line.
        printf("\n");
    }

    cudaFree(d_a);
    cudaFree(d_b);
    return ok ? 0 : 1;
}
// CUDA-accelerated sorting
// (Latest recommended article published on 2024-07-25 10:09:27)