1.向量点积 cuda dot product
Question 1: Dot product
Dot product is a reduction from vectors to a scalar.
Please implement the kernel of dot product in CUDA. The host code is provided. Your work will be evaluated by accuracy and efficiency.
计算两个向量的点积 经典的官方教程
#include <stdio.h>
#define imin(a,b) (a<b?a:b)
const int N = 4096 * 4096;
const int threadsPerBlock = 512;
const int blocksPerGrid = imin(32, (N+threadsPerBlock-1) / threadsPerBlock); //32768.99取整
__global__ void dot( float *a, float *b, float *c ) {
// Shared memory for results of multiplication
__shared__ float temp[threadsPerBlock]; //对于每个block缓存对应的thread的点积
int tid = threadIdx.x + blockIdx.x * blockDim.x; //tid为线程的偏移量
float move = 0;
while (tid < N){
move += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
temp[threadIdx.x] = move;
//wait all threads calculate over
__syncthreads();
// Thread 0 sums the pairwise products
// calculate current block. 而不是将thread_sum放在thread_0计算
int i = blockDim.x/2;
while (i != 0){
if (threadIdx.x < i)
temp[threadIdx.x] += temp[threadIdx.x + i];
__syncthreads();
i /= 2;
}
if (threadIdx.x == 0)
c[blockIdx.x] = temp[0];
}
int main( void ) {
float *a, *b, c, *partial_c;
float *dev_a, *dev_b, *dev_partial_c;
// allocate memory on the cpu side
a = (float*)malloc( N*sizeof(float) );
b = (float*)malloc( N*sizeof(float) );
partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_b,
N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_partial_c,
blocksPerGrid*sizeof(float) ); //注意这里开辟空间的大小
// fill in the host memory with data
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i*2;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_a, a, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_b, b, N*sizeof(float),
cudaMemcpyHostToDevice ) ;
//注意thread的wait
dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
dev_partial_c );
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy( partial_c, dev_partial_c,
blocksPerGrid*sizeof(float),
cudaMemcpyDeviceToHost ) ;
// finish up on the CPU side
// 统计block_sum in grid
c = 0;
for (int i=0; i<blocksPerGrid; i++) {
c += partial_c[i];
}
//#define sum_squares(x) (x*(x+1)*(2*x+1)/6) //此程序相当于计算0,1,...N-1的平方和
//printf("Does GPU value %.10g = %.10g?\n", c, 2*sum_squares((float)(N-1)));
//printf("Does GPU value %f = %f?\n", c, 2*sum_squares((float)(N-1)));
// free memory on the gpu side
cudaFree( dev_a ) ;
cudaFree( dev_b ) ;
cudaFree( dev_partial_c );
// free memory on the cpu side
free( a );
free( b );
free( partial_c );
}
/*
blockDim.x,y,z gives the number of threads in a block, in the particular direction
gridDim.x,y,z gives the number of blocks in a grid, in the particular direction
blockDim.x * gridDim.x gives the number of threads in a grid (in the x direction, in this case)
1 block的官方examle
__global__ voiddot( int*a, int*b, int*c ) {
__shared__ inttemp[N];
temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
__syncthreads();
if( 0 == threadIdx.x ) {
intsum = 0;
for( inti= 0; i< N; i++ )
sum += temp[i];
*c = sum;
}
}
*/
2.cuda实现枚举排序
枚举排序(Enumeration Sort)是一种最简单的排序算法,通常也称为秩排序(Rank Sort)。该算法的具体思想是(假设按关键字递增排序),对每一个待排序的元素统计小于它的所有元素的个数,从而得到该元素最终处于序列中的位置。假定待排序的n个数存在a[1], …, a[n]中。首先将a[1]与a[2],…, a[n]比较,记录比其小的数的个数,假设为k,则a[1]就被存入有序的数组b[1], b[2],…, b[n]的b[k+1]位置上;然后将a[2]与a[1], a[3],…, a[n]比较,记录比其小的数的个数,以此类推。这样的比较操作共n(n-1)次,所以串行秩排序的时间复杂度为O(n2). 请用CUDA并行枚举排序法
//感觉block没用好 是不是再遍历全部array之前有个combine操作?
//目前计算是9s 有点慢呢
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 512
__global__ void RankSortKernel(const unsigned int *ori, unsigned int *sorted, unsigned int len)
{
unsigned int final_location = 0;
unsigned int same = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for(int i=0;i<len;i++){
if(ori[tid] == ori[i])
same ++;
if(ori[tid] > ori[i])
final_location ++;
}
for(int i=0;i<same;i++){
sorted[final_location+i] = ori[tid];
}
}
int main ( int argc, char *argv[] )
{
unsigned int len = 1024*1024;
unsigned int *ori = new unsigned int[len];
unsigned int *gpuSorted = new unsigned int[len];
unsigned int *d_ori = NULL;
unsigned int *d_sorted = NULL;
unsigned int size = len*sizeof(unsigned int);
unsigned int numBlock = len/BLOCK_SIZE;
dim3 dimGrid(numBlock); //num of block in grid
dim3 dimBlock(BLOCK_SIZE);
cudaMalloc((void**)&d_ori, size);
cudaMalloc((void**)&d_sorted, size);
for (int i=0;i<len;i++)
ori[i] = rand()%100;
printf("init origin array ok\n");
cudaMemcpy(d_ori, ori, size, cudaMemcpyHostToDevice);
cudaMemset(d_sorted, 0, size);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start,0);
RankSortKernel<<<dimGrid, dimBlock>>>(d_ori, d_sorted, len); //cuda kernel函数
//阻塞until device结束
cudaThreadSynchronize();
cudaEventCreate(&stop);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start,stop);
printf("sort_Elapsed time : %f ms\n" ,elapsedTime);
cudaMemcpy(gpuSorted, d_sorted, len*sizeof(unsigned int), cudaMemcpyDeviceToHost);
for (int i=0;i<len;i++){
//printf("%d ",gpuSorted[i]);
if (i==len-1)
break;
if(gpuSorted[i] > gpuSorted[i+1])
printf("sort failed in %d\n",gpuSorted[i]);
}
printf("sort ok\n");
cudaFree(d_ori);
cudaFree(d_sorted);
delete ori;
delete gpuSorted;
return 0;
}
/*
1.cuda中kernel函数<<<dim3>>>和<<<int,int>>>区别?
https://stackoverflow.com/questions/31141541/cuda-block-grid-dimensions-when-to-use-dim3
*/