#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <iostream>
#include <cmath>
using namespace std;
#define M 3200 // number of persons (rows of the distance matrix)
#define N 3200 // number of cars (columns of the distance matrix)
#define B_S 32 // edge length of a thread block; also the size of the kernel's shared-memory staging arrays
#define tile_x 2 // number of tiles the CPU path splits the car axis into
#define tile_y 2 // number of tiles the CPU path splits the person axis into
#define SHOW // when defined, main prints the input coordinates and both distance matrices
// Persons P[M] x cars C[N] -> distances D[M][N], stored row-major with row stride N.
// Computes the Euclidean distance between every person and every car.
//
// Launch layout: 2D grid of 2D blocks; x indexes cars (columns), y indexes
// persons (rows). blockDim.x and blockDim.y must not exceed B_S because the
// shared staging arrays hold B_S entries each.
//
//   x, y      car coordinates, n entries each (device memory)
//   px, py    person coordinates, m entries each (device memory)
//   distance  output matrix, m rows by n columns, row-major with row stride n
//   m, n      number of persons / number of cars
__global__ void distance_gpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
{
    __shared__ float px_s[B_S], py_s[B_S], x_s[B_S], y_s[B_S];

    const int index_x = blockIdx.x * blockDim.x + threadIdx.x; // car index (column)
    const int index_y = blockIdx.y * blockDim.y + threadIdx.y; // person index (row)
    const bool col_in_range = index_x < n;
    const bool row_in_range = index_y < m;

    // The first thread row of the block stages the car coordinates of the tile.
    if (threadIdx.y == 0 && col_in_range)
    {
        x_s[threadIdx.x] = x[index_x];
        y_s[threadIdx.x] = y[index_x];
    }
    // The first thread column of the block stages the person coordinates of the tile.
    if (threadIdx.x == 0 && row_in_range)
    {
        px_s[threadIdx.y] = px[index_y];
        py_s[threadIdx.y] = py[index_y];
    }

    // Every thread of the block must reach the barrier, so out-of-range threads
    // are only filtered out after it (no early return above this line).
    __syncthreads();

    if (!col_in_range || !row_in_range)
        return;

    const float dx = px_s[threadIdx.y] - x_s[threadIdx.x];
    const float dy = py_s[threadIdx.y] - y_s[threadIdx.x];
    distance[(size_t)n * index_y + index_x] = sqrtf(dx * dx + dy * dy);
}
// CPU reference: fills an m-by-n block of the distance matrix.
//
//   x, y      car coordinates, n entries each
//   px, py    person coordinates, m entries each
//   distance  pointer to the first element of the block to fill; consecutive
//             rows are N floats apart, so the block may be a sub-tile of the
//             full M-by-N matrix
//   m, n      number of persons (rows) / number of cars (columns) in the block
void distance_cpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
{
    for (int i = 0; i < m; i++)
    {
        const float person_x = px[i];
        const float person_y = py[i];
        float *row = distance + (size_t)i * N;
        for (int j = 0; j < n; j++)
        {
            // Keep the differences in float: storing them in int would
            // truncate fractional coordinates before squaring.
            const float dx = person_x - x[j];
            const float dy = person_y - y[j];
            row[j] = sqrtf(dx * dx + dy * dy);
        }
    }
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Runs the distance computation on the GPU and prints the elapsed time.
//
//   x, y      car coordinates on the host, n entries each
//   px, py    person coordinates on the host, m entries each
//   distance  host output buffer for m*n floats (row-major)
//   m, n      number of persons / number of cars; forwarded to distance_gpu
//
// The timed region covers the host-to-device copies, the kernel and the
// device-to-host copy. Any CUDA failure aborts the program.
void compute_gpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
{
    // A grid dimension of zero is an invalid launch configuration.
    if (m <= 0 || n <= 0)
        return;

    const size_t person_bytes = sizeof(float) * (size_t)m;
    const size_t car_bytes = sizeof(float) * (size_t)n;
    const size_t matrix_bytes = sizeof(float) * (size_t)m * (size_t)n;

    float *dx = NULL, *dy = NULL, *dpx = NULL, *dpy = NULL, *dd = NULL;
    check_cuda(cudaMalloc((void **)&dpx, person_bytes), "cudaMalloc(dpx)");
    check_cuda(cudaMalloc((void **)&dpy, person_bytes), "cudaMalloc(dpy)");
    check_cuda(cudaMalloc((void **)&dx, car_bytes), "cudaMalloc(dx)");
    check_cuda(cudaMalloc((void **)&dy, car_bytes), "cudaMalloc(dy)");
    check_cuda(cudaMalloc((void **)&dd, matrix_bytes), "cudaMalloc(dd)");

    // Start of the timed region.
    float elapsedTime = 0.0f;
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    check_cuda(cudaMemcpy(dx, x, car_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x)");
    check_cuda(cudaMemcpy(dy, y, car_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y)");
    check_cuda(cudaMemcpy(dpx, px, person_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(px)");
    check_cuda(cudaMemcpy(dpy, py, person_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(py)");

    // One thread per matrix element; ceil-division so partial tiles are covered.
    dim3 dimGrid((n + B_S - 1) / B_S, (m + B_S - 1) / B_S);
    dim3 dimBlock(B_S, B_S);
    distance_gpu<<<dimGrid, dimBlock>>>(dx, dy, dpx, dpy, dd, m, n);
    // A kernel launch returns no status itself; configuration errors surface here.
    check_cuda(cudaGetLastError(), "distance_gpu launch");

    // The blocking copy also reports faults raised while the kernel was running.
    check_cuda(cudaMemcpy(distance, dd, matrix_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(distance)");

    // End of the timed region.
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    check_cuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("the time on gpu is %f ms\n", elapsedTime);

    check_cuda(cudaFree(dx), "cudaFree(dx)");
    check_cuda(cudaFree(dy), "cudaFree(dy)");
    check_cuda(cudaFree(dpx), "cudaFree(dpx)");
    check_cuda(cudaFree(dpy), "cudaFree(dpy)");
    check_cuda(cudaFree(dd), "cudaFree(dd)");
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
}
// Runs the CPU reference computation tile by tile and prints the elapsed time.
//
// The M-by-N matrix is split into tile_y by tile_x tiles. For each tile the
// person and car coordinates it needs are copied into local buffers, then
// distance_cpu fills that tile in place inside `distance`.
//
//   x, y      car coordinates, N entries each
//   px, py    person coordinates, M entries each
//   distance  output buffer for M*N floats (row-major, row stride N)
//   m, n      not used; the tile sizes are derived from the M and N macros
//
// Note: the tile sizes use integer division, so if M or N is not a multiple
// of tile_y / tile_x the trailing rows / columns are not computed.
void compute_cpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
{
    (void)m; // unused, see the header comment
    (void)n;

    const int stride_x = N / tile_x; // cars per tile (tile width)
    const int stride_y = M / tile_y; // persons per tile (tile height)
    float x_l[stride_x], y_l[stride_x], px_l[stride_y], py_l[stride_y];

    clock_t start, finish;
    start = clock();
    for (int p = 0; p < tile_y; p++)
    {
        // Stage the person coordinates of tile row p.
        for (int j = 0; j < stride_y; j++)
        {
            px_l[j] = px[p * stride_y + j];
            py_l[j] = py[p * stride_y + j];
        }
        for (int q = 0; q < tile_x; q++)
        {
            // Stage the car coordinates of tile column q.
            for (int i = 0; i < stride_x; i++)
            {
                x_l[i] = x[q * stride_x + i];
                y_l[i] = y[q * stride_x + i];
            }
            // Top-left element of tile (p, q) inside the full matrix.
            float *distance_l = distance + p * N * stride_y + q * stride_x;
            // distance_cpu expects (rows = persons, columns = cars).
            distance_cpu(x_l, y_l, px_l, py_l, distance_l, stride_y, stride_x);
        }
    }
    finish = clock();
    // clock() counts ticks; convert to milliseconds via CLOCKS_PER_SEC.
    printf("the time on cpu is %f ms\n", 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC);
}
// Compares two m-by-n row-major matrices element by element.
//
//   C1, C2  matrices to compare, m*n floats each, row stride n
//   m, n    number of rows / columns
//
// Prints an error message once and stops at the first pair of elements whose
// absolute difference exceeds 1e-5. Prints nothing when the matrices agree.
void verify(float *C1, float *C2, int m, int n)
{
    const double tolerance = 1e-5;
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            // Both matrices share the same layout, so they share the same index.
            const size_t idx = (size_t)i * (size_t)n + (size_t)j;
            // Absolute difference: a mismatch in either direction must be reported.
            if (fabsf(C2[idx] - C1[idx]) > tolerance)
            {
                printf("error! results are not equel!");
                return; // one report is enough; leave both loops
            }
        }
    }
}
// Generates random integer coordinates in [0, 9] for N cars and M persons,
// computes the M-by-N distance matrix on the CPU and on the GPU, and compares
// the two results. With SHOW defined the inputs and both matrices are printed.
// Returns 0 on completion and 1 if a host allocation fails.
int main()
{
    // Compute the element count in size_t so M*N cannot overflow int.
    const size_t matrix_elems = (size_t)M * (size_t)N;

    float* px = (float*)malloc(M*sizeof(float));
    float* py = (float*)malloc(M*sizeof(float));
    float* x = (float*)malloc(N*sizeof(float));
    float* y = (float*)malloc(N*sizeof(float));
    float* distance1 = (float*)malloc(matrix_elems*sizeof(float));
    float* distance2 = (float*)malloc(matrix_elems*sizeof(float));
    if (!px || !py || !x || !y || !distance1 || !distance2)
    {
        fprintf(stderr, "error! host memory allocation failed\n");
        // free(NULL) is a no-op, so every pointer can be released unconditionally.
        free(x);
        free(y);
        free(px);
        free(py);
        free(distance1);
        free(distance2);
        return 1;
    }

    // Car coordinates.
    for (int i = 0; i<N; i++)
    {
        x[i] = rand() % 10;
        y[i] = rand() % 10;
#ifdef SHOW
        std::cout << " (" << x[i] << "," << y[i] << ")";
#endif // SHOW
    }
    // Person coordinates.
    for (int i = 0; i<M; i++)
    {
        px[i] = rand() % 10;
        py[i] = rand() % 10;
#ifdef SHOW
        std::cout << std::endl << "(" << px[i] << "," << py[i] << ")" << std::endl;
#endif // SHOW
    }

    // CPU reference result.
    compute_cpu(x, y, px, py, distance1, M, N);
#ifdef SHOW
    for (int i = 0; i< M; i++)
    {
        for (int j = 0; j< N; j++)
            std::cout << distance1[i*N + j] << " ";
        std::cout << std::endl;
    }
#endif // SHOW

    // GPU result.
    compute_gpu(x, y, px, py, distance2, M, N);
#ifdef SHOW
    for (int i = 0; i< M; i++)
    {
        for (int j = 0; j< N; j++)
            std::cout << distance2[i*N + j] << " ";
        std::cout << std::endl;
    }
#endif // SHOW

    verify(distance1, distance2, M, N);

    free(x);
    free(y);
    free(px);
    free(py);
    free(distance1);
    free(distance2);
    return 0;
}
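// Distances between multiple persons and multiple cars: CPU and GPU versions, register optimization, shared-memory optimization.
// (Web page residue: "latest recommended article, published 2024-05-14 15:08:05".)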