对大小为 10 万(100000 个 double)的数组重复赋值 1 万(10000)次,
分别用 GPU 多线程(kernel)和 GPU 单线程(kernel2)实现。
运行结果如下:
1begin
1 main Time to generate: 50.0 ms
2begin
2 main Time to generate: 117630.0 ms
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Launch-configuration limits used when sizing kernel launches.
// NOTE(review): the values look like hard-coded device limits; confirm them
// against cudaGetDeviceProperties() for the GPU actually targeted.

// Upper bound that main() applies to the number of blocks in a launch.
#define MAX_BLOCKS_PER_GRID 65535
// Not referenced in the visible part of this file.
#define MAX_BLOCK_ROWS 255
// Not referenced in the visible part of this file.
#define MAX_BLOCK_COLS 255
// Block size main() starts from (it is reduced when the array is smaller).
#define MAX_THREADS_PER_BLOCK 1024
// Not referenced in the visible part of this file.
#define MAX_THREAD_ROWS 32
// Not referenced in the visible part of this file.
// NOTE(review): the lower-case 'h' in the name looks like a typo; the spelling
// is kept because renaming it could break code outside this view.
#define MAX_ThREAD_COLS 32
// Parallel fill: each launched thread stores 1 into the element whose index
// equals its global thread id (blockIdx.x * blockDim.x + threadIdx.x).
// Only the x dimension of the grid and block is used for indexing.
//
// dev_array  - buffer to fill; it is dereferenced on the device.
// array_size - number of valid elements. Threads whose id is >= array_size do
//              nothing. A negative value (the default) disables the check so
//              that existing single-argument launches behave exactly as
//              before; in that case the buffer must hold at least
//              gridDim.x * blockDim.x elements.
__global__ void kernel(double* dev_array, int array_size = -1)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // A grid rounded up to whole blocks can contain threads past the end of
    // the array; those threads must not write.
    if (array_size < 0 || tid < array_size)
    {
        dev_array[tid] = 1;
        //dev_array[0] = 1;
    }
}
// Serial fill used as the single-thread baseline: the executing thread walks
// the buffer front to back and stores 1 into each of the first array_size
// elements. Every thread that runs this kernel repeats the whole walk; main()
// launches it as <<<1,1>>>.
//
// dev_array  - buffer to fill; it is dereferenced on the device.
// array_size - number of elements to write; nothing is written when it is <= 0.
__global__ void kernel2(double* dev_array,int array_size)
{
    int idx = 0;
    while (idx < array_size)
    {
        dev_array[idx] = 1;
        //dev_array[0] = 1;
        ++idx;
    }
}
// Prints a diagnostic and terminates the program when a CUDA runtime call did
// not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Converts a pair of clock() samples into elapsed milliseconds.
static float elapsedMs(clock_t start, clock_t stop)
{
    return (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f;
}

// Benchmark driver. Fills a device array of array_size doubles `rounds` times
// in two ways and prints the time each variant took:
//   1: kernel  launched with one thread per element,
//   2: kernel2 launched as a single thread that loops over the whole array.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    const int rounds = 10000;
    const int array_size = 100000;
    clock_t start,stop;
    float elapsedTime;
    /*
    Disabled host (CPU) baseline, kept for reference:
    double* array = new double[array_size];
    printf("0begin\n");
    start = clock();
    for(int i=0;i<rounds;i++)
    {
    for(int i=0;i<array_size;i++)
    {
    array[i] = 1;
    }
    }
    stop= clock();
    elapsedTime = (float)(stop - start) /
    (float)CLOCKS_PER_SEC * 1000.0f;
    printf( "0 main Time to generate: %3.1f ms\n", elapsedTime );
    delete []array;
    */

    // Launch geometry for the parallel variant: ceil-divide the array over
    // blocks of at most MAX_THREADS_PER_BLOCK threads.
    int threads = MAX_THREADS_PER_BLOCK;
    if (threads > array_size) threads = array_size;
    int blocks = (array_size + threads - 1) / threads;
    if (blocks > MAX_BLOCKS_PER_GRID) blocks = MAX_BLOCKS_PER_GRID;

    // kernel is launched with only the pointer, so each of the
    // blocks * threads launched threads stores one element. Rounding up to
    // whole blocks gives 98 * 1024 = 100352 threads for 100000 elements, so
    // the buffer must cover every launched thread or the tail threads write
    // past the end of the allocation.
    const size_t launched = (size_t)blocks * (size_t)threads;
    const size_t alloc_elems =
        launched > (size_t)array_size ? launched : (size_t)array_size;

    double* dev_array = NULL;
    checkCuda(cudaMalloc(&dev_array, alloc_elems * sizeof(double)), "cudaMalloc");

    // --- Variant 1: one thread per element ---
    printf("1begin\n");
    start = clock();
    for (int i = 0; i < rounds; i++)
    {
        kernel<<<blocks,threads>>>(dev_array);
    }
    // Launches do not return a status; a bad configuration is reported here.
    checkCuda(cudaGetLastError(), "kernel launch");
    // Launches are asynchronous: wait for all queued work before sampling the
    // clock, and surface any error raised while the kernels ran.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernel)");
    stop = clock();
    elapsedTime = elapsedMs(start, stop);
    printf( "1 main Time to generate: %3.1f ms\n", elapsedTime );

    // --- Variant 2: a single thread loops over the whole array ---
    printf("2begin\n");
    start = clock();
    for (int i = 0; i < rounds; i++)
    {
        kernel2<<<1,1>>>(dev_array,array_size);
    }
    checkCuda(cudaGetLastError(), "kernel2 launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernel2)");
    stop = clock();
    elapsedTime = elapsedMs(start, stop);
    printf( "2 main Time to generate: %3.1f ms\n", elapsedTime );

    checkCuda(cudaFree(dev_array), "cudaFree");
    return 0;
}