#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
/*
 * Forward declaration of the thread-identification kernel; the definition
 * appears after main().
 *
 * Every parameter is a device output buffer of int. The kernel indexes each
 * buffer by a flat, row-major thread index computed from the 2D launch
 * configuration and stores one value per thread (block index, thread index,
 * global x/y position, grid and block dimensions). `warp` is accepted but
 * the definition in this file never writes to it.
 */
__global__ void func2(
int * block_x,
int * block_y,
int * thread,
int * warp,
int * cale_thread,
int * xthread,
int * ythread,
int * grid_dimx,
int * grid_dimy,
int * block_dimx,
int * block_dimy);
/* Logical size of the thread grid covered by each launch: 32 columns x 16 rows. */
#define ARRAY_SIZE_X 32
#define ARRAY_SIZE_Y 16
#define ARRAY_SIZE_IN_BYTES ((ARRAY_SIZE_X)*(ARRAY_SIZE_Y)*(sizeof(int)))

/*
 * Host-side mirrors of the device result buffers.
 *
 * The kernel stores its results row-major with ARRAY_SIZE_X elements per row,
 * and main() reads them back as array[y][x] with y < ARRAY_SIZE_Y and
 * x < ARRAY_SIZE_X. The row (Y) extent therefore has to be the outer dimension
 * and the column (X) extent the inner one; declaring them the other way round
 * makes [y][x] overrun the inner dimension and read the wrong elements.
 */
int cpu_block_x[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_block_y[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_warp[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_cale_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_xthread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_ythread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_grid_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_grid_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_block_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
int cpu_block_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];
/* Aborts the program with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/* Blocking device-to-host copy of one ARRAY_SIZE_IN_BYTES result buffer. */
static void fetch_result(void *host_dst, const int *gpu_src, const char *what)
{
    check_cuda(cudaMemcpy(host_dst, gpu_src, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost), what);
}

/*
 * Launches func2 twice over the same 32 x 16 thread grid, once with a
 * "rectangular" and once with a "square" block layout, and prints what every
 * thread recorded about its own position.
 *
 * Returns 0 on success; exits with status 1 if any CUDA call fails.
 */
int main(void)
{
    /* Both layouts cover ARRAY_SIZE_X x ARRAY_SIZE_Y threads in total:
     *   rectangular: 1 x 4 blocks of 32 x 4 threads
     *   square     : 2 x 2 blocks of 16 x 8 threads */
    const dim3 threads_rect(32, 4);
    const dim3 blocks_rect(1, 4);
    const dim3 threads_square(16, 8);
    const dim3 blocks_square(2, 2);

    int *gpu_block_x = NULL;
    int *gpu_block_y = NULL;
    int *gpu_thread = NULL;
    int *gpu_warp = NULL;        /* passed to func2, which never writes it */
    int *gpu_cale_thread = NULL;
    int *gpu_xthread = NULL;
    int *gpu_ythread = NULL;
    int *gpu_grid_dimx = NULL;
    int *gpu_grid_dimy = NULL;
    int *gpu_block_dimx = NULL;
    int *gpu_block_dimy = NULL;

    /* One table so that allocation and release cannot drift apart. */
    int **const device_buffers[] = {
        &gpu_block_x, &gpu_block_y, &gpu_thread, &gpu_warp,
        &gpu_cale_thread, &gpu_xthread, &gpu_ythread,
        &gpu_grid_dimx, &gpu_grid_dimy, &gpu_block_dimx, &gpu_block_dimy
    };
    const int buffer_count = (int)(sizeof(device_buffers) / sizeof(device_buffers[0]));

    for (int i = 0; i < buffer_count; i++)
    {
        check_cuda(cudaMalloc((void **)device_buffers[i], ARRAY_SIZE_IN_BYTES), "cudaMalloc");
    }

    /* Flat views of the host arrays. The kernel writes row-major with
     * ARRAY_SIZE_X elements per row, so reading element (x, y) as
     * flat[y * ARRAY_SIZE_X + x] is correct regardless of how the 2D host
     * arrays happen to be dimensioned. */
    const int *const host_cale_thread = &cpu_cale_thread[0][0];
    const int *const host_block_x = &cpu_block_x[0][0];
    const int *const host_block_y = &cpu_block_y[0][0];
    const int *const host_thread = &cpu_thread[0][0];
    const int *const host_ythread = &cpu_ythread[0][0];
    const int *const host_xthread = &cpu_xthread[0][0];
    const int *const host_grid_dimx = &cpu_grid_dimx[0][0];
    const int *const host_block_dimx = &cpu_block_dimx[0][0];
    const int *const host_grid_dimy = &cpu_grid_dimy[0][0];
    const int *const host_block_dimy = &cpu_block_dimy[0][0];

    for (int kernel = 0; kernel < 2; kernel++)
    {
        /* Pass 0 uses the rectangular layout, pass 1 the square one.
         * (Previously both passes launched the rectangular layout.) */
        const bool rectangular = (kernel == 0);
        const dim3 grid = rectangular ? blocks_rect : blocks_square;
        const dim3 block = rectangular ? threads_rect : threads_square;

        func2<<<grid, block>>>(gpu_block_x,
                               gpu_block_y,
                               gpu_thread,
                               gpu_warp,
                               gpu_cale_thread,
                               gpu_xthread,
                               gpu_ythread,
                               gpu_grid_dimx,
                               gpu_grid_dimy,
                               gpu_block_dimx,
                               gpu_block_dimy);
        /* Launch-configuration errors are only reported here; execution
         * errors surface at the blocking copies below. */
        check_cuda(cudaGetLastError(), "func2 launch");

        fetch_result(cpu_block_x, gpu_block_x, "copy block_x");
        fetch_result(cpu_block_y, gpu_block_y, "copy block_y");
        fetch_result(cpu_thread, gpu_thread, "copy thread");
        fetch_result(cpu_cale_thread, gpu_cale_thread, "copy cale_thread");
        fetch_result(cpu_xthread, gpu_xthread, "copy xthread");
        fetch_result(cpu_ythread, gpu_ythread, "copy ythread");
        fetch_result(cpu_grid_dimx, gpu_grid_dimx, "copy grid_dimx");
        fetch_result(cpu_grid_dimy, gpu_grid_dimy, "copy grid_dimy");
        fetch_result(cpu_block_dimx, gpu_block_dimx, "copy block_dimx");
        fetch_result(cpu_block_dimy, gpu_block_dimy, "copy block_dimy");

        printf("\nkernel %d\n", kernel);
        for (int y = 0; y < ARRAY_SIZE_Y; y++)
        {
            for (int x = 0; x < ARRAY_SIZE_X; x++)
            {
                const int i = (y * ARRAY_SIZE_X) + x;
                printf("CT:%2u BKX: %1u BKY: %1u TID: %2u YTID: %2u XTID: %2u GDX: %1u BDX: %1u GDY %1u BDY %1u\n",
                       host_cale_thread[i],
                       host_block_x[i],
                       host_block_y[i],
                       host_thread[i],
                       host_ythread[i],
                       host_xthread[i],
                       host_grid_dimx[i],
                       host_block_dimx[i],
                       host_grid_dimy[i],
                       host_block_dimy[i]);
            }
        }
        printf("press any key to continue\n");
        cin.get();
    }

    /* Release the device buffers allocated above. */
    for (int i = 0; i < buffer_count; i++)
    {
        check_cuda(cudaFree(*device_buffers[i]), "cudaFree");
    }
    return 0;
}
/*
 * func2 - each thread of a 2D launch records where it sits in the grid.
 *
 * Expects a 2D grid of 2D blocks. Every output buffer is indexed by the
 * thread's flat row-major position, so each buffer must hold at least
 * (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y) ints; no bounds check
 * is performed here.
 *
 *   block_x, block_y       <- blockIdx.x, blockIdx.y
 *   thread                 <- threadIdx.x
 *   warp                   <- not written by this kernel
 *   cale_thread            <- flat row-major index of the thread
 *   xthread, ythread       <- global x / y position of the thread
 *   grid_dimx, grid_dimy   <- gridDim.x, gridDim.y
 *   block_dimx, block_dimy <- blockDim.x, blockDim.y
 */
__global__ void func2(
    int * block_x,
    int * block_y,
    int * thread,
    int * warp,
    int * cale_thread,
    int * xthread,
    int * ythread,
    int * grid_dimx,
    int * grid_dimy,
    int * block_dimx,
    int * block_dimy)
{
    // Position of this thread within the whole grid.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // One grid row spans gridDim.x * blockDim.x threads.
    const unsigned int row_pitch = gridDim.x * blockDim.x;
    const int slot = row_pitch * row + col;

    // Where the thread ended up.
    cale_thread[slot] = slot;
    xthread[slot] = col;
    ythread[slot] = row;

    // Which block / lane it is.
    block_x[slot] = blockIdx.x;
    block_y[slot] = blockIdx.y;
    thread[slot] = threadIdx.x;

    // Launch configuration as seen from the device.
    grid_dimx[slot] = gridDim.x;
    grid_dimy[slot] = gridDim.y;
    block_dimx[slot] = blockDim.x;
    block_dimy[slot] = blockDim.y;
}
// NOTE: a second, byte-for-byte copy of the includes, declarations, main()
// and func2() above used to follow here. It redefined every symbol of this
// translation unit (a hard compile error), so the duplicate was removed.