#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
// Forward declaration of the kernel so that main() can launch it before the
// definition that appears further down in this file.
__global__ void func1(int* block,int* thread,int* warp,int* calc_thread);
// Number of int elements in each of the host and device result buffers.
#define ARRAY_SIZE 128
// Size in bytes of one result buffer (ARRAY_SIZE ints).
#define ARRAY_SIZE_IN_BYTES (sizeof(int)*(ARRAY_SIZE))
//int cpu_block[ARRAY_SIZE];
//int cpu_thread[ARRAY_SIZE];
//int cpu_warp[ARRAY_SIZE];
//int cpu_calc_thread[ARRAY_SIZE];
// Host-side result buffers. main() copies the kernel's output into these and
// prints them; element i holds the values written by the thread whose global
// index is i.
// Block index (blockIdx.x) of each thread.
int cpu_block[ARRAY_SIZE];
// Thread index within its block (threadIdx.x).
int cpu_thread[ARRAY_SIZE];
// Warp index within the block (threadIdx.x / warpSize).
int cpu_warp[ARRAY_SIZE];
// Global thread index computed by the kernel.
int cpu_calc_thread[ARRAY_SIZE];
// Aborts the program with a file/line diagnostic when a CUDA runtime call
// does not return cudaSuccess. Guarded so an earlier definition in the same
// translation unit is reused instead of redefined.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_check_err_));         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Program entry point.
 *
 * Allocates four device buffers of ARRAY_SIZE ints, launches func1 with one
 * thread per element, copies the results into the cpu_* host arrays and
 * prints one line per element. Waits for a character on stdin before
 * returning so the console window stays open.
 *
 * Returns 0 on success. Any failing CUDA call terminates the process with
 * EXIT_FAILURE after printing the error to stderr.
 */
int main()
{
    const int num_threads = 64;
    // Derive the grid size from ARRAY_SIZE so the launch always covers the
    // buffers exactly instead of relying on two unrelated constants.
    const int num_blocks = ARRAY_SIZE / num_threads;
    if (num_blocks * num_threads != ARRAY_SIZE)
    {
        fprintf(stderr, "ARRAY_SIZE (%d) must be a non-zero multiple of the block size (%d)\n",
                ARRAY_SIZE, num_threads);
        return EXIT_FAILURE;
    }

    int* gpu_block = NULL;
    int* gpu_thread = NULL;
    int* gpu_warp = NULL;
    int* gpu_calc_thread = NULL;

    CUDA_CHECK(cudaMalloc((void**)&gpu_block, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_warp, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));

    func1<<<num_blocks, num_threads>>>(gpu_block, gpu_thread, gpu_warp, gpu_calc_thread);
    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution faults by the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(gpu_block));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_warp));
    CUDA_CHECK(cudaFree(gpu_calc_thread));

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("Calculated Thread: %d - Block: %d - Warp %d -Thread %d\n",
               cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);
    }

    // Block until the user presses Enter so the output can be read.
    cin.get();
    return 0;
}
/*
 * Records identifying indices for every launched thread.
 *
 * Expects a one-dimensional launch (only the .x components of blockIdx,
 * blockDim and threadIdx are used). Each thread computes its global index i
 * and writes, at position i of the respective buffer:
 *   block[i]       - blockIdx.x
 *   thread[i]      - threadIdx.x
 *   warp[i]        - threadIdx.x / warpSize (warp index within the block)
 *   calc_thread[i] - i itself
 *
 * Every buffer must be a device pointer with room for at least ARRAY_SIZE
 * ints. Threads whose global index is ARRAY_SIZE or greater do nothing.
 * Uses no shared memory.
 */
__global__ void func1(int* block, int* thread, int* warp, int* calc_thread)
{
    const int i = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Guard the grid tail: a launch with more threads than ARRAY_SIZE would
    // otherwise write past the end of the buffers.
    if (i >= ARRAY_SIZE)
    {
        return;
    }

    block[i] = blockIdx.x;
    thread[i] = threadIdx.x;
    warp[i] = threadIdx.x / warpSize;
    calc_thread[i] = i;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
// Forward declaration of the kernel so that main() can launch it before the
// definition that appears further down in this file.
__global__ void func1(int* block,int* thread,int* warp,int* calc_thread);
// Number of int elements in each of the host and device result buffers.
#define ARRAY_SIZE 128
// Size in bytes of one result buffer (ARRAY_SIZE ints).
#define ARRAY_SIZE_IN_BYTES (sizeof(int)*(ARRAY_SIZE))
//int cpu_block[ARRAY_SIZE];
//int cpu_thread[ARRAY_SIZE];
//int cpu_warp[ARRAY_SIZE];
//int cpu_calc_thread[ARRAY_SIZE];
// Host-side result buffers. main() copies the kernel's output into these and
// prints them; element i holds the values written by the thread whose global
// index is i.
// Block index (blockIdx.x) of each thread.
int cpu_block[ARRAY_SIZE];
// Thread index within its block (threadIdx.x).
int cpu_thread[ARRAY_SIZE];
// Warp index within the block (threadIdx.x / warpSize).
int cpu_warp[ARRAY_SIZE];
// Global thread index computed by the kernel.
int cpu_calc_thread[ARRAY_SIZE];
// Aborts the program with a file/line diagnostic when a CUDA runtime call
// does not return cudaSuccess. Guarded so an earlier definition in the same
// translation unit is reused instead of redefined.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_check_err_));         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Program entry point.
 *
 * Allocates four device buffers of ARRAY_SIZE ints, launches func1 with one
 * thread per element, copies the results into the cpu_* host arrays and
 * prints one line per element. Waits for a character on stdin before
 * returning so the console window stays open.
 *
 * Returns 0 on success. Any failing CUDA call terminates the process with
 * EXIT_FAILURE after printing the error to stderr.
 */
int main()
{
    const int num_threads = 64;
    // Derive the grid size from ARRAY_SIZE so the launch always covers the
    // buffers exactly instead of relying on two unrelated constants.
    const int num_blocks = ARRAY_SIZE / num_threads;
    if (num_blocks * num_threads != ARRAY_SIZE)
    {
        fprintf(stderr, "ARRAY_SIZE (%d) must be a non-zero multiple of the block size (%d)\n",
                ARRAY_SIZE, num_threads);
        return EXIT_FAILURE;
    }

    int* gpu_block = NULL;
    int* gpu_thread = NULL;
    int* gpu_warp = NULL;
    int* gpu_calc_thread = NULL;

    CUDA_CHECK(cudaMalloc((void**)&gpu_block, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_warp, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));

    func1<<<num_blocks, num_threads>>>(gpu_block, gpu_thread, gpu_warp, gpu_calc_thread);
    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution faults by the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(gpu_block));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_warp));
    CUDA_CHECK(cudaFree(gpu_calc_thread));

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("Calculated Thread: %d - Block: %d - Warp %d -Thread %d\n",
               cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);
    }

    // Block until the user presses Enter so the output can be read.
    cin.get();
    return 0;
}
/*
 * Records identifying indices for every launched thread.
 *
 * Expects a one-dimensional launch (only the .x components of blockIdx,
 * blockDim and threadIdx are used). Each thread computes its global index i
 * and writes, at position i of the respective buffer:
 *   block[i]       - blockIdx.x
 *   thread[i]      - threadIdx.x
 *   warp[i]        - threadIdx.x / warpSize (warp index within the block)
 *   calc_thread[i] - i itself
 *
 * Every buffer must be a device pointer with room for at least ARRAY_SIZE
 * ints. Threads whose global index is ARRAY_SIZE or greater do nothing.
 * Uses no shared memory.
 */
__global__ void func1(int* block, int* thread, int* warp, int* calc_thread)
{
    const int i = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Guard the grid tail: a launch with more threads than ARRAY_SIZE would
    // otherwise write past the end of the buffers.
    if (i >= ARRAY_SIZE)
    {
        return;
    }

    block[i] = blockIdx.x;
    thread[i] = threadIdx.x;
    warp[i] = threadIdx.x / warpSize;
    calc_thread[i] = i;
}