// 1. 得到原子加 全1 的结果 -- atomic add; expected result: every element ends up as 1
/*
 * Atomically adds 1 to c[g] for every thread whose global x index g is
 * below 48, and records the launch's block dimensions.
 *
 * c : device array of at least 48 ints.
 * x : device int; receives blockDim.x.
 * y : device int; receives blockDim.y.
 *
 * Only the x components of blockIdx/threadIdx are used to build the index,
 * so a 1D launch is what gives each element exactly one increment.
 * (An earlier experiment that filled c[0..15] with 10 was disabled here.)
 */
__global__ void addKernel( int *c,int *x,int *y)
{
    const unsigned int tid_in_x = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid_in_x < 48) {
        atomicAdd(&c[tid_in_x], 1);
    }

    // Every thread stores the same value, so the result does not depend on
    // which store lands last.
    *x = blockDim.x;
    *y = blockDim.y;
}
/*
 * Host driver for experiment 1: zeroes a 48-element device array, launches
 * addKernel on 4 blocks of 12x1 threads, then prints the array followed by
 * the block dimensions the kernel recorded.
 *
 * On any CUDA failure an error is printed to stderr and nothing else is
 * printed. All host and device allocations are released before returning.
 */
void zong(){
    const int l = 4 * 12;                     // element count = blocks * threads per block
    const size_t bytes = sizeof(int) * l;
    const dim3 block(12, 1);

    int *a = (int*)malloc(bytes);             // host copy of the result
    int *c = NULL;                            // device result array
    int *x = NULL, *y = NULL;                 // device slots for blockDim.x / blockDim.y
    int xc = 0, yc = 0;
    cudaError_t err = cudaSuccess;
    int i;

    if (a == NULL) {
        fprintf(stderr, "zong: host allocation failed\n");
        return;
    }
    memset(a, 0, bytes);

    // Each step runs only if everything before it succeeded.
    err = cudaMalloc((void**)&c, bytes);
    if (err == cudaSuccess) err = cudaMemset(c, 0, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&x, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(x, 0, sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&y, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(y, 0, sizeof(int));

    if (err == cudaSuccess) {
        addKernel<<<4, block>>>(c, x, y);
        err = cudaGetLastError();             // a launch does not return its own status
    }

    // Blocking copies: they also wait for the kernel to finish.
    if (err == cudaSuccess) err = cudaMemcpy(a, c, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&xc, x, sizeof(int), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&yc, y, sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (i = 0; i < l; i++) {
            printf("%d ", a[i]);
        }
        printf("\n %d ", xc);
        printf("\n %d ", yc);                 // block is (12,1), so this prints 12 and 1
    } else {
        fprintf(stderr, "zong: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so partial setups are cleaned up too.
    cudaFree(y);
    cudaFree(x);
    cudaFree(c);
    free(a);
}
// Entry point for experiment 1: runs the zong() demo once.
int main()
{
    zong();

    return 0;
}
#define W 12
#define H 4
/*
 * Atomically adds 1 to c[gx][gy], where (gx, gy) is the thread's global
 * 2D index.
 *
 * c : device array of device pointers. It is indexed first by the global
 *     x index (0..W-1) and then by the global y index (0..H-1), so it must
 *     hold at least W row pointers, each addressing at least H ints.
 *
 * Threads outside the W x H range do nothing; without this guard any launch
 * larger than W x H would read and write out of bounds.
 */
__global__ void addKernel( int **c)
{
    const int tid_in_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int tid_in_y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((tid_in_x < W) && (tid_in_y < H)) {
        atomicAdd(&c[tid_in_x][tid_in_y], 1);
    }
}
/*
 * Host driver for experiment 2: builds a device-side table of row pointers
 * for addKernel(int **), launches it over a W x H index space and prints
 * the result as H lines of W values.
 *
 * addKernel indexes its argument as c[x][y] with x in [0, W) and y in
 * [0, H), so the device data is laid out as W pitched rows of H ints and
 * the table holds one device pointer per row.
 *
 * Defects fixed relative to the earlier version:
 *  - the pitched allocation (plain int data) was passed as int **, so the
 *    kernel dereferenced zeroed memory as pointers;
 *  - "<<<(2,2),block>>>" is a comma expression equal to 2, i.e. a 1D grid
 *    of two blocks, not a 2x2 grid;
 *  - cudaMemcpy2D wrote the whole matrix into the host pointer table;
 *  - nothing was freed and no return code was checked.
 */
void zong(){
    const dim3 block(3, 4);
    // Ceil-divide so the grid covers exactly W x H threads: (4, 1) here.
    const dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y);
    const size_t rowBytes = sizeof(int) * H;  // bytes of payload in one device row

    int  *hostFlat = (int*)calloc((size_t)W * H, sizeof(int)); // contiguous host copy, W rows of H
    int **hostRows = (int**)malloc(sizeof(int*) * W);          // staging copy of the row table
    int  *devData  = NULL;                                     // pitched device storage
    int **devRows  = NULL;                                     // device row-pointer table
    size_t pitch = 0;
    cudaError_t err = cudaSuccess;
    int i, j;

    if (hostFlat == NULL || hostRows == NULL) {
        fprintf(stderr, "zong: host allocation failed\n");
        free(hostRows);
        free(hostFlat);
        return;
    }

    // Each step runs only if everything before it succeeded.
    err = cudaMallocPitch((void**)&devData, &pitch, rowBytes, W);
    if (err == cudaSuccess) err = cudaMemset2D(devData, pitch, 0, rowBytes, W);
    if (err == cudaSuccess) {
        // pitch is in bytes, so row starts are computed with byte arithmetic.
        for (i = 0; i < W; i++) {
            hostRows[i] = (int*)((char*)devData + (size_t)i * pitch);
        }
        err = cudaMalloc((void**)&devRows, sizeof(int*) * W);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(devRows, hostRows, sizeof(int*) * W, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        addKernel<<<grid, block>>>(devRows);
        err = cudaGetLastError();             // a launch does not return its own status
    }

    // Blocking copy: it also waits for the kernel to finish.
    if (err == cudaSuccess) {
        err = cudaMemcpy2D(hostFlat, rowBytes, devData, pitch, rowBytes, W,
                           cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        // Element (x = j, y = i) lives at hostFlat[j * H + i].
        for (i = 0; i < H; i++) {
            for (j = 0; j < W; j++) {
                printf("%d ", hostFlat[j * H + i]);
            }
            printf("\n");
        }
    } else {
        fprintf(stderr, "zong: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so partial setups are cleaned up too.
    cudaFree(devRows);
    cudaFree(devData);
    free(hostRows);
    free(hostFlat);
}
// Entry point for experiment 2: runs the zong() demo once.
int main()
{
    zong();

    return 0;
}
// 3. [title lost to a character-encoding error] -- cuowu (错误, "wrong"): marked by the author as giving a wrong result
#define W 12
#define H 4
/*
 * Atomically adds 1 to one element of a flat W*H array per in-range thread
 * and records the launch's block dimensions.
 *
 * c : device array of at least W*H ints. The thread with global 2D index
 *     (gx, gy) updates c[gx * H + gy].
 * x : device int; receives blockDim.x.
 * y : device int; receives blockDim.y.
 *
 * The guard tests each coordinate separately. Testing only the flattened
 * index (tid < W*H) would let a thread with gy >= H fall onto an element
 * that belongs to column gx + 1 and increment it a second time.
 */
__global__ void addKernel( int *c,int *x,int *y)
{
    const int tid_in_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int tid_in_y = blockIdx.y * blockDim.y + threadIdx.y;

    if (tid_in_x < W && tid_in_y < H) {
        const int tid = tid_in_x * H + tid_in_y;
        atomicAdd(&c[tid], 1);
    }

    // Every thread stores the same value, so the result does not depend on
    // which store lands last.
    *x = blockDim.x;
    *y = blockDim.y;
}
/*
 * Host driver for experiment 3: zeroes a 48-element device array, launches
 * addKernel with 3x4 blocks over a 12 x 4 index space, then prints the
 * array followed by the block dimensions the kernel recorded.
 *
 * The earlier launch was written "<<<(2,2),block>>>". "(2,2)" is a comma
 * expression equal to 2, so it launched a 1D grid of two blocks; the global
 * x index only reached 5 and only the first 24 elements were incremented.
 * The grid is now a dim3 computed from the problem size.
 *
 * On any CUDA failure an error is printed to stderr and nothing else is
 * printed. All host and device allocations are released before returning.
 */
void zong(){
    const int width = 12;                     // extent of the global x index
    const int height = 4;                     // extent of the global y index
    const int l = width * height;
    const size_t bytes = sizeof(int) * l;
    const dim3 block(3, 4);
    // Ceil-divide so every (x, y) in width x height gets a thread: (4, 1) here.
    const dim3 grid((width + block.x - 1) / block.x,
                    (height + block.y - 1) / block.y);

    int *a = (int*)malloc(bytes);             // host copy of the result
    int *c = NULL;                            // device result array
    int *x = NULL, *y = NULL;                 // device slots for blockDim.x / blockDim.y
    int xc = 0, yc = 0;
    cudaError_t err = cudaSuccess;
    int i;

    if (a == NULL) {
        fprintf(stderr, "zong: host allocation failed\n");
        return;
    }
    memset(a, 0, bytes);

    // Each step runs only if everything before it succeeded.
    err = cudaMalloc((void**)&c, bytes);
    if (err == cudaSuccess) err = cudaMemset(c, 0, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&x, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(x, 0, sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&y, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(y, 0, sizeof(int));

    if (err == cudaSuccess) {
        addKernel<<<grid, block>>>(c, x, y);
        err = cudaGetLastError();             // a launch does not return its own status
    }

    // Blocking copies: they also wait for the kernel to finish.
    if (err == cudaSuccess) err = cudaMemcpy(a, c, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&xc, x, sizeof(int), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&yc, y, sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (i = 0; i < l; i++) {
            printf("%d ", a[i]);
        }
        printf("\n %d ", xc);
        printf("\n %d ", yc);                 // block is (3,4), so this prints 3 and 4
    } else {
        fprintf(stderr, "zong: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so partial setups are cleaned up too.
    cudaFree(y);
    cudaFree(x);
    cudaFree(c);
    free(a);
}
// 4. Adapted from the (bookmarked) program "我人生的第一个程序" ("The first program of my life")
//*****************************************************************************************************
/* Original: dim3 Db(width,1, 1);  (gives the expected atomic-add result)   Changed (wrong?): dim3 Db(width/2, 2, 1);
******************************************************************************************************/
/*
 * Atomically adds 1 to element (row, col) of a pitched 2D int matrix for
 * every thread whose global index lies inside the matrix, and records the
 * launch's block dimensions.
 *
 * c      : base device pointer of the pitched matrix.
 * pitch  : byte distance between the starts of consecutive rows.
 * height : number of rows; the row index comes from the y components.
 * width  : number of columns; the column index comes from the x components.
 * x, y   : device ints; receive blockDim.x and blockDim.y.
 */
__global__ void myKernel(int *c, size_t pitch, int height, int width,int *x,int *y)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width) {
        // pitch is a byte count, so step to the row in bytes before
        // indexing ints; this avoids dividing the pitch by sizeof(int).
        int *rowStart = (int*)((char*)c + (size_t)row * pitch);
        atomicAdd(&rowStart[col], 1);
    }

    // Every thread stores the same value, so the result does not depend on
    // which store lands last.
    *x = blockDim.x;
    *y = blockDim.y;
}
/*
 * Experiment 4: increments every element of a height x width pitched device
 * matrix with myKernel, copies it back and prints it, followed by the block
 * dimensions the kernel recorded.
 *
 * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
 * All host and device allocations are released on every path.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int width = 4;                      // matrix width (number of columns)
    const int height = 2;                     // matrix height (number of rows)
    const size_t rowBytes = sizeof(int) * width;

    int *cpu_C = NULL;                        // matrix on the CPU
    int *gpu_C = NULL;                        // matrix on the GPU (pitched)
    int *x = NULL, *y = NULL;                 // device slots for blockDim.x / blockDim.y
    int xc = 0, yc = 0;
    size_t pitch = 0;                         // row pitch of gpu_C, in bytes
    cudaError_t err = cudaSuccess;

    // Allocate and zero the host matrix.
    cpu_C = (int*)malloc(rowBytes * height);
    if (cpu_C == NULL) {
        fprintf(stderr, "main: host allocation failed\n");
        return 1;
    }
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            cpu_C[r * width + c] = 0;
        }
    }

    // Allocate the pitched device matrix and report its pitch.
    err = cudaMallocPitch((void**)&gpu_C, &pitch, rowBytes, height);
    if (err == cudaSuccess) {
        printf("The pitch is: %zu\n", pitch); // pitch is a size_t, so %zu rather than %d

        // Upload the zeroed host matrix.
        err = cudaMemcpy2D(gpu_C, pitch, cpu_C, rowBytes, rowBytes, height,
                           cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) err = cudaMalloc((void**)&x, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(x, 0, sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&y, sizeof(int));
    if (err == cudaSuccess) err = cudaMemset(y, 0, sizeof(int));

    if (err == cudaSuccess) {
        // Block shape. The grid is derived from it by ceil-division, so a
        // different block shape such as Db(width/2, 2, 1) still covers the
        // whole matrix. With a fixed Dg(1, 2, 1) that shape only reached
        // columns 0..width/2-1, which is why it looked wrong.
        dim3 Db(width, 1, 1);
        dim3 Dg((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y, 1);

        myKernel<<<Dg, Db, 0>>>(gpu_C, pitch, height, width, x, y);
        err = cudaGetLastError();             // a launch does not return its own status
    }

    // Blocking copies: they also wait for the kernel to finish.
    if (err == cudaSuccess) {
        err = cudaMemcpy2D(cpu_C, rowBytes, gpu_C, pitch, rowBytes, height,
                           cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess) err = cudaMemcpy(&xc, x, sizeof(int), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(&yc, y, sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print the matrix, then the recorded block dimensions.
        printf("\nAfter change CPU_C DATA\n");
        for (int r = 0; r < height; ++r) {
            for (int c = 0; c < width; ++c) {
                printf("%d\t", cpu_C[r * width + c]);
            }
            printf("\n");
        }
        printf("\n ceshi: ");
        printf("\n %d ", xc);
        printf("\n %d ", yc);                 // block is (width,1,1), so this prints 4 and 1
    } else {
        fprintf(stderr, "main: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Release host and device memory; cudaFree accepts a null pointer.
    cudaFree(y);
    cudaFree(x);
    cudaFree(gpu_C);
    free(cpu_C);

    return (err == cudaSuccess) ? 0 : 1;
}