// Test summation (work in progress)

// Version 1 (verified correct) — begin

#include "h.h"
#define  G 4 // number of gray levels (G x G co-occurrence matrix)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Test summation
// Feature 1: energy (angular second moment) of the normalized matrix.
// Each entry of the G x G matrix is divided by `total`, and the squares
// of the normalized values are accumulated.
float ComputeFeature1( int* pdMatrix,int total)
{
 float energy = 0.0f;
 for (int row = 0; row < G; row++)
 {
  for (int col = 0; col < G; col++)
  {
   float norm = (float)pdMatrix[row * G + col] / (float)total;
   energy += norm * norm;
  }
 }
 return energy;
}
// Device version of feature 1: *sum1 = sum over the G x G matrix of
// (pdMatrix[k] / *total)^2.
// Launch with ONE block of exactly G x G (4x4) threads; width/height are
// kept for interface compatibility but unused.  (p.186)
// Fixes vs. original: removed a stray `DUI` token that broke compilation,
// removed the redundant zero-initialization of shared memory, used one
// consistent row-major index everywhere (the original wrote the transposed
// shared-memory slot, which only worked because the final sum is
// order-independent), and restricted the final write to thread 0.
__global__  static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total)
{
 __shared__ float stemp1[G*G];
 // Flat thread id within the 4x4 block (row-major, matches pdMatrix layout).
 int tid = threadIdx.y * G + threadIdx.x;

 if (threadIdx.x < G && threadIdx.y < G)
 {
  // Normalize the element by the matrix total, then square it in place.
  float t = (float)pdMatrix[tid] / ((float)(*total));
  stemp1[tid] = t * t;
 }
 __syncthreads();

 // Tree reduction of the G*G squared values in shared memory.
 // The barrier stays OUTSIDE the divergent `tid < i` branch so every
 // thread of the block reaches it the same number of times.
 for (int i = G*G/2; i > 0; i /= 2)
 {
  if (tid < i)
  {
   stemp1[tid] += stemp1[tid + i];
  }
  __syncthreads();
 }

 // Only one thread publishes the result (the original had every thread
 // store the same value — benign but wasteful).
 if (tid == 0)
 {
  *sum1 = stemp1[0];
 }
}
// Driver: computes feature 1 on the CPU as a reference, then on the GPU
// with a single 4x4 block, and prints both (they should match closely).
// Fixes vs. original: `total` is allocated once (it was cudaMalloc'ed
// twice, leaking the first allocation), the host buffer is malloc'ed with
// sizeof(int) to match its element type, the device is synchronized
// before the result is read, and all allocations are released.
int main()
{
 dim3 block(4,4);
 int width = 4, height = 4;
 int t = height * width;
 int *p = (int *)malloc(t * sizeof(int));   // was sizeof(float); elements are int
 int *pdMatrix = NULL;   // device copy of the matrix
 int *total = NULL;      // device copy of the element sum
 float *sum1 = NULL;     // device scalar result
 float sum2 = 0.0f;      // host copy of the kernel result
 int tot = 0;
 float f1 = 0.0f;
 cudaError_t cudaStatus = cudaSuccess;

 for (int i = 0; i < t; i++)
 {
  p[i] = i;
  tot += p[i];
 }

 // CPU reference result.
 f1 = ComputeFeature1(p, tot);
 printf("%f\n", f1);

 // Device setup. `total` is allocated exactly once.
 cudaMalloc((void**)&total, sizeof(int));
 cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
 cudaMalloc((void**)&pdMatrix, t * sizeof(int));
 cudaMalloc((void**)&sum1, sizeof(float));
 cudaMemset(sum1, 0, sizeof(float));
 cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

 ComputeFeature1<<<1,block>>>(sum1, pdMatrix, width, height, total);
 cudaStatus = cudaGetLastError();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto Error;
 }
 // Wait for the kernel to finish BEFORE reading its result.
 cudaStatus = cudaDeviceSynchronize();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  goto Error;
 }
 cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
 printf("%f\n", sum2);

 Error:
 // Release everything on both success and failure paths.
 cudaFree(total);
 cudaFree(pdMatrix);
 cudaFree(sum1);
 free(p);
 return cudaStatus;
}

 

 

// Version 1 (verified correct) — end.
// NOTE: the program below redefines ComputeFeature1 and main; the two
// halves of this file must be compiled as separate translation units.

 

#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Test summation

// Experimental variant launched over a 2x2 grid of 2x2 blocks: reduces the
// whole matrix in global memory to *sum1, then normalizes and squares this
// block's staged elements in shared memory.  g_f1 is only used by the
// commented-out per-block output path that was removed; the parameter is
// kept for interface compatibility.  (p.186)
// Fixes vs. original: `/if(sum1!=0)` (invalid token, compile error) restored
// as a real guard, and `stemp1 / sum1` (float divided by a POINTER, compile
// error) corrected to divide by the dereferenced sum *sum1.
// WARNING(review): __syncthreads() only synchronizes within one block, but
// the reduction below spans the whole grid — blocks race on pdMatrix, so
// *sum1 and the normalized values are not reliable across blocks.  A
// grid-wide reduction needs a second kernel launch or cooperative groups.
__global__  static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1)
{
 __shared__ float stemp1[G][G];
 unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
 unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
 unsigned int index = yIndex * width + xIndex;  // flat global element index

 if (threadIdx.x < G && threadIdx.y < G)
 {
  stemp1[threadIdx.x][threadIdx.y] = 0.0f;
 }
 __syncthreads();

 // Stage this block's elements into shared memory.
 if ((index < width * height) && (threadIdx.x < G) && (threadIdx.y < G))
 {
  stemp1[threadIdx.x][threadIdx.y] = pdMatrix[index];
 }
 __syncthreads();

 // In-place tree reduction of pdMatrix (global memory).  The barrier is
 // outside the divergent `index < i` branch; see the WARNING above about
 // cross-block ordering.
 for (int i = height * width / 2; i > 0; i /= 2)
 {
  if (index < (unsigned int)i)
  {
   pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
  }
  __syncthreads();
 }
 *sum1 = pdMatrix[0];

 // Normalize by the reduced total and square (guarded against a zero sum).
 if (*sum1 != 0)
 {
  stemp1[threadIdx.x][threadIdx.y] = stemp1[threadIdx.x][threadIdx.y] / (float)(*sum1);
  stemp1[threadIdx.x][threadIdx.y] = stemp1[threadIdx.x][threadIdx.y] * stemp1[threadIdx.x][threadIdx.y];
 }
}
// Driver for the grid-reduction variant: uploads a 4x4 matrix, launches a
// 2x2 grid of 2x2 blocks, and prints the reduced sum plus the per-block
// partials in g_f2.
// Fixes vs. original: host buffer malloc'ed with sizeof(int) to match its
// element type, device synchronized before results are read, and all
// host/device allocations are released on every exit path.
int main()
{
 dim3 grid(2,2,1);
 dim3 block(2,2,1);
 int width = 4, height = 4;
 int t = height * width;     // total element count
 int tt = t / 4;             // one partial-result slot per block
 int *p = (int *)malloc(t * sizeof(int));          // was sizeof(float); elements are int
 float *g_f2 = (float *)malloc(tt * sizeof(float));
 int *pdMatrix = NULL;  // device matrix
 int *sum1 = NULL;      // device scalar: reduced sum of the matrix
 float *g_f1 = NULL;    // device per-block partials (only written by the
                        // kernel's removed commented-out path, so stays 0)
 int sum2 = 0;
 cudaError_t cudaStatus = cudaSuccess;

 for (int i = 0; i < t; i++)
 {
  p[i] = i;
 }
 memset(g_f2, 0, tt * sizeof(float));

 cudaMalloc((void**)&pdMatrix, t * sizeof(int));
 cudaMalloc((void**)&g_f1, tt * sizeof(float));
 cudaMemset(g_f1, 0, tt * sizeof(float));
 cudaMalloc((void**)&sum1, sizeof(int));
 cudaMemset(sum1, 0, sizeof(int));
 cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

 ComputeFeature1<<<grid,block>>>(sum1, pdMatrix, width, height, g_f1);
 cudaStatus = cudaGetLastError();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto Error;
 }
 // Wait for the kernel to finish BEFORE reading its results.
 cudaStatus = cudaDeviceSynchronize();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  goto Error;
 }

 cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
 cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

 printf("%d\n", sum2);
 sum2 = 0;
 for (int i = 0; i < tt; i++)
 {
  printf("%f ", g_f2[i]);
  // NOTE(review): int += float truncates per addition; g_f2 is all zeros
  // here (see g_f1 comment above), so the printed total is 0 either way.
  sum2 += g_f2[i];
 }
 printf("\n%d\n", sum2);

 Error:
 // Release everything on both success and failure paths.
 cudaFree(pdMatrix);
 cudaFree(g_f1);
 cudaFree(sum1);
 free(p);
 free(g_f2);
 return cudaStatus;
}

 

// (Non-code residue removed: CSDN blog footer — like/favorite counters and
// "red envelope" payment boilerplate — was pasted here during extraction
// and is not part of the source.)