// Test summation (work in progress)

// Version 1 (verified correct) — begin

#include "h.h"
#define  G 4 // number of gray levels (G x G co-occurrence matrix)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Test summation
// Feature 1: energy (angular second moment) of the normalized matrix.
// Each entry of the G x G matrix is divided by `total`, and the squares
// of the normalized values are accumulated.
float ComputeFeature1( int* pdMatrix,int total)
{
 float energy = 0.0f;
 for (int row = 0; row < G; row++)
 {
  for (int col = 0; col < G; col++)
  {
   float norm = (float)pdMatrix[row * G + col] / (float)total;
   energy += norm * norm;
  }
 }
 return energy;
}
// Device version of feature 1: *sum1 = sum over the G x G matrix of
// (pdMatrix[k] / *total)^2.
// Launch with ONE block of exactly G x G (4x4) threads; width/height are
// kept for interface compatibility but unused.  (p.186)
// Fixes vs. original: removed a stray `DUI` token that broke compilation,
// removed the redundant zero-initialization of shared memory, used one
// consistent row-major index everywhere (the original wrote the transposed
// shared-memory slot, which only worked because the final sum is
// order-independent), and restricted the final write to thread 0.
__global__  static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total)
{
 __shared__ float stemp1[G*G];
 // Flat thread id within the 4x4 block (row-major, matches pdMatrix layout).
 int tid = threadIdx.y * G + threadIdx.x;

 if (threadIdx.x < G && threadIdx.y < G)
 {
  // Normalize the element by the matrix total, then square it in place.
  float t = (float)pdMatrix[tid] / ((float)(*total));
  stemp1[tid] = t * t;
 }
 __syncthreads();

 // Tree reduction of the G*G squared values in shared memory.
 // The barrier stays OUTSIDE the divergent `tid < i` branch so every
 // thread of the block reaches it the same number of times.
 for (int i = G*G/2; i > 0; i /= 2)
 {
  if (tid < i)
  {
   stemp1[tid] += stemp1[tid + i];
  }
  __syncthreads();
 }

 // Only one thread publishes the result (the original had every thread
 // store the same value — benign but wasteful).
 if (tid == 0)
 {
  *sum1 = stemp1[0];
 }
}
// Driver: computes feature 1 on the CPU as a reference, then on the GPU
// with a single 4x4 block, and prints both (they should match closely).
// Fixes vs. original: `total` is allocated once (it was cudaMalloc'ed
// twice, leaking the first allocation), the host buffer is malloc'ed with
// sizeof(int) to match its element type, the device is synchronized
// before the result is read, and all allocations are released.
int main()
{
 dim3 block(4,4);
 int width = 4, height = 4;
 int t = height * width;
 int *p = (int *)malloc(t * sizeof(int));   // was sizeof(float); elements are int
 int *pdMatrix = NULL;   // device copy of the matrix
 int *total = NULL;      // device copy of the element sum
 float *sum1 = NULL;     // device scalar result
 float sum2 = 0.0f;      // host copy of the kernel result
 int tot = 0;
 float f1 = 0.0f;
 cudaError_t cudaStatus = cudaSuccess;

 for (int i = 0; i < t; i++)
 {
  p[i] = i;
  tot += p[i];
 }

 // CPU reference result.
 f1 = ComputeFeature1(p, tot);
 printf("%f\n", f1);

 // Device setup. `total` is allocated exactly once.
 cudaMalloc((void**)&total, sizeof(int));
 cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
 cudaMalloc((void**)&pdMatrix, t * sizeof(int));
 cudaMalloc((void**)&sum1, sizeof(float));
 cudaMemset(sum1, 0, sizeof(float));
 cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

 ComputeFeature1<<<1,block>>>(sum1, pdMatrix, width, height, total);
 cudaStatus = cudaGetLastError();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto Error;
 }
 // Wait for the kernel to finish BEFORE reading its result.
 cudaStatus = cudaDeviceSynchronize();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  goto Error;
 }
 cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
 printf("%f\n", sum2);

 Error:
 // Release everything on both success and failure paths.
 cudaFree(total);
 cudaFree(pdMatrix);
 cudaFree(sum1);
 free(p);
 return cudaStatus;
}

 

 

// Version 1 (verified correct) — end.
// NOTE: the program below redefines ComputeFeature1 and main; the two
// halves of this file must be compiled as separate translation units.

 

#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Test summation

// Experimental variant launched over a 2x2 grid of 2x2 blocks: reduces the
// whole matrix in global memory to *sum1, then normalizes and squares this
// block's staged elements in shared memory.  g_f1 is only used by the
// commented-out per-block output path that was removed; the parameter is
// kept for interface compatibility.  (p.186)
// Fixes vs. original: `/if(sum1!=0)` (invalid token, compile error) restored
// as a real guard, and `stemp1 / sum1` (float divided by a POINTER, compile
// error) corrected to divide by the dereferenced sum *sum1.
// WARNING(review): __syncthreads() only synchronizes within one block, but
// the reduction below spans the whole grid — blocks race on pdMatrix, so
// *sum1 and the normalized values are not reliable across blocks.  A
// grid-wide reduction needs a second kernel launch or cooperative groups.
__global__  static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1)
{
 __shared__ float stemp1[G][G];
 unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
 unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
 unsigned int index = yIndex * width + xIndex;  // flat global element index

 if (threadIdx.x < G && threadIdx.y < G)
 {
  stemp1[threadIdx.x][threadIdx.y] = 0.0f;
 }
 __syncthreads();

 // Stage this block's elements into shared memory.
 if ((index < width * height) && (threadIdx.x < G) && (threadIdx.y < G))
 {
  stemp1[threadIdx.x][threadIdx.y] = pdMatrix[index];
 }
 __syncthreads();

 // In-place tree reduction of pdMatrix (global memory).  The barrier is
 // outside the divergent `index < i` branch; see the WARNING above about
 // cross-block ordering.
 for (int i = height * width / 2; i > 0; i /= 2)
 {
  if (index < (unsigned int)i)
  {
   pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
  }
  __syncthreads();
 }
 *sum1 = pdMatrix[0];

 // Normalize by the reduced total and square (guarded against a zero sum).
 if (*sum1 != 0)
 {
  stemp1[threadIdx.x][threadIdx.y] = stemp1[threadIdx.x][threadIdx.y] / (float)(*sum1);
  stemp1[threadIdx.x][threadIdx.y] = stemp1[threadIdx.x][threadIdx.y] * stemp1[threadIdx.x][threadIdx.y];
 }
}
// Driver for the grid-reduction variant: uploads a 4x4 matrix, launches a
// 2x2 grid of 2x2 blocks, and prints the reduced sum plus the per-block
// partials in g_f2.
// Fixes vs. original: host buffer malloc'ed with sizeof(int) to match its
// element type, device synchronized before results are read, and all
// host/device allocations are released on every exit path.
int main()
{
 dim3 grid(2,2,1);
 dim3 block(2,2,1);
 int width = 4, height = 4;
 int t = height * width;     // total element count
 int tt = t / 4;             // one partial-result slot per block
 int *p = (int *)malloc(t * sizeof(int));          // was sizeof(float); elements are int
 float *g_f2 = (float *)malloc(tt * sizeof(float));
 int *pdMatrix = NULL;  // device matrix
 int *sum1 = NULL;      // device scalar: reduced sum of the matrix
 float *g_f1 = NULL;    // device per-block partials (only written by the
                        // kernel's removed commented-out path, so stays 0)
 int sum2 = 0;
 cudaError_t cudaStatus = cudaSuccess;

 for (int i = 0; i < t; i++)
 {
  p[i] = i;
 }
 memset(g_f2, 0, tt * sizeof(float));

 cudaMalloc((void**)&pdMatrix, t * sizeof(int));
 cudaMalloc((void**)&g_f1, tt * sizeof(float));
 cudaMemset(g_f1, 0, tt * sizeof(float));
 cudaMalloc((void**)&sum1, sizeof(int));
 cudaMemset(sum1, 0, sizeof(int));
 cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

 ComputeFeature1<<<grid,block>>>(sum1, pdMatrix, width, height, g_f1);
 cudaStatus = cudaGetLastError();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto Error;
 }
 // Wait for the kernel to finish BEFORE reading its results.
 cudaStatus = cudaDeviceSynchronize();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  goto Error;
 }

 cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
 cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

 printf("%d\n", sum2);
 sum2 = 0;
 for (int i = 0; i < tt; i++)
 {
  printf("%f ", g_f2[i]);
  // NOTE(review): int += float truncates per addition; g_f2 is all zeros
  // here (see g_f1 comment above), so the printed total is 0 either way.
  sum2 += g_f2[i];
 }
 printf("\n%d\n", sum2);

 Error:
 // Release everything on both success and failure paths.
 cudaFree(pdMatrix);
 cudaFree(g_f1);
 cudaFree(sum1);
 free(p);
 free(g_f2);
 return cudaStatus;
}

 

// (Non-code residue removed: CSDN blog footer — like/favorite counters and
// "red envelope" payment boilerplate — was pasted here during extraction
// and is not part of the source.)