CUDA Fortran不必要数据拷贝

首先请看下面的代码:

!> Demonstration module: a kernel whose dummy array is assumed-shape, a(:).
!! An assumed-shape dummy needs an array descriptor, which (per the PGI
!! explanation quoted in this article) is allocated, initialized and passed
!! on every launch.
module add
    use cudafor
    implicit none
contains
    !> Store each thread's global index into the matching element of a.
    !! a: device array; element k receives the value k.
    attributes(global) subroutine call_global(a)
        integer, intent(inout) :: a(:)
        integer :: i

        ! (blockIdx%x - 1) makes the first block start at index 1.
        i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
        ! Guard: a launch may supply more threads than a has elements.
        if (i <= size(a)) a(i) = i
    end subroutine call_global
end module add

! Host driver: launches call_global ten times, then copies the result back
! to the host and prints its first element.
program test
    use add
    implicit none
    integer, parameter :: n_elem = 256      ! array length
    integer, parameter :: n_blocks = 2      ! blocks per grid
    integer, parameter :: n_threads = 128   ! threads per block (n_blocks*n_threads = n_elem)
    integer, device :: d_a(n_elem)          ! array with the device attribute
    integer :: a(n_elem)                    ! host array receiving the result
    integer :: iter

    ! Repeated launches so that any per-launch overhead shows up in the profile.
    do iter = 1, 10
        call call_global<<<n_blocks, n_threads>>>(d_a)
    end do

    a = d_a     ! assignment from the device array to the host array
    print *, "a(1)=", a(1)
end program test
把环境变量COMPUTE_PROFILE设置为1后运行程序,得到的profile分析结果如下:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859dd8cdd69c55
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.960 ] cputime=[ 6.234 ] 
method=[ add_call_global_ ] gputime=[ 3.360 ] cputime=[ 18.620 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 4.235 ] 
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.075 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.925 ] 
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 4.782 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.893 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.502 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.729 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.471 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.835 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.420 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.800 ] cputime=[ 3.744 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.660 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.731 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.437 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.689 ] 
method=[ add_call_global_ ] gputime=[ 1.600 ] cputime=[ 4.476 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.696 ] 
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.375 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 14.631 ] 

什么情况?循环中每次调用kernel之前都多了一次主机到设备(memcpyHtoD)的数据拷贝!下面是PGI给出的解释:

The array is declared assumed-shape in the global subroutine, so we have to allocate, initialize, and pass the descriptor every time. 

PGI给出两个解决办法:

1、参数以reference形式传递(把形参声明为假定大小数组a(*),这样就不需要传递数组描述符)

  修改代码如下:

module add
    use cudafor
contains
    attributes(global) subroutine call_global(a)
        integer :: a(*)  ! assumed-size dummy: unlike a(:), it carries no shape information
        integer :: gid   ! global index of this thread

        ! Each thread writes its own global index into the matching element;
        ! (blockIdx%x - 1) makes the first block start at index 1.
        gid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
        a(gid) = gid
    end subroutine call_global
end module add

! Host driver for the assumed-size variant of the kernel: ten launches
! followed by a copy of the result back to the host.
program test
    use add
    implicit none
    integer, parameter :: n_elem = 256      ! array length
    integer, parameter :: n_blocks = 2      ! blocks per grid
    integer, parameter :: n_threads = 128   ! threads per block (n_blocks*n_threads = n_elem)
    integer, device :: d_a(n_elem)          ! array with the device attribute
    integer :: a(n_elem)                    ! host array receiving the result
    integer :: iter

    do iter = 1, 10
        call call_global<<<n_blocks, n_threads>>>(d_a)
    end do

    a = d_a     ! assignment from the device array to the host array
    print *, "a(1)=", a(1)
end program test
    如果不注意看的话,估计看不出有什么区别。请注意代码第5行:形参a的声明由a(:)改成了a(*)。看懂了吧?

   profile的分析结果如下:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859dd8cdbb1678
method,gputime,cputime,occupancy
method=[ add_call_global_ ] gputime=[ 3.168 ] cputime=[ 16.332 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.883 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.376 ] cputime=[ 3.722 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.344 ] cputime=[ 3.291 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.566 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.438 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.333 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.376 ] cputime=[ 3.304 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.311 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.226 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 12.897 ] 
   每次调用kernel之前的数据拷贝(memcpyHtoD)没有了。
2、把变量放置到module中(测试有问题)

我们继续修改代码如下:

!> Demonstration module: the device array d_a is declared in the module, but
!! the kernel still receives its data through an assumed-shape dummy a(:).
!! The profile shown in this article still has a memcpyHtoD before every
!! kernel for this variant.
module add
    use cudafor
    implicit none
    integer, device :: d_a(256)
contains
    !> Store each thread's global index into the matching element of a.
    !! a: device array; element k receives the value k.
    attributes(global) subroutine call_global(a)
        integer, intent(inout) :: a(:)
        integer :: i

        ! (blockIdx%x - 1) makes the first block start at index 1.
        i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
        ! Guard: a launch may supply more threads than a has elements.
        if (i <= size(a)) a(i) = i
    end subroutine call_global
end module add

! Host driver for the variant where d_a comes from module add: ten launches
! that pass d_a as an argument, then a copy of the result back to the host.
program test
    use add
    implicit none
    integer, parameter :: n_elem = 256      ! array length (matches d_a in module add)
    integer, parameter :: n_blocks = 2      ! blocks per grid
    integer, parameter :: n_threads = 128   ! threads per block
    integer :: a(n_elem)                    ! host array receiving the result
    integer :: iter

    do iter = 1, 10
        call call_global<<<n_blocks, n_threads>>>(d_a)
    end do

    a = d_a     ! assignment from the module's device array to the host array
    print *, "a(1)=", a(1)
end program test
profile的分析结果如下:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859fff450cdbde
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 21.286 ] 
method=[ add_call_global_ ] gputime=[ 2.112 ] cputime=[ 7.290 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 69.106 ] 
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.707 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 54.235 ] 
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.123 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 75.521 ] 
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.707 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 78.729 ] 
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.123 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 67.357 ] 
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 5.832 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 53.944 ] 
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.832 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 53.652 ] 
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.540 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 76.105 ] 
method=[ add_call_global_ ] gputime=[ 1.472 ] cputime=[ 6.123 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 69.690 ] 
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.540 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 2.688 ] cputime=[ 87.477 ] 

问题依旧:每次调用kernel之前仍然有一次memcpyHtoD。等待进一步的反馈结果。

7-31号补充:

之前把数据放置到module中的用法不对:数组虽然放进了module,却仍然作为参数传给kernel。正确的做法是kernel不带参数,直接引用module中的设备数组:

!> Demonstration module: the kernel takes no arguments and references the
!! module-level device array d_a directly.
module add
    use cudafor
    implicit none
    integer, device :: d_a(256,16)
contains
    !> Set d_a(i,j) = i + j, where (i,j) is the thread's global 2-D index.
    attributes(global) subroutine call_global()
        integer :: i, j

        ! (blockIdx - 1) makes the first block start at index 1 in each dimension.
        i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
        j = threadIdx%y + (blockIdx%y - 1) * blockDim%y

        ! Guard: a launch may supply more threads than d_a has elements.
        if (i <= size(d_a, 1) .and. j <= size(d_a, 2)) d_a(i, j) = i + j
    end subroutine call_global
end module add

!> Host driver: launches the argument-free kernel ten times, then copies the
!! module-level device array d_a back to the host and prints element (1,1).
program test
    use add
    implicit none

    integer :: a(256,16)            ! host array receiving the contents of d_a
    integer :: iter
    type(dim3) :: tblock, tgrid

    tblock = dim3(128, 2, 1)
    ! Derive the grid from the array shape (rounding up) rather than
    ! hard-coding it; for the 256x16 array and 128x2 block this gives the
    ! same 2x8 grid as before. If the shape is ever changed to a non-multiple
    ! of the block size, the kernel must bounds-check its indices.
    tgrid = dim3((size(d_a, 1) + tblock%x - 1) / tblock%x, &
                 (size(d_a, 2) + tblock%y - 1) / tblock%y, 1)

    do iter = 1, 10
        call call_global<<<tgrid, tblock>>>()
    end do

    a = d_a     ! assignment from the module's device array to the host array
    ! The value printed is element (1,1) of a 2-D array, so label it as such.
    print *, "a(1,1)=", a(1,1)
end program test

profile分析结果:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 1385ef0816adc600
method,gputime,cputime,occupancy
method=[ add_call_global_ ] gputime=[ 1.888 ] cputime=[ 8.164 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 5.249 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.957 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.472 ] cputime=[ 4.957 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.957 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.665 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.374 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.957 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.665 ] occupancy=[ 1.000 ] 
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.374 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 4.768 ] cputime=[ 167.663 ] 
解决了不必要的H2D的数据拷贝。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值