利用cuda实现1D stencil,关键要注意共享内存的使用形式,动态共享内存需要在<<<>>>中的第三个参数指定大小,静态的需要使用常量在kernel函数中直接指定大小,不然会出现内存错误:error: copyout Memcpy FAILED: 700(an illegal memory access was encountered) 详见:https://forums.developer.nvidia.com/t/cuda-fortran-error-copyout-memcpy-failed-700-an-illegal-memory-access-was-encountered/179186
以下为代码:
module reverse_m
implicit none
integer, device :: n_d
contains
attributes(global) subroutine staticReverse(a, d)
real :: d(:), a(:), result
integer :: t, tr, t1, j, sa
integer,parameter :: ra = 3
real, shared :: s(-ra+1:64 + ra)
!real, shared :: s(64 + 2*ra)
t = (blockIdx%x-1)*blockDim%x + threadIdx%x
sa = size(a)
!write(*,*) 't:', t, ra
t1 = threadIdx%x
s(t1) = a(t)
if (t1 <= ra) then
s(t1 - ra) = 0
if (t - ra > 0) then
s(t1-ra) = a(t - ra)
endif
s(t1 +64 - ra) = 0
if ( t + 64 <= sa) then
s(t1 + 64 ) = a(t + 64 )
endif
endif
call syncthreads()
if (t > ra .and. t <= size(a) - ra) then
result = 0
do j = -ra, ra
result = result + s(t1 + j)
enddo
d(t) = result
endif
end subroutine staticReverse
end module reverse_m
program sharedExample
use cudafor
use reverse_m
implicit none
integer, parameter :: n = 1280, ra = 3
real :: a(n), r(n), d(n)
real, device :: d_a(n), d_d(n)
type(dim3) :: grid, tBlock
integer :: i, sizeInBytes, j
tBlock = dim3(64,1,1)
grid = dim3((n-1)/64 + 1,1,1)
do i = 1, n
a(i) = i
d(i) = 0
enddo
sizeInBytes = sizeof(a(1))*tBlock%x
! run version with static shared memory
d_a = a
d_d = d
write(*,*) 'Size(a):', size(a)
call staticReverse<<<grid,tBlock>>>(d_a, d_d)
r = d_d
do i = 1 + ra, n - ra
do j = -ra, ra
d(i) = d(i) + a(i + j)
enddo
enddo
write(*,*) r
write(*,*) d
write(*,*) 'Static case max error:', maxval(abs(r-d))
if (maxval(abs(r-d)) .gt. 1.0e-7) then
write(*,*) "Test Failed"
else
write(*,*) "Test Passed"
endif
end program sharedExample