在学习CUDA的过程中,免不了要实现矩阵乘,从而对GPU的运作机制以及如何使用share memory有更深的理解。下面是用FORTRAN对简单矩阵乘和利用分块并将数据放入share memory实现矩阵乘的实现。
! this program demonstates various memory optimzation techniques
! applied to a matrix multiplication.
module dimensions_m
implicit none
integer, parameter :: TILE_DIM = 16
integer, parameter :: NUM_REPS = 100
integer, parameter :: nx = 256, ny = 256
end module dimensions_m
module kernels_matrix
use dimensions_m
implicit none
contains
!实际没有使用,但通过调用它的输出可以看出数据是如何移动的,
!对于share memory时,每个线程需要搬用全局的哪些数据有帮助
attributes(global) subroutine matrixMulTestKernel(dA,dB,dC)
real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
real,shared :: tmA(TILE_DIM,TILE_DIM),tmB(TILE_DIM,TILE_DIM)
integer :: x, y, i,j,tx,ty,ti
real :: tmc
tx = threadIdx%x
ty = threadIdx%y
x = (blockIdx%x-1) * blockDim%x + threadIdx%x
y = (blockIdx%y-1) * blockDim%y + threadIdx%y
tmc = 0.0
i = TILE_DIM
if(tx <= nx .and. ty <= ny) then
tmA(tx,ty) = dA(x,ty+i)
tmB(tx,ty) = dB(tx+i,ty)
call syncthreads()
do j = 1,TILE_DIM
tmc = tmA(tx,ty) !tmA(tx,j) *tmB(j,ty) + tmc
end do
call syncthreads()
end if
dC(x,y) = tmc
end subroutine matrixMulTestKernel
! 简单的矩阵乘实现,基本思路是每个线程获得它在整个线程中的位置,
! 然后计算该点对应的结果即可
attributes(global) subroutine matrixMulKernel(dA,dB,dC)
real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
integer :: x, y, j
x = (blockIdx%x-1) * blockDim%x + threadIdx%x
y = (blockIdx%y-1) * blockDim%y + threadIdx%y
dC(x,y) = 0.0;
do j = 1,nx
dC(x,y) = dA(x,j)*dB(j,y) + dC(x,y)
end do
end subroutine matrixMulKernel
!共享内存实现版本,首先将矩阵中的一块放入共享内存,并计算出中间结果,
!接着使用循环的方式,计算出来这块的最终结果。将数据放入共享内存,可以减少
!线程从全局内存中取数据的次数,从而提高计算速度
attributes(global) subroutine matrixMulSharedKernel(dA,dB,dC)
real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
real,shared :: tmA(TILE_DIM,TILE_DIM),tmB(TILE_DIM,TILE_DIM)
integer :: x, y, i,j,tx,ty,ti
real :: tmc
tx = threadIdx%x
ty = threadIdx%y
x = (blockIdx%x-1) * blockDim%x + threadIdx%x
y = (blockIdx%y-1) * blockDim%y + threadIdx%y
tmc = 0.0
! 通过循环的方式,计算出最终结果
do i = 0,nx-1,TILE_DIM
if(tx+i <= nx .and. ty+i <= ny) then
tmA(tx,ty) = dA(x,ty+i)
tmB(tx,ty) = dB(tx+i,y)
call syncthreads()
do j = 1,TILE_DIM
tmc = tmA(tx,j)*tmB(j,ty) + tmc
end do
call syncthreads()
end if
end do
dC(x,y) = tmc
end subroutine matrixMulSharedKernel
end module kernels_matrix
program matrixMulMatrix
use cudafor
use dimensions_m
use kernels_matrix
implicit none
type (dim3) :: grid, tBlock
type (cudaEvent) :: startEvent, stopEvent
type (cudaDeviceProp) :: prop
real :: time
real :: hA(nx,ny), hB(nx,ny), hC(ny,nx),gold(ny,nx)
real, device :: dA(nx,ny), dB(nx,ny), dC(ny,nx)
integer :: i,ii, j,k, istat,errflag
integer :: cputime(0:1),timediff,clock_rate,clock_max
! check parameters and calculate execution configuration
! if (mod(nx, TILE_DIM) /= 0 &
! .or. mod(ny, TILE_DIM) /= 0) then
! write(*,*) 'nx and ny must be a multiple of TILE_DIM'
! stop
! end if
!
grid = dim3(nx/TILE_DIM, ny/TILE_DIM, 1)
tBlock = dim3(TILE_DIM, TILE_DIM, 1)
! write parameters
i = cudaGetDeviceProperties(prop, 0)
write(*,"(/,'Device Name: ',a)") trim(prop%name)
write(*,"('Compute Capability: ',i0,'.',i0)") &
prop%major, prop%minor
write(*,*)
write(*,"('Matrix size:', i5, i5, ', &
Tile size:', i3, i3)") &
nx, ny, TILE_DIM, TILE_DIM
write(*,"('grid:', i4,i4,i4, ', tBlock:', i4,i4,i4)") &
grid%x, grid%y, grid%z, tBlock%x, tBlock%y, tBlock%z
! initialize data
! host
do j = 1, ny
do i = 1, nx
hA(i,j) = i-1
hB(i,j) = j-1
enddo
enddo
! 获得cpu计算的结果
call system_clock(cputime(0),clock_rate,clock_max)
do j = 1, ny
do i = 1, nx
gold(i,j) = 0
do k = 1, nx
gold(i,j) = hA(i,k) * hB(k,j) + gold(i,j)
end do
enddo
enddo
call system_clock(cputime(1),clock_rate,clock_max)
timediff = cputime(1) - cputime(0)
write(*,'(a20,$)') '*** CpuTime(ms):'
write(*,'(F20.2)') real(timediff)*1000/real(clock_rate)
! do i = 1, nx
! do j = 1, ny
! write(*,"(F10.3,$)") hA(i,j)
! end do
! write(*,*)
! end do
! write(*,*)
! do i = 1, nx
! do j = 1, ny
! write(*,"(F10.3,$)") hB(i,j)
! end do
! write(*,*)
! end do
! device
dB = hB
dA = hA
! events for timing
!golbal memory 全局内存实现方式
write(*,'(a30)') '*** global memory results ***'
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
istat = cudaEventRecord(startEvent, 0)
do i=1, NUM_REPS
call matrixMulKernel<<<grid, tBlock>>>(dA,dB,dC)
end do
istat = cudaEventRecord(stopEvent, 0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
hC = dC
call postprocess(gold,hC,time)
! share memory 共享内存实现方式
write(*,'(a30)') '*** share memory results ***'
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
istat = cudaEventRecord(startEvent, 0)
do i=1, NUM_REPS
call matrixMulSharedKernel<<<grid, tBlock>>>(dA,dB,dC)
end do
istat = cudaEventRecord(stopEvent, 0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
hC = dC
call postprocess(gold,hC,time)
istat = cudaEventDestroy(startEvent)
istat = cudaEventDestroy(stopEvent)
! 结果验证函数
contains
subroutine postprocess(ref, res, t)
real, intent(in) :: ref(:,:), res(:,:), t
if (all(res == ref)) then
write(*,'(a20)') '*** Test Passed ***'
write(*,'(a20,$)') '*** ElapsedTime(ms):'
write(*,'(f20.2)') (t/NUM_REPS)
else
write(*,'(a20)') '*** Failed ***'
end if
end subroutine postprocess
end program matrixMulMatrix