本章节翻译by chenchensmail@163.com 原文:Fortran Example (intel.com)
我们将使用一个 Fortran 示例来说明如何适当地选择内存分配,以避免冗余的数据传输并提高性能。
这个 Fortran 示例使用了 OpenMP 部署,并且是根据 NWChem (一个高性能的计算化学应用程序)进行改编的。
在这个示例中,有一个 4 层的循环嵌套。最内层的 k 循环调用了 omp_fbody
例程,该例程将两个对 sgemm
的调用部署到设备上, 然后将一个 reduction 计算( reduction 变量 emp4i
和 emp5i
)部署到设备上。
在循环嵌套中,有一些数组在主机上进行了更新(即,数组 Tkj
、 Kkj
、 Jia
、 Kia
、 Tia
、 Xia
和 t1v1
)。因此,我们需要使设备上这些数组的值与主机上的值保持一致。
版本 1
在程序的第一个(初级)版本中,我们使用 allocate
指令在共享内存中分配所有 11 个数组。
!$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_shared_mem_alloc) allocate( eorb(1:nbf) ) ...
共享分配可以由主机和设备访问,并根据需要自动在主机和设备之间迁移。因此,可以在主机和设备上使用指向共享分配的相同指针。
版本 1 如下所示。
1 include "mkl_omp_offload.f90" 2 3 subroutine omp_fbody(f1n,f2n,eorb, & 4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 5 Jia, Kia, Tia, Xia, Tkj, Kkj, & 6 t1v1,t1v2) 7 8 use omp_lib 9 use onemkl_blas_omp_offload_lp64 10 use iso_fortran_env 11 implicit none 12 13 real, intent(inout) :: emp4,emp5 14 integer, intent(in) :: ncor,nocc,nvir 15 integer, intent(in) :: a,i,j,k, klo 16 real, intent(inout) :: f1n(nvir,nvir) 17 real, intent(inout) :: f2n(nvir,nvir) 18 real, intent(in) :: eorb(*) 19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) 20 real, intent(in) :: Tkj(*), Kkj(*) 21 real, intent(in) :: t1v1(nvir),t1v2(nvir) 22 real :: emp4i,emp5i 23 real :: eaijk,denom 24 integer :: lnov,lnvv 25 integer :: b,c 26 real :: f1nbc,f1ncb,f2nbc,f2ncb 27 real :: t1v1b,t1v2b 28 29 lnov=nocc*nvir 30 lnvv=nvir*nvir 31 emp4i = 0.0 32 emp5i = 0.0 33 34 !$omp dispatch 35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & 36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) 37 38 !$omp dispatch 39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & 40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) 41 42 !$omp dispatch 43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & 44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) 45 46 !$omp dispatch 47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & 48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) 49 50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) 51 52 !$omp target teams distribute parallel do collapse(2) & 53 !$omp reduction(+:emp4i,emp5i) & 54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & 55 !$omp private(t1v1b,t1v2b) & 56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) 57 do b=1,nvir 58 do c=1,nvir 59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) 60 61 f1nbc = f1n(b,c); 62 f1ncb = f1n(c,b); 63 f2nbc = f2n(b,c); 64 f2ncb = f2n(c,b); 65 t1v1b = t1v1(b); 66 t1v2b = t1v2(b); 67 68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) 69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) 70 enddo 71 enddo 72 !$omp end target teams distribute parallel do 73 74 emp4 = emp4 + emp4i 75 emp5 = emp5 + emp5i 76 77 end 78 79 80 subroutine init_array_1(arr, m) 81 implicit none 82 real, intent(inout) :: arr(m) 83 integer m, i 84 85 do i = 1, m 86 arr(i) = 1.0/(100.0 + i-1) 87 end do 88 end subroutine init_array_1 89 90 91 subroutine init_array_2(arr, m, n) 92 implicit none 93 real, intent(inout) :: arr(m, n) 94 integer m, n, i, j 95 96 do i = 1, m 97 do j = 1, n 98 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) 99 end do 100 end do 101 end subroutine init_array_2 102 103 104 program main 105 106 use omp_lib 107 use iso_fortran_env 108 implicit none 109 110 interface 111 subroutine omp_fbody(f1n,f2n,eorb, & 112 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 113 Jia, Kia, Tia, Xia, Tkj, Kkj, & 114 t1v1,t1v2) 115 real, intent(inout) :: emp4,emp5 116 integer, intent(in) :: ncor,nocc,nvir 117 integer, intent(in) :: a,i,j,k, klo 118 real, intent(inout) :: f1n(nvir,nvir) 119 real, intent(inout) :: f2n(nvir,nvir) 120 real, intent(in) :: eorb(*) 121 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) 122 real, intent(in) :: t1v1(nvir),t1v2(nvir) 123 end subroutine omp_fbody 124 end interface 125 126 integer :: ncor, nocc, nvir, maxiter, nkpass 127 integer :: nbf, lnvv, lnov, kchunk 128 real, allocatable :: eorb(:) 129 real, allocatable :: f1n(:,:) 130 real, allocatable :: f2n(:,:) 131 132 real, allocatable :: Jia(:) 133 real, allocatable :: Kia(:) 134 real, allocatable :: Tia(:) 135 real, allocatable :: Xia(:) 136 real, allocatable :: Tkj(:) 137 real, allocatable :: Kkj(:) 138 139 real, allocatable :: t1v1(:),t1v2(:) 140 real emp4, emp5 141 integer :: a, b, c, i, j, k 142 integer :: klo, khi, iter 143 double precision, allocatable :: timers(:) 144 double precision :: t0, t1, tsum, tmax, tmin, tavg 145 146! Run parameters 147 nocc = 256 148 nvir = 2048 149 maxiter = 50 150 nkpass = 1 151 ncor = 0 152 153 print *, "Run parameters:" 154 print *, "nocc =", nocc 155 print *, "nvir =", nvir 156 print *, "maxiter =", maxiter 157 print *, "nkpass =", nkpass 158 print *, "ncor =", ncor 159 print *, " " 160 161! Allocate and initialize arrays. 162 163 nbf = ncor + nocc + nvir 164 lnvv = nvir * nvir 165 lnov = nocc * nvir 166 kchunk = (nocc - 1)/nkpass + 1 167 168 !$omp allocate allocator(omp_target_shared_mem_alloc) 169 allocate( f1n(1:nvir,1:nvir) ) 170 171 !$omp allocate allocator(omp_target_shared_mem_alloc) 172 allocate( f2n(1:nvir,1:nvir) ) 173 174 !$omp allocate allocator(omp_target_shared_mem_alloc) 175 allocate( eorb(1:nbf) ) 176 177 !$omp allocate allocator(omp_target_shared_mem_alloc) 178 allocate( Jia(1:lnvv) ) 179 180 !$omp allocate allocator(omp_target_shared_mem_alloc) 181 allocate( Kia(1:lnvv) ) 182 183 !$omp allocate allocator(omp_target_shared_mem_alloc) 184 allocate( Tia(1:lnov*nocc) ) 185 186 !$omp allocate allocator(omp_target_shared_mem_alloc) 187 allocate( Xia(1:lnov*nocc)) 188 189 !$omp allocate allocator(omp_target_shared_mem_alloc) 190 allocate( Tkj(1:kchunk*lnvv) ) 191 192 !$omp allocate allocator(omp_target_shared_mem_alloc) 193 allocate( Kkj(1:kchunk*lnvv) ) 194 195 !$omp allocate allocator(omp_target_shared_mem_alloc) 196 allocate( t1v1(1:lnvv) ) 197 198 !$omp allocate allocator(omp_target_shared_mem_alloc) 199 allocate( t1v2(1:lnvv) ) 200! 201 call init_array_1(eorb, nbf) 202 call init_array_1(Jia, lnvv) 203 call init_array_1(Kia, lnvv) 204 call init_array_1(Tia, lnov*nocc) 205 call init_array_1(Xia, lnov*nocc) 206 call init_array_1(Tkj, kchunk*lnvv) 207 call init_array_1(Kkj, kchunk*lnov) 208 call init_array_1(t1v1, lnvv) 209 call init_array_1(t1v2, lnvv) 210 211 call init_array_2(f1n, nvir, nvir) 212 call init_array_2(f2n, nvir, nvir) 213 214 print *, "End of initialization" 215 216 allocate (timers(1:maxiter)) 217 218 emp4=0.0 219 emp5=0.0 220 a=1 221 iter=1 222 223 do klo = 1, nocc, kchunk 224 khi = MIN(nocc, klo+kchunk-1) 225 do j = 1, nocc 226 227#if defined(DO_UPDATE_ARRAYS) 228! Update elements of Tkj and KKj. 229 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 230 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 231#endif 232 233 do i = 1, nocc 234 235#if defined(DO_UPDATE_ARRAYS) 236! Update elements of Jia, Kia, Tia, Xia arrays. 237 Jia(lnvv) = Jia(lnvv) + 1.0 238 Kia(lnvv) = Kia(lnvv) + 1.0 239 Tia(lnov) = Tia(lnov) + 1.0 240 Xia(lnov) = Xia(lnov) + 1.0 241#endif 242 243 do k = klo, MIN(khi,i) 244 245#if defined(DO_UPDATE_ARRAYS) 246! Update elements of t1v1 array. 247 t1v1(:) = Tkj(lnvv-nvir+1:lnvv) 248#endif 249 250 t0 = omp_get_wtime() 251 252 call omp_fbody(f1n,f2n,eorb, & 253 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 254 Jia, Kia, Tia, Xia, Tkj, Kkj, & 255 t1v1,t1v2) 256 257 t1 = omp_get_wtime() 258 timers(iter) = (t1-t0) 259 if (iter .eq. maxiter) then 260 print *, "Stopping after ", iter, "iterations" 261 print *, " " 262 goto 150 263 endif 264 265! Prevent NAN for large maxiter... 266 if (emp4 > 1000.0) then 267 emp4 = emp4 - 1000.0 268 endif 269 if (emp4 < -1000.0) then 270 emp4 = emp4 + 1000.0 271 endif 272 if (emp5 > 1000.0) then 273 emp5 = emp5 - 1000.0 274 endif 275 if (emp5 < -1000.0) then 276 emp5 = emp5 + 1000.0 277 endif 278 279 iter = iter + 1 280 281 end do ! k = klo, MIN(khi,i) 282 end do ! do i = 1, nocc 283 end do ! do j = 1, nocc 284 end do ! do klo = 1, nocc, kchunk 285 286 150 CONTINUE 287 288 tsum = 0.0 289 tmax = -1.0e10 290 tmin = 1.0e10 291 do i = 2, iter 292 tsum = tsum + timers(i) 293 tmax = MAX(tmax,timers(i)) 294 tmin = MIN(tmin,timers(i)) 295 end do 296 297 tavg = tsum / (iter - 1) 298 print *, "TOTAL ITER: ", iter 299 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 300 110 format (A, F9.6, A, F9.6, A, F9.6, A) 301 302 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 303 120 format (A, F15.3, A, F15.3) 304 305 print *, "END" 306 307 deallocate (f1n) 308 deallocate (f2n) 309 deallocate (eorb) 310 deallocate (Jia) 311 deallocate (Kia) 312 deallocate (Tia) 313 deallocate (Xia) 314 deallocate (Tkj) 315 deallocate (Kkj) 316 317 deallocate (t1v1) 318 deallocate (t1v2) 319 deallocate (timers) 320 321 end program
虽然这个版本编程简单直接,但并不是最高效的。
版本 2
在程序的第二个版本中,我们使用普通的 allocate
在系统内存中分配所有 11 个数组,并使用 map
子句将数组映射到设备
real, allocatable :: f1n(:,:) real, allocatable :: f2n(:,:) real, allocatable :: eorb(:) ... !$omp target data & map(to: f1n, f2n, eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2)
当在主机上更新映射到设备的数组时,我们使用 OpenMP target update to
指令将数组的新值从主机拷贝到设备。我们尽可能地将指令放在循环嵌套的最高层,以避免不必要地拷贝数组。
Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
完整的版本 2 如下所示。
1 include "mkl_omp_offload.f90" 2 3 subroutine omp_fbody(f1n,f2n,eorb, & 4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 5 Jia, Kia, Tia, Xia, Tkj, Kkj, & 6 t1v1,t1v2) 7 8 use omp_lib 9 use onemkl_blas_omp_offload_lp64 10 use iso_fortran_env 11 implicit none 12 13 real, intent(inout) :: emp4,emp5 14 integer, intent(in) :: ncor,nocc,nvir 15 integer, intent(in) :: a,i,j,k, klo 16 real, intent(inout) :: f1n(nvir,nvir) 17 real, intent(inout) :: f2n(nvir,nvir) 18 real, intent(in) :: eorb(*) 19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) 20 real, intent(in) :: Tkj(*), Kkj(*) 21 real, intent(in) :: t1v1(nvir),t1v2(nvir) 22 real :: emp4i,emp5i 23 real :: eaijk,denom 24 integer :: lnov,lnvv 25 integer :: b,c 26 real :: f1nbc,f1ncb,f2nbc,f2ncb 27 real :: t1v1b,t1v2b 28 29 lnov=nocc*nvir 30 lnvv=nvir*nvir 31 emp4i = 0.0 32 emp5i = 0.0 33 34 !$omp dispatch 35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & 36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) 37 38 !$omp dispatch 39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & 40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) 41 42 !$omp dispatch 43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & 44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) 45 46 !$omp dispatch 47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & 48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) 49 50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) 51 52 !$omp target teams distribute parallel do collapse(2) & 53 !$omp reduction(+:emp4i,emp5i) & 54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & 55 !$omp private(t1v1b,t1v2b) & 56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) 57 do b=1,nvir 58 do c=1,nvir 59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) 60 61 f1nbc = f1n(b,c); 62 f1ncb = f1n(c,b); 63 f2nbc = f2n(b,c); 64 f2ncb = f2n(c,b); 65 t1v1b = t1v1(b); 66 t1v2b = t1v2(b); 67 68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) 69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) 70 enddo 71 enddo 72 !$omp end target teams distribute parallel do 73 74 emp4 = emp4 + emp4i 75 emp5 = emp5 + emp5i 76 77 end 78 79 80 subroutine init_array_1(arr, m) 81 implicit none 82 real, intent(inout) :: arr(m) 83 integer m, i 84 85 do i = 1, m 86 arr(i) = 1.0/(100.0 + i-1) 87 end do 88 end subroutine init_array_1 89 90 91 subroutine init_array_2(arr, m, n) 92 implicit none 93 real, intent(inout) :: arr(m, n) 94 integer m, n, i, j 95 96 do i = 1, m 97 do j = 1, n 98 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) 99 end do 100 end do 101 end subroutine init_array_2 102 103 104 program main 105 106 use omp_lib 107 use iso_fortran_env 108 implicit none 109 110 interface 111 subroutine omp_fbody(f1n,f2n,eorb, & 112 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 113 Jia, Kia, Tia, Xia, Tkj, Kkj, & 114 t1v1,t1v2) 115 real, intent(inout) :: emp4,emp5 116 integer, intent(in) :: ncor,nocc,nvir 117 integer, intent(in) :: a,i,j,k, klo 118 real, intent(inout) :: f1n(nvir,nvir) 119 real, intent(inout) :: f2n(nvir,nvir) 120 real, intent(in) :: eorb(*) 121 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) 122 real, intent(in) :: t1v1(nvir),t1v2(nvir) 123 end subroutine omp_fbody 124 end interface 125 126 integer :: ncor, nocc, nvir, maxiter, nkpass 127 integer :: nbf, lnvv, lnov, kchunk 128 real, allocatable :: eorb(:) 129 real, allocatable :: f1n(:,:) 130 real, allocatable :: f2n(:,:) 131 132 real, allocatable :: Jia(:) 133 real, allocatable :: Kia(:) 134 real, allocatable :: Tia(:) 135 real, allocatable :: Xia(:) 136 real, allocatable :: Tkj(:) 137 real, allocatable :: Kkj(:) 138 139 real, allocatable :: t1v1(:),t1v2(:) 140 real emp4, emp5 141 integer :: a, b, c, i, j, k 142 integer :: klo, khi, iter 143 double precision, allocatable :: timers(:) 144 double precision :: t0, t1, tsum, tmax, tmin, tavg 145 146! Run parameters 147 nocc = 256 148 nvir = 2048 149 maxiter = 50 150 nkpass = 1 151 ncor = 0 152 153 print *, "Run parameters:" 154 print *, "nocc =", nocc 155 print *, "nvir =", nvir 156 print *, "maxiter =", maxiter 157 print *, "nkpass =", nkpass 158 print *, "ncor =", ncor 159 print *, " " 160 161! Allocate and initialize arrays. 162 163 nbf = ncor + nocc + nvir 164 lnvv = nvir * nvir 165 lnov = nocc * nvir 166 kchunk = (nocc - 1)/nkpass + 1 167 168 allocate( f1n(1:nvir,1:nvir) ) 169 allocate( f2n(1:nvir,1:nvir) ) 170 allocate( eorb(1:nbf) ) 171 allocate( Jia(1:lnvv) ) 172 allocate( Kia(1:lnvv) ) 173 allocate( Tia(1:lnov*nocc) ) 174 allocate( Xia(1:lnov*nocc)) 175 allocate( Tkj(1:kchunk*lnvv) ) 176 allocate( Kkj(1:kchunk*lnvv) ) 177 allocate( t1v1(1:lnvv) ) 178 allocate( t1v2(1:lnvv) ) 179! 180 call init_array_1(eorb, nbf) 181 call init_array_1(Jia, lnvv) 182 call init_array_1(Kia, lnvv) 183 call init_array_1(Tia, lnov*nocc) 184 call init_array_1(Xia, lnov*nocc) 185 call init_array_1(Tkj, kchunk*lnvv) 186 call init_array_1(Kkj, kchunk*lnov) 187 call init_array_1(t1v1, lnvv) 188 call init_array_1(t1v2, lnvv) 189 190 call init_array_2(f1n, nvir, nvir) 191 call init_array_2(f2n, nvir, nvir) 192 193 print *, "End of initialization" 194 195 allocate (timers(1:maxiter)) 196 197 emp4=0.0 198 emp5=0.0 199 a=1 200 iter=1 201 202 !$omp target data & 203 map(to: f1n, f2n, eorb) & 204 map(to: Jia, Kia, Tia, Xia) & 205 map(to: Tkj, Kkj) & 206 map(to: t1v1, t1v2) 207 208 do klo = 1, nocc, kchunk 209 khi = MIN(nocc, klo+kchunk-1) 210 do j = 1, nocc 211 212#if defined(DO_UPDATE_ARRAYS) 213! Update elements of Tkj and KKj. 214 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 215 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 216 217 !$omp target update to (Tkj, Kkj) 218#endif 219 220 do i = 1, nocc 221 222#if defined(DO_UPDATE_ARRAYS) 223! Update elements of Jia, Kia, Tia, Xia arrays. 224 Jia(lnvv) = Jia(lnvv) + 1.0 225 Kia(lnvv) = Kia(lnvv) + 1.0 226 Tia(lnov) = Tia(lnov) + 1.0 227 Xia(lnov) = Xia(lnov) + 1.0 228 229 !$omp target update to (Jia, Kia, Tia, Xia) 230#endif 231 232 do k = klo, MIN(khi,i) 233 234#if defined(DO_UPDATE_ARRAYS) 235! Update elements of t1v1 array. 236 t1v1(:) = Tkj(lnvv-nvir+1:lnvv) 237 238 !$omp target update to (t1v1) 239#endif 240 241 t0 = omp_get_wtime() 242 243 call omp_fbody(f1n,f2n,eorb, & 244 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 245 Jia, Kia, Tia, Xia, Tkj, Kkj, & 246 t1v1,t1v2) 247 248 t1 = omp_get_wtime() 249 timers(iter) = (t1-t0) 250 if (iter .eq. maxiter) then 251 print *, "Stopping after ", iter, "iterations" 252 print *, " " 253 goto 150 254 endif 255 256! Prevent NAN for large maxiter... 257 if (emp4 > 1000.0) then 258 emp4 = emp4 - 1000.0 259 endif 260 if (emp4 < -1000.0) then 261 emp4 = emp4 + 1000.0 262 endif 263 if (emp5 > 1000.0) then 264 emp5 = emp5 - 1000.0 265 endif 266 if (emp5 < -1000.0) then 267 emp5 = emp5 + 1000.0 268 endif 269 270 iter = iter + 1 271 272 end do ! k = klo, MIN(khi,i) 273 end do ! do i = 1, nocc 274 end do ! do j = 1, nocc 275 end do ! do klo = 1, nocc, kchunk 276 277 150 CONTINUE 278 !$omp end target data 279 280 tsum = 0.0 281 tmax = -1.0e10 282 tmin = 1.0e10 283 do i = 2, iter 284 tsum = tsum + timers(i) 285 tmax = MAX(tmax,timers(i)) 286 tmin = MIN(tmin,timers(i)) 287 end do 288 289 tavg = tsum / (iter - 1) 290 print *, "TOTAL ITER: ", iter 291 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 292 110 format (A, F9.6, A, F9.6, A, F9.6, A) 293 294 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 295 120 format (A, F15.3, A, F15.3) 296 297 print *, "END" 298 299 deallocate (f1n) 300 deallocate (f2n) 301 deallocate (eorb) 302 deallocate (Jia) 303 deallocate (Kia) 304 deallocate (Tia) 305 deallocate (Xia) 306 deallocate (Tkj) 307 deallocate (Kkj) 308 309 deallocate (t1v1) 310 deallocate (t1v2) 311 deallocate (timers) 312 313 end program
版本 3
在程序的第三个版本中,我们考虑哪些数组只在主机上使用,哪些只在设备上使用,或者哪些既在主机上也在设备上使用。 我们还考虑数组值是否在程序执行过程中进行了更新。这些信息被用来决定在哪里分配数组,如何初始化它们,以及是否需要更新设备上的值。
数组 f1
和 f2
作为设备上的工作数组使用(用于存储对 SGEMM 的调用结果),并且它们不在主机上访问。所以我们直接在设备上分配它们。
由于 f1
和 f2
在设备上分配,我们调用 init_array_2()
在一个 OpenMP target
结构体中初始化数组。
其他 9 个数组在主机和设备上都被访问,所以我们在主机内存中分配它们,并调用 init_array_1()
初始化数组。然后使用 map
子句将数组映射到设备。
我们使用 OpenMP target update to
指令将 Tkj
, Kkj
, Jia
, Kia
, Tia
, Xia``和 ``t1v1
的更新值从主机拷贝到设备。
!$omp allocate allocator(omp_target_device_mem_alloc) allocate( f1n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_device_mem_alloc) allocate( f2n(1:nvir,1:nvir) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( eorb(1:nbf) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Jia(1:lnvv) ) !$omp allocate allocator(omp_target_host_mem_alloc) allocate( Kia(1:lnvv) ) ... !$omp target data & map(to: eorb) & map(to: Jia, Kia, Tia, Xia) & map(to: Tkj, Kkj) & map(to: t1v1, t1v2) ... Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 !$omp target update to (Tkj, Kkj)
以下是程序的完整版本 3 。
1 include "mkl_omp_offload.f90" 2 3 subroutine omp_fbody(f1n,f2n,eorb, & 4 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 5 Jia, Kia, Tia, Xia, Tkj, Kkj, & 6 t1v1,t1v2) 7 8 use omp_lib 9 use onemkl_blas_omp_offload_lp64 10 use iso_fortran_env 11 implicit none 12 13 real, intent(inout) :: emp4,emp5 14 integer, intent(in) :: ncor,nocc,nvir 15 integer, intent(in) :: a,i,j,k, klo 16 real, intent(inout) :: f1n(nvir,nvir) 17 real, intent(inout) :: f2n(nvir,nvir) 18 real, intent(in) :: eorb(*) 19 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*) 20 real, intent(in) :: Tkj(*), Kkj(*) 21 real, intent(in) :: t1v1(nvir),t1v2(nvir) 22 real :: emp4i,emp5i 23 real :: eaijk,denom 24 integer :: lnov,lnvv 25 integer :: b,c 26 real :: f1nbc,f1ncb,f2nbc,f2ncb 27 real :: t1v1b,t1v2b 28 29 lnov=nocc*nvir 30 lnvv=nvir*nvir 31 emp4i = 0.0 32 emp5i = 0.0 33 34 !$omp dispatch 35 call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir, & 36 Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir) 37 38 !$omp dispatch 39 call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir, & 40 Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir) 41 42 !$omp dispatch 43 call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir, & 44 Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir) 45 46 !$omp dispatch 47 call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir, & 48 Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir) 49 50 eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) ) 51 52 !$omp target teams distribute parallel do collapse(2) & 53 !$omp reduction(+:emp4i,emp5i) & 54 !$omp private(f1nbc,f1ncb,f2nbc,f2ncb) & 55 !$omp private(t1v1b,t1v2b) & 56 !$omp private(denom) firstprivate(eaijk,nvir,ncor,nocc) 57 do b=1,nvir 58 do c=1,nvir 59 denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) 60 61 f1nbc = f1n(b,c); 62 f1ncb = f1n(c,b); 63 f2nbc = f2n(b,c); 64 f2ncb = f2n(c,b); 65 t1v1b = t1v1(b); 66 t1v2b = t1v2(b); 67 68 emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb) 69 emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb) 70 enddo 71 enddo 72 !$omp end target teams distribute parallel do 73 74 emp4 = emp4 + emp4i 75 emp5 = emp5 + emp5i 76 77 end 78 79 80 subroutine init_array_1(arr, m) 81 implicit none 82 real, intent(inout) :: arr(m) 83 integer m, i 84 85 do i = 1, m 86 arr(i) = 1.0/(100.0 + i-1) 87 end do 88 end subroutine init_array_1 89 90 91 subroutine init_array_2(arr, m, n) 92 implicit none 93 real, intent(inout) :: arr(m, n) 94 integer m, n, i, j 95 96 !$omp target teams distribute parallel do 97 do i = 1, m 98 do j = 1, n 99 arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j) 100 end do 101 end do 102 end subroutine init_array_2 103 104 105 program main 106 107 use omp_lib 108 use iso_fortran_env 109 implicit none 110 111 interface 112 subroutine omp_fbody(f1n,f2n,eorb, & 113 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 114 Jia, Kia, Tia, Xia, Tkj, Kkj, & 115 t1v1,t1v2) 116 real, intent(inout) :: emp4,emp5 117 integer, intent(in) :: ncor,nocc,nvir 118 integer, intent(in) :: a,i,j,k, klo 119 real, intent(inout) :: f1n(nvir,nvir) 120 real, intent(inout) :: f2n(nvir,nvir) 121 real, intent(in) :: eorb(*) 122 real, intent(in) :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*) 123 real, intent(in) :: t1v1(nvir),t1v2(nvir) 124 end subroutine omp_fbody 125 end interface 126 127 integer :: ncor, nocc, nvir, maxiter, nkpass 128 integer :: nbf, lnvv, lnov, kchunk 129 real, allocatable :: eorb(:) 130 real, allocatable :: f1n(:,:) 131 real, allocatable :: f2n(:,:) 132 133 real, allocatable :: Jia(:) 134 real, allocatable :: Kia(:) 135 real, allocatable :: Tia(:) 136 real, allocatable :: Xia(:) 137 real, allocatable :: Tkj(:) 138 real, allocatable :: Kkj(:) 139 140 real, allocatable :: t1v1(:),t1v2(:) 141 real emp4, emp5 142 integer :: a, b, c, i, j, k 143 integer :: klo, khi, iter 144 double precision, allocatable :: timers(:) 145 double precision :: t0, t1, tsum, tmax, tmin, tavg 146 147! Run parameters 148 nocc = 256 149 nvir = 2048 150 maxiter = 50 151 nkpass = 1 152 ncor = 0 153 154 print *, "Run parameters:" 155 print *, "nocc =", nocc 156 print *, "nvir =", nvir 157 print *, "maxiter =", maxiter 158 print *, "nkpass =", nkpass 159 print *, "ncor =", ncor 160 print *, " " 161 162! Allocate and initialize arrays. 163 164 nbf = ncor + nocc + nvir 165 lnvv = nvir * nvir 166 lnov = nocc * nvir 167 kchunk = (nocc - 1)/nkpass + 1 168 169 !$omp allocate allocator(omp_target_device_mem_alloc) 170 allocate( f1n(1:nvir,1:nvir) ) 171 172 !$omp allocate allocator(omp_target_device_mem_alloc) 173 allocate( f2n(1:nvir,1:nvir) ) 174 175 !$omp allocate allocator(omp_target_host_mem_alloc) 176 allocate( eorb(1:nbf) ) 177 178 !$omp allocate allocator(omp_target_host_mem_alloc) 179 allocate( Jia(1:lnvv) ) 180 181 !$omp allocate allocator(omp_target_host_mem_alloc) 182 allocate( Kia(1:lnvv) ) 183 184 !$omp allocate allocator(omp_target_host_mem_alloc) 185 allocate( Tia(1:lnov*nocc) ) 186 187 !$omp allocate allocator(omp_target_host_mem_alloc) 188 allocate( Xia(1:lnov*nocc)) 189 190 !$omp allocate allocator(omp_target_host_mem_alloc) 191 allocate( Tkj(1:kchunk*lnvv) ) 192 193 !$omp allocate allocator(omp_target_host_mem_alloc) 194 allocate( Kkj(1:kchunk*lnvv) ) 195 196 !$omp allocate allocator(omp_target_host_mem_alloc) 197 allocate( t1v1(1:lnvv) ) 198 199 !$omp allocate allocator(omp_target_host_mem_alloc) 200 allocate( t1v2(1:lnvv) ) 201! 202 call init_array_1(eorb, nbf) 203 call init_array_1(Jia, lnvv) 204 call init_array_1(Kia, lnvv) 205 call init_array_1(Tia, lnov*nocc) 206 call init_array_1(Xia, lnov*nocc) 207 call init_array_1(Tkj, kchunk*lnvv) 208 call init_array_1(Kkj, kchunk*lnov) 209 call init_array_1(t1v1, lnvv) 210 call init_array_1(t1v2, lnvv) 211 212 call init_array_2(f1n, nvir, nvir) 213 call init_array_2(f2n, nvir, nvir) 214 215 print *, "End of initialization" 216 217 allocate (timers(1:maxiter)) 218 219 emp4=0.0 220 emp5=0.0 221 a=1 222 iter=1 223 224 !$omp target data & 225 map(to: eorb) & 226 map(to: Jia, Kia, Tia, Xia) & 227 map(to: Tkj, Kkj) & 228 map(to: t1v1, t1v2) 229 230 do klo = 1, nocc, kchunk 231 khi = MIN(nocc, klo+kchunk-1) 232 do j = 1, nocc 233 234#if defined(DO_UPDATE_ARRAYS) 235! Update elements of Tkj and KKj. 236 Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0 237 Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0 238 239 !$omp target update to (Tkj, Kkj) 240#endif 241 242 do i = 1, nocc 243 244#if defined(DO_UPDATE_ARRAYS) 245! Update elements of Jia, Kia, Tia, Xia arrays. 246 Jia(lnvv) = Jia(lnvv) + 1.0 247 Kia(lnvv) = Kia(lnvv) + 1.0 248 Tia(lnov) = Tia(lnov) + 1.0 249 Xia(lnov) = Xia(lnov) + 1.0 250 251 !$omp target update to (Jia, Kia, Tia, Xia) 252#endif 253 254 do k = klo, MIN(khi,i) 255 256#if defined(DO_UPDATE_ARRAYS) 257! Update elements of t1v1 array. 258 t1v1(:) = Tkj(lnvv-nvir+1:lnvv) 259 260 !$omp target update to (t1v1) 261#endif 262 263 t0 = omp_get_wtime() 264 265 call omp_fbody(f1n,f2n,eorb, & 266 ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, & 267 Jia, Kia, Tia, Xia, Tkj, Kkj, & 268 t1v1,t1v2) 269 270 t1 = omp_get_wtime() 271 timers(iter) = (t1-t0) 272 if (iter .eq. maxiter) then 273 print *, "Stopping after ", iter, "iterations" 274 print *, " " 275 goto 150 276 endif 277 278! Prevent NAN for large maxiter... 279 if (emp4 > 1000.0) then 280 emp4 = emp4 - 1000.0 281 endif 282 if (emp4 < -1000.0) then 283 emp4 = emp4 + 1000.0 284 endif 285 if (emp5 > 1000.0) then 286 emp5 = emp5 - 1000.0 287 endif 288 if (emp5 < -1000.0) then 289 emp5 = emp5 + 1000.0 290 endif 291 292 iter = iter + 1 293 294 end do ! k = klo, MIN(khi,i) 295 end do ! do i = 1, nocc 296 end do ! do j = 1, nocc 297 end do ! do klo = 1, nocc, kchunk 298 299 150 CONTINUE 300 !$omp end target data 301 302 tsum = 0.0 303 tmax = -1.0e10 304 tmin = 1.0e10 305 do i = 2, iter 306 tsum = tsum + timers(i) 307 tmax = MAX(tmax,timers(i)) 308 tmin = MIN(tmin,timers(i)) 309 end do 310 311 tavg = tsum / (iter - 1) 312 print *, "TOTAL ITER: ", iter 313 write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds" 314 110 format (A, F9.6, A, F9.6, A, F9.6, A) 315 316 write(*, 120) " emp4 = ", emp4, " emp5 =", emp5 317 120 format (A, F15.3, A, F15.3) 318 319 print *, "END" 320 321 deallocate (f1n) 322 deallocate (f2n) 323 deallocate (eorb) 324 deallocate (Jia) 325 deallocate (Kia) 326 deallocate (Tia) 327 deallocate (Xia) 328 deallocate (Tkj) 329 deallocate (Kkj) 330 331 deallocate (t1v1) 332 deallocate (t1v2) 333 deallocate (timers) 334 335 end program
性能比较
我们使用以下命令来编译、链接和运行各个版本。在下面的命令中,将源文件名替换为 TEST.f。
编译: ifx -O3 -fiopenmp -fopenmp-targets=spir64 -DMKL -qmkl -fpp -free -D DO_UPDATE_ARRAYS TEST.f -c -o TEST.o 链接: ifx -fiopenmp -fopenmp-targets=spir64 -qmkl -fsycl -L${MKLROOT}/lib/intel64 -liomp5 -lsycl -lOpenCL -lstdc++ -lpthread -lm -ldl -lmkl_sycl -D DO_UPDATE_ARRAYS TEST.o -o TEST.exe 运行: LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=copy OMP_TARGET_OFFLOAD=MANDATORY ZE_AFFINITY_MASK=0.0 LIBOMPTARGET_DEBUG=0 LIBOMPTARGET_PLUGIN=level0 TEST.exe
我们比较了在使用的特定 GPU (仅 1 堆栈)上运行各个版本的性能。
各个版本的平均迭代时间如下。
版本 1 (test-SharedMem.f: 0.115163 秒 版本 2 (test-Map-UpdateTo.f): 0.002012 秒 版本 3 (test-DeviceMem-Map-UpdateTo.f): 0.001978 秒