DoLoops performance in Fortran
What is the best performance that Fortran can give when computing do-loops over large matrices? The test-case below shows up to a four-fold performance increase when looping over (k,j,i) instead of (i,j,k).
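The reason is memory layout: Fortran stores arrays in column-major order, so the first index varies fastest in memory. The standalone sketch below (the names looporder, a and M are illustrative, not part of the test-case code) contrasts the two orderings; only the loop nesting differs.

 program looporder
     implicit none
     integer, parameter :: M = 400
     integer, allocatable :: a(:,:,:)
     integer :: i, j, k
     allocate(a(M,M,M))
     a = 0
     !Slow: the innermost loop runs over k, the last index, so
     !consecutive iterations are M*M elements apart in memory.
     do i = 1, M
         do j = 1, M
             do k = 1, M
                 a(i,j,k) = a(i,j,k) + 1
             end do
         end do
     end do
     !Fast: the innermost loop runs over i, the first index, so
     !consecutive iterations touch adjacent memory locations.
     do k = 1, M
         do j = 1, M
             do i = 1, M
                 a(i,j,k) = a(i,j,k) + 1
             end do
         end do
     end do
 end program looporder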
Test-case
Hardware
- Intel Core i7-870
- 8 GB RAM
Code
program DoloopsOpenmp

    use moduleDoloopsOpenmp, only: makeloop

    implicit none

    integer, dimension(:,:,:), pointer :: mycube
    integer :: M = 1
    real, external :: elapsedtime
    real :: time = 0.0

    do while (M < 1000)

        write(*,*) 'Insert the cube size M (or insert 1000 to exit): '
        read(*,*) M
        if (M > 999) exit

        allocate(mycube(1:M,1:M,1:M))
        !Initialize so makeloop does not read undefined values
        mycube = 0

        !Tic(): take the current clock reading
        time = elapsedtime(0.0)
        call makeloop(mycube)
        !Toc(): seconds elapsed since the Tic() call
        time = elapsedtime(time)

        write(*,10) time
        write(*,*)

        deallocate(mycube)
        nullify(mycube)

    end do

10  format('Time elapsed: ',F6.2)

end program DoloopsOpenmp

!This function returns the current clock reading, in seconds,
!minus the value passed in lasttime
real function elapsedtime(lasttime)

    implicit none

    real, intent(in) :: lasttime
    integer :: count, count_rate

    call system_clock(count, count_rate)
    elapsedtime = count * 1.0 / count_rate - lasttime

end function elapsedtime
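The elapsedtime helper relies on system_clock, whose default-integer count can wrap around on long runs. When timing OpenMP code, omp_get_wtime() from omp_lib is a convenient wall-clock alternative; a minimal sketch of the same Tic/Toc pattern, assuming use omp_lib is in scope (t0 and t1 are hypothetical names):

 double precision :: t0, t1
 t0 = omp_get_wtime()          !Tic: wall-clock time in seconds
 call makeloop(mycube)
 t1 = omp_get_wtime()          !Toc
 write(*,*) 'Time elapsed: ', t1 - t0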
module moduleDoloopsOpenmp

    use omp_lib

    implicit none

    private

    public :: makeloop

contains

    subroutine makeloop(cubicmatrix)

        !Arguments --------------
        integer, dimension(:,:,:), pointer :: cubicmatrix

        !Local variables --------
        integer :: i, j, k, lb, ub
        !$ integer :: CHUNK, NTHREADS

        !The cube is M x M x M, so the bounds of dimension 3 serve all three loops
        lb = lbound(cubicmatrix, 3)
        ub = ubound(cubicmatrix, 3)

        !The k,j,i nesting keeps the first index, i, in the innermost loop,
        !so each thread sweeps contiguous (column-major) memory.
        !CHUNK and NTHREADS are private so each thread computes its own
        !copy (the value is the same on every thread).
        !$OMP PARALLEL PRIVATE(i, j, k, CHUNK, NTHREADS)
        !$ NTHREADS = OMP_GET_NUM_THREADS()
        !Fixed alternative: a small chunk of 10 iterations
        !!$ CHUNK = 10
        !One chunk per thread, e.g. (599 / 8) + 1 = 75 for M = 600 and 8 threads
        !$ CHUNK = max( (ub - lb) / NTHREADS + 1, 1 )
        !$OMP DO SCHEDULE(DYNAMIC, CHUNK)
        do k = lb, ub
            do j = lb, ub
                do i = lb, ub
                    cubicmatrix(i,j,k) = cubicmatrix(i,j,k) + 1
                end do
            end do
        end do
        !$OMP END DO
        !$OMP END PARALLEL

    end subroutine makeloop

end module moduleDoloopsOpenmp
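Note that moduleDoloopsOpenmp must be compiled before program DoloopsOpenmp, so in a single source file the module should appear first. The !$OMP directives and !$ conditional lines are only active when the compiler's OpenMP support is switched on; otherwise they are treated as plain comments and the loop runs serially. For example, assuming the source is saved as doloops.f90 (a hypothetical file name), with gfortran:

 gfortran -fopenmp doloops.f90 -o doloops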
Results
DO (i,j,k) / NO CHUNK
Table A.1 - Debug, do(i,j,k)
   Size   Time (s)
    100       0.04
    200       0.37
    300       1.58
    400       7.60
    500      19.66
    600      41.65

Table A.2 - Debug, openmp without !$OMP PARALLEL directives, do(i,j,k)
   Size   Time (s)
    100       0.04
    200       0.37
    300       1.58
    400       7.27
    500      19.34
    600      41.34

Table A.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(i,j,k)
   Size   Time (s)
    100       0.02
    200       0.19
    300       0.70
    400       1.86
    500       4.05
    600       7.83
DO (k,j,i) / NO CHUNK
Table B.1 - Debug, do(k,j,i)
   Size   Time (s)
    100       0.04
    200       0.31
    300       1.22
    400       3.36
    500       7.55
    600      14.88

Table B.2 - Debug, openmp without !$OMP PARALLEL directives, do(k,j,i)
   Size   Time (s)
    100       0.04
    200       0.31
    300       1.21
    400       3.36
    500       7.82
    600      15.07

Table B.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(k,j,i)
   Size   Time (s)
    100       0.02
    200       0.09
    300       0.36
    400       0.94
    500       2.04
    600       3.89
DO (k,j,i) / STATIC CHUNK = (UBOUND - LBOUND) / NTHREADS + 1
Table C.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(k,j,i)
   Size   Time (s)
    100       0.02
    200       0.15
    300       0.42
    400       1.03
    500       2.12
    600       3.97
DO (k,j,i) / STATIC CHUNK = 10
Table D.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(k,j,i)
   Size   Time (s)
    100       0.02
    200       0.16
    300       0.43
    400       1.04
    500       2.18
    600       4.05
DO (k,j,i) / DYNAMIC CHUNK = 10
Table E.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(k,j,i)
   Size   Time (s)
    100       0.01
    200       0.10
    300       0.36
    400       0.93
    500       2.01
    600       3.89
DO (k,j,i) / DYNAMIC CHUNK = (UBOUND - LBOUND) / NTHREADS + 1
Table F.3 - Debug, openmp with one !$OMP PARALLEL DO directive, do(k,j,i)
   Size   Time (s)
    100       0.02
    200       0.09
    300       0.39
    400       1.04
    500       2.14
    600       4.00
Conclusions
- do(k,j,i) is 2 to 4 times faster than do(i,j,k): with k outermost and i innermost, the innermost loop runs over the first array index, which is contiguous in Fortran's column-major storage.
- Small dynamic chunks, or no explicit chunk at all, yield about 10% better performance than large dynamic chunks; omitting the chunk size is probably the best default.
- More test-cases representing different do-loop scenarios may favour different CHUNK/scheduling choices; the runtime-scheduling variant sketched below makes such experiments cheap.
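Since the best chunk/schedule combination varies with the loop, one way to experiment without recompiling is SCHEDULE(RUNTIME), which reads the schedule from the OMP_SCHEDULE environment variable (e.g. OMP_SCHEDULE="dynamic,10" or OMP_SCHEDULE="static"). A minimal variant of the makeloop worksharing construct, using the same variables as above:

 !$OMP PARALLEL PRIVATE(i, j, k)
 !$OMP DO SCHEDULE(RUNTIME)   !schedule chosen via OMP_SCHEDULE at run time
 do k = lb, ub
     do j = lb, ub
         do i = lb, ub
             cubicmatrix(i,j,k) = cubicmatrix(i,j,k) + 1
         end do
     end do
 end do
 !$OMP END DO
 !$OMP END PARALLEL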