Sample Code & Cluster User's Guide
Below is the code for a sample MPI program.
Sample MPI Program
3-Dimensional Array Smoothing.
program stress_test_mpi
!======================================================================
! Simple, contrived MPI test program intended to test/stress
! distributed-memory systems and show how they perform.
!
! This test repeatedly "smooths" or "filters" a 3-d (gridded) array,
! starting either from a random or deterministic data array.
!
! This makes a good system "stress test", since the arrays can be as
! large or small as required, and the job can be run for as long or
! as short as required (by varying the number of filtering passes).
! The 7-point filter around the centre point of a 3-d lattice
! arises quite often in numerical work, and poses a performance
! challenge for scalar processors!
!
! The program reads 11 input values, from 7 lines of standard input,
! or redirected from a file. See "program parameters" below.
!
!======================================================================
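!======================================================================
! For reference only: a typical way to build and run this program
! (the exact commands, file names and process count are just an
! example, and depend on your MPI installation and batch system)
! might be:
!
!     mpif90 -o stress_test_mpi stress_test_mpi.f
!     mpirun -np 8 ./stress_test_mpi < stresstest.in
!
! The number of processes given to mpirun must equal the product
! nprocx*nprocy*nprocz read from the last line of the input.
!======================================================================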
!
implicit none
!
!======================================================================
! Considerations for MPI version:
! "Global" indices have subscript _glob
! "Local" indices have subscript _loc
!
! Each "local" array arr_in (over local domain) is extended to
! contain an extra "layer", 1-point wide, right around the domain.
! This extra layer, or "shadow zone", or "halo", is used to hold
! neighbouring array values in the global (physical) domain, but
! which are really part of the local domain of a neighbouring MPI
! process. Subroutines "shado_xchange_*" are used to fill this
! "shadow zone" by exchanging boundary data between MPI processes.
!
! The extended domain idea also helps with the numerics of filtering
! since we can treat the physical (global) boundaries as if there
! were another point beyond them.
!
! Just remember that the (global) array arr_in(nx,ny,nz) in the
! sequential case becomes a set of (local) arrays
! arr_in(nx_loc,ny_loc,nz_loc)
! in the domain decomposition, which in turn become the "extended"
! "local" arrays
! arx_in(nx_loc+2,ny_loc+2,nz_loc+2)
!
! The relationship to keep in mind between these is:
! arr_in(i,j,k) == arx_in(i+1,j+1,k+1)
!
!======================================================================
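!======================================================================
! A concrete illustration of the mapping above (the grid sizes and
! decomposition here are only an example): with the suggested global
! grid nx_glob=ny_glob=400, nz_glob=800 and a 2 x 2 x 2 decomposition,
! each process holds a 200 x 200 x 400 physical block, stored in a
! 202 x 202 x 402 extended array arx_in. The local physical point
! (i_loc,j_loc,k_loc) sits at arx_in(i_loc+1,j_loc+1,k_loc+1) and
! corresponds to the global point
!     i_glob = i_loc + nx_loc*myrankx    (and similarly for j and k),
! while arx_in(1,:,:), arx_in(nx_loc+2,:,:), etc. hold halo copies of
! the neighbouring processes' boundary data.
!======================================================================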
!
integer nx_glob, ny_glob, nz_glob,
& nx_loc, ny_loc, nz_loc,
& i_glob, j_glob, k_glob, i_loc, j_loc, k_loc, i,j,k
integer iseed, icount, maxcount
integer iofreq, irkz, indx0, itag1
integer ierror,myrank,nprocs ! MPI-related variables
integer nprocx, nprocy, nprocz ! domain decomposition!
integer myrankx, myranky, myrankz ! relative rank in each direction
integer iloc, jloc, kloc, iparam(9), myseed
real rparam(2)
include 'mpif.h'
integer istatus(MPI_STATUS_SIZE)
!
real xran1, twopi,xdist,ydist,zdist
! real, allocatable :: arr_in(:,:,:), arr_out(:,:,:) ! Physical domains
real, allocatable :: arx_in(:,:,:), arx_out(:,:,:) ! Extended domains
real, allocatable :: ardx(:)
integer, allocatable :: indx(:)
real valmax, wght, rwght3,rwght4,rwght5,rwght6
!
!
!=====================================================
! For an MPI program, initialize MPI first!
!=====================================================
call MPI_Init(ierror)
if(ierror.ne.MPI_SUCCESS) then
print *,'Error in MPI_INIT, ierror=',ierror
go to 999
endif
!
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierror)
if(ierror.ne.MPI_SUCCESS) then
print *,'Error in MPI_COMM_SIZE, ierror=',ierror
go to 999
endif
!
call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ierror)
if(ierror.ne.MPI_SUCCESS) then
print *,'Error from MPI_COMM_RANK, ierror=',ierror
go to 999
endif
!
! write(9+myrank,*) 'Myrank ',myrank,' initialized.'
!
!===================================================================
! Some program parameters:
! (Only one process reads these, then broadcasts to the others)
!===================================================================
if(myrank.eq.0) then
! read(*,*) maxcount ! Max. number of filter iterations (problem length)
read(*,*) iparam(1) ! Max. number of filter iterations (problem length)
! read(*,*) nx_glob,ny_glob,nz_glob ! global grid values
read(*,*) iparam(2), iparam(3), iparam(4)
! read(*,*) iofreq ! No. of iterations between writing output
read(*,*) iparam(5) ! No. of iterations between writing output
! read(*,*) wght ! Filter weights (coefficients)
read(*,*) rparam(1) ! Filter weights (coefficients)
! read(*,*) valmax ! Max. value of 3-d field (for scaling only)
read(*,*) rparam(2) ! Max. value of 3-d field (for scaling only)
! read(*,*) iseed ! Zero for deterministic array; otherwise random seed
read(*,*) iparam(6) ! Zero for deterministic array; otherwise random seed
read(*,*) iparam(7), iparam(8), iparam(9) ! domain decomposition
!
! Some suggested values:
! 100 ! maxcount, number of filter iterations
! 400, 400, 800 ! global grid values, to use 1 GB memory altogether.
! 10 ! iofreq, i.e., output every 10 iterations
! 0.05 ! wght, for relatively "light" filtering
! 100. ! valmax, just for order-1 output values
! 0 ! iseed, zero for deterministic array, best for checking correctness
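!  (The MPI version needs one further input line giving the domain
!   decomposition; the values below are only an example, and their
!   product must equal the number of MPI processes started:)
!   2, 2, 2    ! nprocx, nprocy, nprocz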
!
! --- Start of MPI-specific section ----
!
! Check that domain decomp is consistent with nprocs:
if(nprocs .ne. iparam(7)*iparam(8)*iparam(9)) then
print *,'Error: domain decomposition inconsistent with total
& process count. Must quit here.'
go to 999
endif
endif
!====================================================================
! Distribute these basic control parameters to every process...
!====================================================================
call MPI_Bcast(iparam(1),9,MPI_INTEGER,0,MPI_COMM_WORLD,ierror)
call MPI_Bcast(rparam(1),2,MPI_REAL ,0,MPI_COMM_WORLD,ierror)
if(ierror.ne.MPI_SUCCESS) then
print *,'Error from MPI_Bcast, ierror=',ierror
go to 999
endif
! Unpack:
maxcount = iparam(1)
nx_glob = iparam(2)
ny_glob = iparam(3)
nz_glob = iparam(4)
iofreq = iparam(5)
iseed = iparam(6)
nprocx = iparam(7)
nprocy = iparam(8)
nprocz = iparam(9)
wght = rparam(1)
valmax = rparam(2)
!
! Now we need to get nx_loc, etc. from nx_glob, etc.
nx_loc = nx_glob/nprocx
ny_loc = ny_glob/nprocy
nz_loc = nz_glob/nprocz
!
!
! Check that nprocx, etc. divide evenly into nx_glob, etc.:
if( nx_glob.ne.nx_loc*nprocx .or.
& ny_glob.ne.ny_loc*nprocy .or.
& nz_glob.ne.nz_loc*nprocz ) then
print *,'Error: nprocx, nprocy or nprocz do not divide evenly
& into nx, ny or nz. Must quit!'
go to 999
endif
!
! Now the subtle part of figuring myrankx, myranky, myrankz.
! These are the process rankings in each of the 3 dimensions.
! (ranks (x,y,z) are stored in usual "fortran" (i,j,k) order):
myrankx = mod(myrank,nprocx)
myranky = mod(myrank,(nprocx*nprocy))/nprocx
myrankz = myrank/(nprocx*nprocy)
write(9+myrank,*) 'Myrank, and myrank in x,y,z dimensions are:'
& , myrank, myrankx,myranky, myrankz
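!
! A worked example of the rank arithmetic above (the decomposition is
! hypothetical): with (nprocx,nprocy,nprocz) = (2,2,2), process
! myrank=5 gets
!     myrankx = mod(5,2)   = 1
!     myranky = mod(5,4)/2 = 0
!     myrankz = 5/4        = 1
! i.e. position (1,0,1) in the process grid; conversely
! 1*(2*2) + 0*2 + 1 = 5 recovers the global rank.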
!
! --- End of MPI-specific section ----
!
!
rwght3 = 1.0 - 3.0*wght !Weight factors derived from fundamental weight
rwght4 = 1.0 - 4.0*wght
rwght5 = 1.0 - 5.0*wght
rwght6 = 1.0 - 6.0*wght
!
!
! The beauty of MPI is that we need only allocate memory for local vars:
! allocate (arr_in(nx_loc,ny_loc,nz_loc))
allocate (arx_in(nx_loc+2,ny_loc+2,nz_loc+2)) ! extended domains for MPI
! allocate (arr_out(nx_loc,ny_loc,nz_loc))
allocate (arx_out(nx_loc+2,ny_loc+2,nz_loc+2)) ! extended MPI domain
allocate (indx(0:nprocz-1))
allocate (ardx(nz_loc))
!
if(myrank.eq.0) then
print *, 'Maxcount: ',maxcount
print *, 'nx_glob,ny_glob,nz_glob: ',nx_glob,ny_glob,nz_glob
print *, 'Output interval: ',iofreq
print *, 'Weight (wght): ',wght
print *, 'Value range: ',valmax
print *, 'Random seed: ',iseed
if(iseed.eq.0) then
print *, 'Using deterministic initialization'
else
print *, 'Using random initialization'
endif
print *, ' '
endif
!
!
!=====================================================
! Initialize the main data array,
! either deterministically, or with random numbers:
!=====================================================
if(iseed.eq.0) then
twopi = 8.0*atan(1.0)
! do k=1,nz_glob
do k_loc=1,nz_loc
k = k_loc + nz_loc*myrankz ! k is index in global domain
zdist=twopi*float(k-1)/float(nz_glob-1)
! do j=1,ny
do j_loc=1,ny_loc
j = j_loc + ny_loc*myranky ! j is index in global domain
ydist=twopi*float(j-1)/float(ny_glob-1)
! do i=1,nx
do i_loc =1,nx_loc
i = i_loc + nx_loc*myrankx ! i is index in global domain
xdist=twopi*float(i-1)/float(nx_glob-1)
! arr_in(i,j,k) = valmax*Cos(7.*xdist)*cos(13.*ydist)*
! & cos(11.*zdist)
! arr_in(i_loc,j_loc,k_loc)=valmax*Cos(7.*xdist)*cos(13.*ydist)*
! & cos(11.*zdist)
! Fill in extended array instead:
arx_in(i_loc+1,j_loc+1,k_loc+1)=valmax*Cos(7.*xdist)*
& cos(13.*ydist)*cos(11.*zdist)
enddo
enddo
enddo
!
else
!
! Use a local copy of the seed: xran1 may modify its argument,
! so the expression iseed+myrank should not be passed directly.
myseed = iseed + myrank
! do k=1,nz
do k=2,nz_loc+1
! do j=1,ny
do j=2,ny_loc+1
! do i=1,nx
do i=2,nx_loc+1
! arr_in(i,j,k) = valmax*(-1. + 2.0*xran1(iseed))
! arr_in(i,j,k) = valmax*(-1. + 2.0*xran1(iseed+myrank))
! Fill in extended array instead:
arx_in(i,j,k) = valmax*(-1. + 2.0*xran1(myseed))
enddo
enddo
enddo
endif
!
if(myrank.eq.0) then ! Only process 0 need write this
if(iseed.eq.0) then
write(*,*) 'Initial (deterministic) selected values are:'
else
write(*,*) 'Initial (random) selected values are:'
endif
endif
!
! This next "write" should use same data in MPI version as in original.
! So, need to collect it, assuming global (i,j)=(2,2) are on 1st proc...
if(myrankx+myranky.eq.0) then
do irkz=0,nprocz-1
indx(irkz) = 0 ! indx(irkz) is index of points to be sent
enddo
!
do k_glob=2,nz_glob-1,nz_glob/7 ! all pts as in sequential case
irkz = (k_glob - 1)/nz_loc ! z-rank this point lives in
indx(irkz) = indx(irkz) + 1 ! One more point for this proc.
if(myrankz.eq.irkz) then
k_loc = 1 + mod(k_glob-1,nz_loc) ! Local z-index
! First try: ardx(indx(irkz)) = arr_in(2,2,k_loc)
ardx(indx(irkz)) = arx_in(3,3,k_loc+1) ! Use extended array
endif
enddo
if(myrankz.eq.0) indx0 = indx(0)+1
!
!
! So we've run through all the global z-points, identified each with a
! process rank, accumulated as necessary, and collected the real values too.
! Now to send/receive appropriately...
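!
! A concrete illustration of the bookkeeping above (the grid and
! decomposition are only an example): with nz_glob=800 and nprocz=4
! (so nz_loc=200), the sampled global points are
!     k_glob = 2, 116, 230, 344, 458, 572, 686
! which live on z-ranks 0,0,1,1,2,2,3 respectively, so indx(0:3)
! comes out as 2,2,2,1 and rank 0 ends up with 7 values in ardx.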
!
itag1=888
do irkz=1,nprocz-1 ! N.B. Start irkz at rank 1
if(indx(irkz).ge.1) then ! Something to be sent and recvd
if(myrankz.eq.irkz) call MPI_SEND(ardx(1),indx(irkz),
& MPI_REAL, 0,itag1,MPI_COMM_WORLD,ierror)
if(myrankz.eq.0) then ! rank 0 only receives here
! Note that we must receive from a "global" rank, not irkz:
call MPI_RECV(ardx(indx0),indx(irkz),MPI_REAL,
& irkz*nprocx*nprocy,itag1,MPI_COMM_WORLD,istatus,ierror)
indx0 = indx0 + indx(irkz)
endif
endif
enddo
endif ! (myrankx and myranky being both 0...)
!
!
! Next section for master process only:
if(myrank.eq.0) then
!seq write(*,100) (arr_in(2,2,k),k=2,nz-1,nz/7)
write(*,100) (ardx(k),k=1,indx0-1)
!
open (unit=12,file='stresstest.dat',form='formatted')
write(12,*) 'The Parameters of this run are:'
write(12,*) 'Maxcount: ',maxcount
write(12,*) 'nx,ny,nz (glob): ',nx_glob,ny_glob,nz_glob
write(12,*) 'nx,ny,nz (local):',nx_loc,ny_loc,nz_loc
write(12,*) 'Output interval: ',iofreq
write(12,*) 'Weight (wght): ',wght
write(12,*) 'Value range: ',valmax
write(12,*) 'Random seed: ',iseed
if(iseed.eq.0) then
write(12,*) 'Using deterministic initialization'
else
write(12,*) 'Using random initialization'
endif
write(12,*) ' '
write(12,*) 'Initial random values are (i,j,k=1,10):'
! In most cases these points will be on proc 0. If not, more work is needed
if(nx_loc.ge.10 .and. ny_loc.ge.10 .and. nz_loc.ge.10) then
! write (12,100) (arr_in(10,10,k),k=1,10)
write (12,100) (arx_in(11,11,k),k=2,11) ! Use extended array instead
else
write (12,*) 'Need to collect diagnostic points from remote
& processes... not yet implemented'
endif
write (12,*) ' '
endif
!
!
!
!=======================================================
! Start of outermost loop over smoothing iterations:
!=======================================================
!
do icount=0,maxcount
!
! Fill edges of "extended" array arx_in with data from neighbour procs:
call shado_xch_EW(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
call shado_xch_NS(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
call shado_xch_TB(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
!
!======================================================================
! Main body of (3-d) data:
!
! Use of the "extended" array arx_in with its "shadow zone" (needed for
! the MPI case) allows us to radically simplify the smoothing code
! over the entire global domain, since all physical surfaces,
! edges and corners are now "inside" the numerical ones!
!
! No need any more for separate loops for separate domain sections.
!======================================================================
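!
! In equation form, each interior point is updated as
!     arx_out(i,j,k) = (1 - 6*wght)*arx_in(i,j,k)
!                    + wght*( sum of its 6 nearest neighbours )
! where 1-6*wght is precomputed as rwght6 above. The loop nest below
! also carries an OpenMP directive, so the code can run as a hybrid
! MPI/OpenMP program when compiled with OpenMP support; otherwise the
! directive is simply ignored as a comment.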
!
!
!$OMP PARALLEL DO PRIVATE(i,j,k)
! do k=2,nz-1
do k=2,nz_loc+1
! do j=2,ny-1
do j=2,ny_loc+1
! do i=2,nx-1
do i=2,nx_loc+1
arx_out(i,j,k) = rwght6*arx_in(i,j,k) + wght*(
& arx_in(i-1,j,k) + arx_in(i+1,j,k) +
& arx_in(i,j-1,k) + arx_in(i,j+1,k) +
& arx_in(i,j,k-1) + arx_in(i,j,k+1) )
enddo
enddo
enddo
!$OMP END PARALLEL DO
!
!
! Do a 2nd smoothing pass to update arx_in for next iteration:
! First, need to fill in "edges" of arx_out:
!
call shado_xch_EW(arx_out,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
call shado_xch_NS(arx_out,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
call shado_xch_TB(arx_out,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
!
!
! Main body of (3-d) data, for 2nd pass:
!$OMP PARALLEL DO PRIVATE(i,j,k)
! do k=2,nz-1
do k=2,nz_loc+1
! do j=2,ny-1
do j=2,ny_loc+1
! do i=2,nx-1
do i=2,nx_loc+1
arx_in(i,j,k) = rwght6*arx_out(i,j,k) + wght*(
& arx_out(i-1,j,k) + arx_out(i+1,j,k) +
& arx_out(i,j-1,k) + arx_out(i,j+1,k) +
& arx_out(i,j,k-1) + arx_out(i,j,k+1) )
enddo
enddo
enddo
!$OMP END PARALLEL DO
!
!
!
!
! This next "write" should use same data in MPI version as in original.
! So, need to collect it, assuming (i,j)=(2,2) are on 1st proc...
if(myrank.eq.0 .and. icount.eq.0)
& write(*,*) 'Successively smoothed values are:'
!
! Only do diagnostic output at specified intervals:
if(icount.lt.10 .or. mod(icount,iofreq).eq.0) then
if(myrankx.eq.0 .and. myranky.eq.0) then
do irkz=0,nprocz-1
indx(irkz) = 0 ! indx(irkz) is index of points to be sent
enddo
!
do k_glob=2,nz_glob-1,nz_glob/7 ! all pts as in sequential case
irkz = (k_glob - 1)/nz_loc ! z-rank this point lives in
indx(irkz) = indx(irkz) + 1 ! One more point for this proc.
if(myrankz.eq.irkz) then
k_loc = 1 + mod(k_glob-1,nz_loc) ! Local z-index
! First try: ardx(indx(irkz)) = arr_in(2,2,k_loc)
ardx(indx(irkz)) = arx_in(3,3,k_loc+1) ! Use extended array
endif
enddo
if(myrankz.eq.0) indx0 = indx(0) + 1
!
! So we've run through all the global z-points, identified each with a
! process rank, accumulated as necessary, and collected the real values too
! Now to send/receive appropriately...
!
itag1=itag1+1
do irkz=1,nprocz-1 ! N.B. Start irkz at rank 1
if(indx(irkz).ge.1) then
if(myrankz.eq.irkz) call MPI_SEND(ardx(1),indx(irkz),
& MPI_REAL, 0,itag1,MPI_COMM_WORLD,ierror)
if(myrankz.eq.0) then ! rank 0 only receives here
! Note that we must receive from a "global" rank, not irkz:
call MPI_RECV(ardx(indx0),indx(irkz),MPI_REAL,
& irkz*nprocx*nprocy,itag1,MPI_COMM_WORLD,istatus,ierror)
indx0 = indx0 + indx(irkz)
endif
endif
enddo
endif ! (myrankx and myranky being both 0...)
!
if(myrank.eq.0)
! & write(*,100) (arr_in(2,2,k),k=2,nz-1,nz/7)
& write(*,100) (ardx(k),k=1,indx0-1)
endif ! (iteration for diagnostic output)
!
!
!
enddo ! (end of outer-most "iteration" loop over icount)
!
!================================================================
!
!
if(myrank.eq.0) then
write(12,*) 'Final (smoothed?) values are (i,j,k=1,10):'
! In most cases these points will be on proc 0. If not, more work is needed
if(nx_loc.ge.10 .and. ny_loc.ge.10 .and. nz_loc.ge.10) then
write (12,100) (arx_in(11,11,k),k=2,11)
else
write (12,*) 'Need to collect diagnostic points from remote
& processes... not yet implemented'
endif
endif
!
call MPI_FINALIZE(ierror)
!
stop 'Normal end, max smoothing iterations completed.'
999 stop 'Program ended prematurely due to fatal error'
100 format(6e12.4)
end
function xran1(idum)
c--------------------------------------------------------------------
c Routine from Numerical Recipes to return a uniform random
c deviate between 0.0 and 1.0. Set idum to any negative number
c to initialize or reinitialize the sequence.
c--------------------------------------------------------------------
parameter (nn=97)
parameter (m1=259200, ia1=7141, ic1=54773, rm1=1./m1)
parameter (m2=134456, ia2=8121, ic2=28411, rm2=1./m2)
parameter (m3=243000, ia3=4561, ic3=51349)
real xran1, r(nn)
save r, ix1,ix2,ix3, iff
c
data iff /0/
if (idum.lt.0 .or. iff.eq.0) then
iff = 1
ix1 = mod(ic1-idum,m1) ! seed the first routine
ix1 = mod(ia1*ix1+ic1,m1)
ix2 = mod(ix1,m2) ! and use it to seed the second
ix1 = mod(ia1*ix1+ic1,m1)
ix3 = mod(ix1,m3) ! and to seed the third
c
do 11,j=1,nn ! fill the table with sequential
ix1 = mod(ia1*ix1+ic1,m1) ! random deviates generated by the
ix2 = mod(ia2*ix2+ic2,m2) ! first two routines
r(j) = (float(ix1)+float(ix2)*rm2)*rm1
11 continue
c
idum = 1
endif
c
ix1 = mod(ia1*ix1+ic1,m1)
ix2 = mod(ia2*ix2+ic2,m2)
ix3 = mod(ia3*ix3+ic3,m3)
c
j = 1+(nn*ix3)/m3 ! use 3rd sequence for no. between 1 and 97
c
if(j.gt.nn.or.j.lt.1) stop 'xran1: table index out of range'
xran1 = r(j) ! return that table entry
r(j) = (float(ix1)+float(ix2)*rm2)*rm1 ! and refill it
return
end
subroutine shado_xch_EW(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
!====================================================================
! Routine to do east-west shadow-zone exchange.
!
! If this process has a neighbour to east or west, a shadow-zone
! swap will occur.
!
! The eastmost surface of arr_in (i.e., the physical local domain)
! is sent to the westmost surface of arx_in (i.e., the extended local
! domain) of the process to the east - if there is one.
! The westmost surface of arr_in (i.e., the physical local domain)
! is sent to the eastmost surface of arx_in (i.e., the extended
! local domain) of the process to the west - if there is one.
!
! Similarly, the westmost surface of arr_in (i.e., the physical local
! domain) from any process to the east is received into the eastmost
! surface of arx_in (i.e., the extended local domain).
! The eastmost surface of arr_in (i.e., the physical local domain)
! from any process to the west is received into the westmost surface
! of arx_in (i.e., the extended local domain).
!
! To help avoid contention, let even processes send eastwards first,
! while odd processes send westwards first. That way, all processes
! should have data ready to receive when MPI_Recv is called.
!
! If at eastern or western boundaries of the global domain, just
! copy the edge of arr_in into the edge of arx_in.
!====================================================================
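!
! In terms of the local i-index of arx_in, the layout is:
!     arx_in(1,:,:)           halo, filled from the western neighbour
!                             (or copied from i=2 at the global boundary)
!     arx_in(2:nx_loc+1,:,:)  this process's own physical data
!     arx_in(nx_loc+2,:,:)    halo, filled from the eastern neighbour
!                             (or copied from i=nx_loc+1 at the boundary)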
!
implicit none
integer myrankx,myranky,myrankz,nprocx,nprocy,nprocz
integer nx_loc, ny_loc, nz_loc, j, k
real arx_in(nx_loc+2,ny_loc+2,nz_loc+2)
real shado_E(ny_loc,nz_loc) ! For data moving eastwards
real shado_W(ny_loc,nz_loc) ! " " " westwards
logical least,lwest
!
include 'mpif.h'
integer istatus(MPI_STATUS_SIZE),ierror,ideste,idestw
integer, parameter :: itage=123, itagw=321
!
!
!
! First, western global boundary treatment:
!
if(myrankx.eq.0) then ! Am at the western boundary already
do k=2,nz_loc+1 ! Just copy west sfc of arr_in to arx_in
do j=2,ny_loc+1
arx_in(1,j,k) = arx_in(2,j,k) ! simplifies filter
enddo
enddo
endif
!
!
! Next, eastern global boundary treatment:
!
if(myrankx.eq.nprocx-1) then ! Am at the eastern boundary already
do k=2,nz_loc+1 ! Just copy east sfc of arr_in to arx_in
do j=2,ny_loc+1
arx_in(nx_loc+2,j,k) = arx_in(nx_loc+1,j,k) ! simplifies filter
enddo
enddo
endif
!
!
! For max. efficiency, let even procs send east first, then west, while
! odd procs send west first, then east.
!
least = .false.
lwest = .false.
if( mod(myrankx,2).eq.1 ) go to 102
!
! Eastern local boundary treatment:
!
101 continue
if(myrankx.lt.nprocx-1) then ! need to do a shadow-exchange
do k=2,nz_loc+1 ! Send to east, receive from east
do j=2,ny_loc+1
shado_E(j-1,k-1) = arx_in(nx_loc+1,j,k)
enddo
enddo
!
! Send/recv to/from process "ideste", with same myranky,z and myrankx+1:
ideste= myrankz*nprocx*nprocy + myranky*nprocx + myrankx + 1
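!
! For example (the process counts are hypothetical): with
! (nprocx,nprocy,nprocz) = (2,2,2), the process at
! (myrankx,myranky,myrankz) = (0,1,1) gets ideste = 1*4 + 1*2 + 0 + 1 = 7,
! which is indeed the global rank of its eastern neighbour (1,1,1).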
!
!========================================================================
! Note: it makes sense to post the "receive" here first, since data from
! the "odd" procs is already "sent". Otherwise, if everything is "sent"
! before anything is "received", there could be too much data
! simultaneously in flight for some MPI implementations...
!========================================================================
!
call MPI_Recv(shado_W(1,1),ny_loc*nz_loc,MPI_REAL,
& ideste,itagw,MPI_COMM_WORLD,istatus,ierror)
!
call MPI_Send(shado_E(1,1),ny_loc*nz_loc,MPI_REAL,
& ideste,itage,MPI_COMM_WORLD,ierror)
!
do k=2,nz_loc+1
do j=2,ny_loc+1
arx_in(nx_loc+2,j,k) = shado_W(j-1,k-1)
enddo
enddo
!
endif
least = .true.
if(lwest) go to 103
!
!
! Western local boundary treatment:
!
102 continue
if(myrankx.gt.0) then ! need to do a shadow-exchange
do k=2,nz_loc+1 ! Send to west, receive from west
do j=2,ny_loc+1
shado_W(j-1,k-1) = arx_in(2,j,k) ! Pack up local array values
enddo
enddo
!
! Send to process "idestw", with same myranky,z and myrankx-1:
idestw= myrankz*nprocx*nprocy + myranky*nprocx + myrankx - 1
call MPI_Send(shado_W(1,1),ny_loc*nz_loc,MPI_REAL,
& idestw,itagw,MPI_COMM_WORLD,ierror)
!
! Receive from process "idestw" as well:
call MPI_Recv(shado_E(1,1),ny_loc*nz_loc,MPI_REAL,
& idestw,itage,MPI_COMM_WORLD,istatus,ierror)
!
do k=2,nz_loc+1
do j=2,ny_loc+1
arx_in(1,j,k) = shado_E(j-1,k-1)
enddo
enddo
!
endif
lwest = .true.
if(.not.least) go to 101
!
!
!
103 return
end
subroutine shado_xch_NS(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
!======================================================================
! Routine to do north-south shadow-zone exchange.
!
! If this process has a neighbour to north or south, a shadow-zone
! swap will occur.
!
! The northmost surface of arr_in (i.e., the physical local domain)
! is sent to the southmost surface of arx_in (i.e., the extended local
! domain) of the process to the north - if there is one.
! The southmost surface of arr_in (i.e., the physical local domain)
! is sent to the northmost surface of arx_in (i.e., the extended
! local domain) of the process to the south - if there is one.
!
! Similarly, the southmost surface of arr_in (i.e., the physical local
! domain) from any process to the north is received into the northmost
! surface of arx_in (i.e., the extended local domain).
! The northmost surface of arr_in (i.e., the physical local domain)
! from any process to the south is received into the southmost surface
! of arx_in (i.e., the extended local domain).
!
! To help avoid contention, let even processes send northwards first,
! while odd processes send southwards first. That way, all processes
! should have data ready to receive when MPI_Recv is called.
!
! If at northern or southern boundaries of the global domain, just
! copy the edge of arr_in into the edge of arx_in.
!======================================================================
!
implicit none
integer myrankx,myranky,myrankz,nprocx,nprocy,nprocz
integer nx_loc, ny_loc, nz_loc, i, j, k
real arx_in(nx_loc+2,ny_loc+2,nz_loc+2)
real shado_N(nx_loc,nz_loc) ! For data moving northwards
real shado_S(nx_loc,nz_loc) ! " " " southwards
logical lnorth,lsouth
!
include 'mpif.h'
integer istatus(MPI_STATUS_SIZE),ierror,idestn,idests
integer, parameter :: itagn=425, itags=524
!
!
!
! First, southern global boundary treatment:
!
if(myranky.eq.0) then ! Am at the southern boundary already
do k=2,nz_loc+1 ! Just copy south sfc of arr_in to arx_in
do i=2,nx_loc+1
arx_in(i,1,k) = arx_in(i,2,k) ! simplifies filter
enddo
enddo
endif
!
!
! Next, northern global boundary treatment:
!
if(myranky.eq.nprocy-1) then ! Am at the northern boundary already
do k=2,nz_loc+1 ! Just copy north sfc of arr_in to arx_in
do i=2,nx_loc+1
arx_in(i,ny_loc+2,k) = arx_in(i,ny_loc+1,k) ! simplifies filter
enddo
enddo
endif
!
!
! For max. efficiency, let even procs send north first, then south, while
! odd procs send south first, then north.
!
lnorth = .false.
lsouth = .false.
if( mod(myranky,2).eq.1 ) go to 102
!
! Northern local boundary treatment:
!
101 continue
if(myranky.lt.nprocy-1) then ! need to do a shadow-exchange
do k=2,nz_loc+1 ! Send to north, receive from north
do i=2,nx_loc+1
shado_N(i-1,k-1) = arx_in(i,ny_loc+1,k)
enddo
enddo
!
! Send/recv to/from process "idestn", with same myrankx,z and myranky+1:
idestn= myrankz*nprocx*nprocy + (myranky+1)*nprocx + myrankx
!
!========================================================================
! Note: it makes sense to post the "receive" here first, since data from
! the "odd" procs is already "sent". Otherwise, if everything is "sent"
! before anything is "received", there could be too much data
! simultaneously in flight for some MPI implementations...
!========================================================================
!
call MPI_Recv(shado_S(1,1),nx_loc*nz_loc,MPI_REAL,
& idestn,itags,MPI_COMM_WORLD,istatus,ierror)
!
call MPI_Send(shado_N(1,1),nx_loc*nz_loc,MPI_REAL,
& idestn,itagn,MPI_COMM_WORLD,ierror)
!
do k=2,nz_loc+1
do i=2,nx_loc+1
arx_in(i,ny_loc+2,k) = shado_S(i-1,k-1)
enddo
enddo
!
endif
lnorth = .true.
if(lsouth) go to 103
!
!
! Southern local boundary treatment:
!
102 continue
if(myranky.gt.0) then ! need to do a shadow-exchange
do k=2,nz_loc+1 ! Send to south, receive from south
do i=2,nx_loc+1
shado_S(i-1,k-1) = arx_in(i,2,k) ! Pack up local array values
enddo
enddo
!
! Send to process "idests", with same myrankx,z and myranky-1:
idests= myrankz*nprocx*nprocy + (myranky-1)*nprocx + myrankx
call MPI_Send(shado_S(1,1),nx_loc*nz_loc,MPI_REAL,
& idests,itags,MPI_COMM_WORLD,ierror)
!
! Receive from process "idests" as well:
call MPI_Recv(shado_N(1,1),nx_loc*nz_loc,MPI_REAL,
& idests,itagn,MPI_COMM_WORLD,istatus,ierror)
!
do k=2,nz_loc+1
do i=2,nx_loc+1
arx_in(i,1,k) = shado_N(i-1,k-1)
enddo
enddo
!
endif
lsouth = .true.
if(.not.lnorth) go to 101
!
!
!
103 return
end
subroutine shado_xch_TB(arx_in,nx_loc,ny_loc,nz_loc,
& myrankx,myranky,myrankz,nprocx,nprocy,nprocz)
!======================================================================
! Routine to do top-bottom (or up-down) shadow-zone exchange.
!
! If this process has a neighbour above or below, a shadow-zone
! swap will occur.
!
! The topmost surface of arr_in (i.e., the physical local domain)
! is sent to the bottom-most surface of arx_in (the extended local
! domain) of the process above - if there is one.
! The bottom-most surface of arr_in (i.e., the physical local domain)
! is sent to the topmost surface of arx_in (i.e., the extended
! local domain) of the process below - if there is one.
!
! Similarly, the bottom-most surface of arr_in (the physical local
! domain) from any process above is received into the topmost
! surface of arx_in (i.e., the extended local domain).
! The topmost surface of arr_in (i.e., the physical local domain)
! from any process below is received into the bottom-most surface
! of arx_in (i.e., the extended local domain).
!
! To help avoid contention, let even processes send upwards first,
! while odd processes send downwards first. That way, all processes
! should have data ready to receive when MPI_Recv is called.
!
! If at topmost or bottommost boundaries of the global domain, just
! copy the edge of arr_in into the edge of arx_in.
!======================================================================
!
implicit none
integer myrankx,myranky,myrankz,nprocx,nprocy,nprocz,myrank
integer nx_loc, ny_loc, nz_loc, i, j, k
real arx_in(nx_loc+2,ny_loc+2,nz_loc+2)
real shado_T(nx_loc,ny_loc) ! For data moving upwards
real shado_B(nx_loc,ny_loc) ! " " " downwards
logical ltop,lbottom
!
include 'mpif.h'
integer istatus(MPI_STATUS_SIZE),ierror,idestt,idestb
integer, parameter :: itagt=627, itagb=726
!
!
! First, bottom global boundary treatment:
!
if(myrankz.eq.0) then ! Am at the bottom boundary already
do j=2,ny_loc+1 ! Just copy bottom sfc of arr_in to arx_in
do i=2,nx_loc+1
arx_in(i,j,1) = arx_in(i,j,2) ! simplifies filter
enddo
enddo
endif
!
!
! Next, top global boundary treatment:
!
if(myrankz.eq.nprocz-1) then ! Am at the top boundary already
do j=2,ny_loc+1 ! Just copy top sfc of arr_in to arx_in
do i=2,nx_loc+1
arx_in(i,j,nz_loc+2) = arx_in(i,j,nz_loc+1) ! simplifies filter
enddo
enddo
endif
!
!
! For max. efficiency, let even procs send up first, then down, while
! odd procs send down first, then up.
!
ltop = .false.
lbottom = .false.
if( mod(myrankz,2).eq.1 ) go to 102
!
! Top local boundary treatment:
!
101 continue
if(myrankz.lt.nprocz-1) then ! need to do a shadow-exchange
do j=2,ny_loc+1 ! Send above, receive from above
do i=2,nx_loc+1
shado_T(i-1,j-1) = arx_in(i,j,nz_loc+1)
enddo
enddo
!
! Send/recv to/from process "idestt", with same myrankx,y and myrankz+1:
idestt= (myrankz+1)*nprocx*nprocy + myranky*nprocx + myrankx
!
!========================================================================
! Note: it makes sense to post the "receive" here first, since data from
! the "odd" procs is already "sent". Otherwise, if everything is "sent"
! before anything is "received", there could be too much data
! simultaneously in flight for some MPI implementations...
!========================================================================
!
call MPI_Recv(shado_B(1,1),nx_loc*ny_loc,MPI_REAL,
& idestt,itagb,MPI_COMM_WORLD,istatus,ierror)
!
call MPI_Send(shado_T(1,1),nx_loc*ny_loc,MPI_REAL,
& idestt,itagt,MPI_COMM_WORLD,ierror)
!
do j=2,ny_loc+1
do i=2,nx_loc+1
arx_in(i,j,nz_loc+2) = shado_B(i-1,j-1)
enddo
enddo
!
endif
!debug write(9+myrank,*) 'Shado TB, finished send/recv above'
ltop = .true.
if(lbottom) go to 103
!
!
! Bottom local boundary treatment:
!
102 continue
if(myrankz.gt.0) then ! need to do a shadow-exchange
do j=2,ny_loc+1 ! Send below, receive from below
do i=2,nx_loc+1
shado_B(i-1,j-1) = arx_in(i,j,2) ! Pack up local array values
enddo
enddo
!
! Send to process "idestb", with same myrankx,y and myrankz-1:
idestb= (myrankz-1)*nprocx*nprocy + myranky*nprocx + myrankx
!
call MPI_Send(shado_B(1,1),nx_loc*ny_loc,MPI_REAL,
& idestb,itagb,MPI_COMM_WORLD,ierror)
!
! Receive from process "idestb" as well:
call MPI_Recv(shado_T(1,1),nx_loc*ny_loc,MPI_REAL,
& idestb,itagt,MPI_COMM_WORLD,istatus,ierror)
!
do j=2,ny_loc+1
do i=2,nx_loc+1
arx_in(i,j,1) = shado_T(i-1,j-1)
enddo
enddo
!
endif
lbottom = .true.
if(.not.ltop) go to 101
!
!
!
103 return
end