Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,3 @@ cmake ${SOURCE_DIR} -DCMAKE_INSTALL_PREFIX=$HERE -DCMAKE_BUILD_TYPE=Debug ${CMAK
# additional cmake arguments can be passed to configure.sh
# this also includes fesom specific options in CMakeLists, can be used as -DFESOM_COUPLED=ON
make install -j`nproc --all`

2 changes: 1 addition & 1 deletion env/levante.dkrz.de/shell.intel
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module load openmpi/4.1.2-intel-2021.5.0
export FC=mpif90 CC=mpicc CXX=mpicxx ;
spack load intel-oneapi-mkl@2022.0.1%gcc@11.2.0 # this handles adding to the path more elegantly than using the hardcoded path below
#module load intel-oneapi-mkl/2022.0.1-gcc-11.2.0
#export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH

module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
module load netcdf-fortran/4.5.3-openmpi-4.1.2-intel-2021.5.0
Expand Down
4 changes: 2 additions & 2 deletions env/levante.dkrz.de/shell.nvhpc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ export CPU_MODEL=AMD_EPYC_ZEN3
module --force purge
# module load intel-oneapi-compilers/2022.0.1-gcc-11.2.0
# module load openmpi/4.1.2-intel-2021.5.0
module load nvhpc/22.5-gcc-11.2.0
module load openmpi/.4.1.4-nvhpc-22.5
module load nvhpc/23.9-gcc-11.2.0
module load openmpi/4.1.6-nvhpc-23.9
export FC=mpif90 CC=mpicc CXX=mpicxx;

module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
Expand Down
33 changes: 19 additions & 14 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ endif()

option(ENABLE_OPENACC "compile with OpenACC support" OFF)
message(STATUS "ENABLE_OPENACC: ${ENABLE_OPENACC}")

set(NV_GPU_ARCH "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
option(DISABLE_OPENACC_ATOMICS "disable kernels using atomic statement for reproducible results" ON)
set(GPU_COMPUTE_CAPABILITY "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
# set(GPU_FLAGS "${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler")
set(GPU_FLAGS "cuda12.2,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")

option(ENABLE_OPENMP "build FESOM with OpenMP" OFF)
message(STATUS "ENABLE_OPENMP: ${ENABLE_OPENMP}")
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
find_package(OpenMP REQUIRED COMPONENTS Fortran)
endif()

Expand Down Expand Up @@ -202,7 +204,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC $<INSTALL_INTERFACE:module/fes
target_link_libraries(${PROJECT_NAME} PRIVATE MPI::MPI_Fortran)

set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE Fortran)
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_Fortran)
endif()

Expand Down Expand Up @@ -384,7 +386,7 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU )
elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
#target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O3)
target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O2 -hnoacc -M878) #-hnoacc is a workaround for cray automatically activate -hacc, -M878 is to suppress ftn-878 warning
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
target_compile_options(${PROJECT_NAME} PRIVATE -homp)
else()
target_compile_options(${PROJECT_NAME} PRIVATE -hnoomp)
Expand All @@ -397,16 +399,19 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
endif()
elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL NVHPC )
target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_NVHPC_WORKAROUNDS)
target_compile_options(${PROJECT_NAME} PRIVATE -fast -fastsse -O3 -Mallocatable=95 -Mr8 -pgf90libs)
if(ENABLE_OPENACC)
# additional compiler settings
target_compile_options(${PROJECT_NAME} PRIVATE -acc -ta=tesla:${NV_GPU_ARCH} -Minfo=accel)
set(CMAKE_EXE_LINKER_FLAGS "-acc -ta=tesla:${NV_GPU_ARCH}")
endif()
if(ENABLE_OPENMP)
target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast)
if(${ENABLE_OPENACC})
target_compile_options(${PROJECT_NAME} PRIVATE
$<$<CONFIG:DEBUG>:-Mallocatable=95 -Mr8 -pgf90libs -Mnofma -Minfo=all -acc=verystrict -gpu=math_uniform,cuda12.2,cc80>
$<$<CONFIG:RELEASE>:-Mallocatable=95 -Mr8 -pgf90libs -Minfo=all -acc=verystrict -gpu=cc80>)
set(CMAKE_EXE_LINKER_FLAGS_DEBUG "-acc=verystrict -Mnofma -gpu=math_uniform,cuda12.2,cc80")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "-acc=verystrict -gpu=cc80")
if(${DISABLE_OPENACC_ATOMICS})
target_compile_definitions(${PROJECT_NAME} PRIVATE DISABLE_OPENACC_ATOMICS)
endif()
else()
target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast,inline)
target_compile_options(${PROJECT_NAME} PRIVATE
$<$<CONFIG:DEBUG>:-Mallocatable=95 -Mr8 -pgf90libs -Mnofma>
$<$<CONFIG:RELEASE>:-Mallocatable=95 -Mr8 -pgf90libs>)
endif()
endif()

Expand Down
2 changes: 1 addition & 1 deletion src/gen_modules_partitioning.F90
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ subroutine par_ex(COMM, mype, abort) ! finalizes MPI
call MPI_Finalize(error)
endif

#else !
#else
! TODO logic below is convoluted, COMM that is passed should be used for MPI_ABORT
! changes are easy but need to be tested with coupled configurations
! From here on the two coupled options
Expand Down
21 changes: 14 additions & 7 deletions src/ice_EVP.F90
Original file line number Diff line number Diff line change
Expand Up @@ -225,13 +225,16 @@ subroutine stress2rhs(ice, partit, mesh)
#else
!$ACC END PARALLEL LOOP
#endif
#if !defined(DISABLE_OPENACC_ATOMICS)

#ifndef ENABLE_OPENACC
!$OMP DO
#else
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT)
#if !defined(DISABLE_OPENACC_ATOMICS)
!$ACC ATOMIC UPDAATE
#else
!$ACC UPDATE SELF(u_rhs_ice, v_rhs_ice, sigma11, sigma12, sigma22)
#endif
#ifndef ENABLE_OPENACC
!$OMP DO
#endif
do el=1,myDim_elem2D
! ===== Skip if ice is absent
Expand All @@ -251,16 +254,20 @@ subroutine stress2rhs(ice, partit, mesh)
!$OMP ORDERED
#endif
#endif
#ifdef ENABLE_OPENACC
#if !defined(DISABLE_OPENACC_ATOMICS)
!$ACC ATOMIC UPDATE
#endif
#endif
U_rhs_ice(elem2D_nodes(k,el)) = U_rhs_ice(elem2D_nodes(k,el)) &
- elem_area(el) * &
(sigma11(el)*gradient_sca(k,el) + sigma12(el)*gradient_sca(k+3,el) &
+sigma12(el)*val3*metric_factor(el)) !metrics

#ifdef ENABLE_OPENACC
#if !defined(DISABLE_OPENACC_ATOMICS)
!$ACC ATOMIC UPDATE
#endif
#endif
V_rhs_ice(elem2D_nodes(k,el)) = V_rhs_ice(elem2D_nodes(k,el)) &
- elem_area(el) * &
Expand All @@ -279,14 +286,14 @@ subroutine stress2rhs(ice, partit, mesh)
end do
#ifdef ENABLE_OPENACC
#if !defined(DISABLE_OPENACC_ATOMICS)
!$ACC END PARALLEL LOOP
#else
!$ACC UPDATE DEVICE(u_rhs_ice, v_rhs_ice)
#endif
#endif
#endif

#ifndef ENABLE_OPENACC
!$OMP END DO
#else
!$ACC END PARALLEL LOOP
#endif

#ifndef ENABLE_OPENACC
Expand Down Expand Up @@ -824,4 +831,4 @@ subroutine EVPdynamics(ice, partit, mesh)
!endif
END DO !--> do shortstep=1, ice%evp_rheol_steps

end subroutine EVPdynamics
end subroutine EVPdynamics
12 changes: 9 additions & 3 deletions src/ice_fct.F90
Original file line number Diff line number Diff line change
Expand Up @@ -570,15 +570,19 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
#endif
! Auxiliary elemental operator (mass matrix- lumped mass matrix)

!$ACC KERNELS
! do we need to make the entire array of icoef equal to 1 ?
! if so, we have to write another loop for that. For now, I am running it on cpu.
icoef = 1
!$ACC END KERNELS
#ifdef ENABLE_OPENACC
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT)
#endif
do n=1,3 ! three upper nodes
! Cycle over rows row=elnodes(n)
icoef(n,n)=-2
end do
#ifdef ENABLE_OPENACC
!$ACC END PARALLEL LOOP
#endif


#ifndef ENABLE_OPENACC
Expand Down Expand Up @@ -1128,7 +1132,9 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
call exchange_nod(ice_temp, partit, luse_g2g = .true.)
#endif

#ifdef ENABLE_OPENACC
!$ACC END DATA
#endif

!$OMP BARRIER
end subroutine ice_fem_fct
Expand Down Expand Up @@ -1616,4 +1622,4 @@ subroutine ice_update_for_div(ice, partit, mesh)
!$ACC END PARALLEL LOOP
#endif
end subroutine ice_update_for_div
! =============================================================
! =============================================================
16 changes: 10 additions & 6 deletions src/oce_adv_tra_driver.F90
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
#else
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
#endif
!$OMP DO
do n=1, myDim_edge2D
Expand All @@ -254,8 +254,12 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
!$ACC END LOOP
end do
!$OMP END DO
#ifndef ENABLE_OPENACC
#else
!$ACC END PARALLEL LOOP
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
#endif

!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$OMP DO
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`#ifndef ENABLE_OPENACC` is missing here and in a few OpenMP blocks below. I am not sure OpenMP and OpenACC will work together in this file, but if fixing it is too much for this PR we can defer it to the next PR you intend to do.

do n=1, myDim_nod2D
!$ACC LOOP VECTOR
Expand Down Expand Up @@ -351,7 +355,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
!_______________________________________________________________________
if (trim(tracers%data(tr_num)%tra_adv_lim)=='FCT') then
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$OMP DO
do n=1, myDim_edge2D
!$ACC LOOP VECTOR
Expand All @@ -365,7 +369,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
!$OMP END DO
!$ACC END PARALLEL LOOP

!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$OMP DO
do n=1, myDim_nod2D
!$ACC LOOP VECTOR
Expand All @@ -384,7 +388,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
!_______________________________________________________________________
else
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$OMP DO
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This OpenMP directive is not guarded, meaning we can't use OpenMP together with OpenACC for other parts. We can defer this to the next PR if you want to.

do n=1, myDim_edge2D
!$ACC LOOP VECTOR
Expand All @@ -396,7 +400,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
!$OMP END DO
!$ACC END PARALLEL LOOP

!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
!$OMP DO
do n=1, myDim_nod2D
!$ACC LOOP VECTOR
Expand Down
6 changes: 5 additions & 1 deletion src/oce_tracer_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
type(t_tracer), intent(inout), target :: tracers
integer :: n,nz

#ifdef ENABLE_OPENACC
!$ACC parallel loop collapse(2) default(present) !!!async(1)
#endif
do n=1, partit%myDim_nod2D+partit%eDim_nod2D
do nz=1, mesh%nl-1
! del_ttf will contain all advection / diffusion contributions for this tracer. Set it to 0 at the beginning!
Expand All @@ -34,7 +36,9 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
tracers%work%del_ttf_advvert (nz, n) = 0.0_WP
end do
end do
#ifdef ENABLE_OPENACC
!$ACC end parallel loop
#endif
!$OMP PARALLEL DO
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This OpenMP directive is not guarded, meaning we can't use OpenMP together with OpenACC for other parts. We can defer this to the next PR if you want to.

do n=1, partit%myDim_nod2D+partit%eDim_nod2D
! AB interpolation
Expand Down Expand Up @@ -220,4 +224,4 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
!$OMP END PARALLEL DO
END IF
END SUBROUTINE relax_to_clim
END MODULE o_tracers
END MODULE o_tracers
75 changes: 75 additions & 0 deletions work/job_gpu_levante
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
#SBATCH --job-name=fesom_gpu_test
#SBATCH --partition=gpu
#SBATCH --nodes=4 # Specify number of nodes
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=4
#SBATCH --gpus=16 # 4 # 8 for 2 nodes
#SBATCH --gpus-per-task=1 # specific case when tasks=GPUs
#SBATCH --exclusive
#SBATCH --mem=0 # Request all memory available on all nodes
#SBATCH --time=00:20:00 # Set a limit on the total run time
#SBATCH -o slurm.out
#SBATCH -e slurm.err
#SBATCH --account=ab0995

# Slurm batch job that runs fesom.x on the Levante GPU partition.
#
# Steps: load the site profile and the environment file named in
# ../bin/current_shell_path, link the executable into the working directory,
# copy any namelists that are not already present, export the Open MPI / UCX
# settings, and launch the model with srun (output goes to fesom2.out).
#
# NOTE: keep every "#SBATCH" line above the first executable command; Slurm
# stops reading directives once it reaches one.

# Abort on the first failing command (missing environment file, missing
# namelist, failing srun, ...).
set -e

# Keep in sync with "--cpus-per-task" above; exported so that srun and the
# OpenMP runtime below see the same value.
export SLURM_CPUS_PER_TASK=4

source /sw/etc/profile.levante
#source ../env/levante.dkrz.de/shell

# The first line of ../bin/current_shell_path is the path of the environment
# file to source (presumably the one used when building fesom.x -- confirm).
# `read` returns non-zero when the file lacks a trailing newline even though
# it did read the line; accept that case so that `set -e` does not kill the job.
read -r USED_SHELL <../bin/current_shell_path || [ -n "$USED_SHELL" ]
source "$USED_SHELL"

#source /work/ab0995/a270232/refactoring/fesom2/env/levante.dkrz.de/shell.nvhpc
echo "using environment from" "$USED_SHELL"

ulimit -s 204800 # https://docs.dkrz.de/doc/levante/running-jobs/runtime-settings.html

# determine JOBID (strip anything after the first "." in SLURM_JOB_ID)
JOBID=${SLURM_JOB_ID%%.*}

echo "Submitted job: $JOBID"
squeue -u "$USER"

# Check GPUs available for the job
nvidia-smi

# Always run the most recently built executable.
rm -f fesom.x
ln -s ../bin/fesom.x . # cp -n ../bin/fesom.x

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

# Copy the default namelists; -n keeps any namelist already present in the
# working directory, so local edits are not overwritten.
for nml in config forcing oce ice icepack tra io cvmix dyn; do
    cp -n "../config/namelist.${nml}" .
done

## levante specific gpu env used for ICON otherwise segfault
export OMPI_MCA_pml=ucx # Use UCX to support InfiniBand devices and CUDA [1]

export OMPI_MCA_btl="self" # Only use self transport to reduce overhead [2]

export UCX_RNDV_SCHEME=put_zcopy # Preferred communication scheme with Rendezvous protocol
export UCX_RNDV_THRESH=16384 # Threshold when to switch transport from TCP to NVLINK [3]

export UCX_IB_GPU_DIRECT_RDMA=yes # Allow remote direct memory access from/to GPU

export UCX_TLS=cma,rc,mm,cuda_ipc,cuda_copy,gdr_copy # Include cuda and gdr based transport layers for communication [4]

export UCX_MEMTYPE_CACHE=n

# Time stamps before and after the run give a rough wall-clock measurement.
date
srun -l fesom.x >fesom2.out 2>&1 #> "fesom2.0.out" 2>&1
# srun -l nsys profile -t cuda,osrt,mpi fesom.x > fesom2.out 2>&1 #> "fesom2.0.out" 2>&1
date

# qstat -f $PBS_JOBID
#export EXITSTATUS=$?
#if [ ${EXITSTATUS} -eq 0 ] || [ ${EXITSTATUS} -eq 127 ] ; then
#sbatch job_mistral
#fi