diff --git a/configure.sh b/configure.sh
index 690ad707d..cee7933f0 100755
--- a/configure.sh
+++ b/configure.sh
@@ -64,4 +64,3 @@ cmake ${SOURCE_DIR} -DCMAKE_INSTALL_PREFIX=$HERE -DCMAKE_BUILD_TYPE=Debug ${CMAK
 # additional cmake arguments can be passed to configure.sh
 # this also includes fesom specific options in CMakeLists, can be used as -DFESOM_COUPLED=ON
 make install -j`nproc --all`
-
diff --git a/env/levante.dkrz.de/shell.intel b/env/levante.dkrz.de/shell.intel
index e087c7f84..777e55f0d 100755
--- a/env/levante.dkrz.de/shell.intel
+++ b/env/levante.dkrz.de/shell.intel
@@ -9,7 +9,7 @@ module load openmpi/4.1.2-intel-2021.5.0
 export FC=mpif90 CC=mpicc CXX=mpicxx ;
 spack load intel-oneapi-mkl@2022.0.1%gcc@11.2.0 # this handles adding to path elegantly then using hardcoded path below
 #module load intel-oneapi-mkl/2022.0.1-gcc-11.2.0
-#export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH
 
 module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
 module load netcdf-fortran/4.5.3-openmpi-4.1.2-intel-2021.5.0
diff --git a/env/levante.dkrz.de/shell.nvhpc b/env/levante.dkrz.de/shell.nvhpc
index eb2b776f6..5f9bf063b 100755
--- a/env/levante.dkrz.de/shell.nvhpc
+++ b/env/levante.dkrz.de/shell.nvhpc
@@ -5,8 +5,8 @@ export CPU_MODEL=AMD_EPYC_ZEN3
 module --force purge
 # module load intel-oneapi-compilers/2022.0.1-gcc-11.2.0
 # module load openmpi/4.1.2-intel-2021.5.0
-module load nvhpc/22.5-gcc-11.2.0
-module load openmpi/.4.1.4-nvhpc-22.5
+module load nvhpc/23.9-gcc-11.2.0
+module load openmpi/4.1.6-nvhpc-23.9
 export FC=mpif90 CC=mpicc CXX=mpicxx;
 
 module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1f72aeb75..7cdbf08fd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -75,12 +75,14 @@ endif()
 option(ENABLE_OPENACC "compile with OpenACC support" OFF)
 message(STATUS "ENABLE_OPENACC: ${ENABLE_OPENACC}")
-
-set(NV_GPU_ARCH "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
+option(DISABLE_OPENACC_ATOMICS "disable kernels using atomic statement for reproducible results" ON)
+set(GPU_COMPUTE_CAPABILITY "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
+# set(GPU_FLAGS "${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler")
+set(GPU_FLAGS "cuda12.2,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
 
 option(ENABLE_OPENMP "build FESOM with OpenMP" OFF)
 message(STATUS "ENABLE_OPENMP: ${ENABLE_OPENMP}")
-if(ENABLE_OPENMP)
+if(${ENABLE_OPENMP})
   find_package(OpenMP REQUIRED COMPONENTS Fortran)
 endif()
@@ -202,7 +204,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC
+		target_compile_options(${PROJECT_NAME} PRIVATE
+			$<$<CONFIG:DEBUG>:-Mallocatable=95 -Mr8 -pgf90libs -Mnofma -Minfo=all -acc=verystrict -gpu=math_uniform,cuda12.2,cc80>
+			$<$<CONFIG:RELEASE>:-Mallocatable=95 -Mr8 -pgf90libs -Minfo=all -acc=verystrict -gpu=cc80>)
+		set(CMAKE_EXE_LINKER_FLAGS_DEBUG "-acc=verystrict -Mnofma -gpu=math_uniform,cuda12.2,cc80")
+		set(CMAKE_EXE_LINKER_FLAGS_RELEASE "-acc=verystrict -gpu=cc80")
+		if(${DISABLE_OPENACC_ATOMICS})
+			target_compile_definitions(${PROJECT_NAME} PRIVATE DISABLE_OPENACC_ATOMICS)
+		endif()
 	else()
-		target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast,inline)
+		target_compile_options(${PROJECT_NAME} PRIVATE
+			$<$<CONFIG:DEBUG>:-Mallocatable=95 -Mr8 -pgf90libs -Mnofma>
+			$<$<CONFIG:RELEASE>:-Mallocatable=95 -Mr8 -pgf90libs>)
 	endif()
 endif()
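
Note on the new DISABLE_OPENACC_ATOMICS option: target_compile_definitions above forwards it to the Fortran preprocessor, and the source files below use it to trade device-side atomic scatter-adds for a deterministic host fallback. A minimal sketch of that guard pattern, with hypothetical routine and array names (not FESOM code); it assumes the arrays already live in an enclosing !$acc data region:

subroutine scatter_add_sketch(vals, idx, dest)
   ! Hypothetical illustration of the DISABLE_OPENACC_ATOMICS guard.
   implicit none
   real,    intent(in)    :: vals(:)
   integer, intent(in)    :: idx(:)
   real,    intent(inout) :: dest(:)
   integer :: i
#if !defined(DISABLE_OPENACC_ATOMICS)
   ! Fast path: gangs accumulate concurrently via atomic updates on the device.
   !$acc parallel loop gang vector default(present)
   do i = 1, size(vals)
      !$acc atomic update
      dest(idx(i)) = dest(idx(i)) + vals(i)
   end do
   !$acc end parallel loop
#else
   ! Reproducible path: copy to the host, accumulate in a fixed order, copy back.
   !$acc update self(vals, dest)
   do i = 1, size(vals)
      dest(idx(i)) = dest(idx(i)) + vals(i)
   end do
   !$acc update device(dest)
#endif
end subroutine scatter_add_sketch

The same macro drives the !$ACC ATOMIC UPDATE versus !$ACC UPDATE SELF/DEVICE branches in ice_EVP.F90 below.
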
diff --git a/src/gen_modules_partitioning.F90 b/src/gen_modules_partitioning.F90
index 7e74bab7d..1c1996f8b 100644
--- a/src/gen_modules_partitioning.F90
+++ b/src/gen_modules_partitioning.F90
@@ -127,7 +127,7 @@ subroutine par_ex(COMM, mype, abort) ! finalizes MPI
        call MPI_Finalize(error)
     endif
-#else !
+#else ! TODO logic below is convoluted, COMM that is passed should be used for MPI_ABORT
 ! changes are easy but need to be tested with coupled configurations
 
 ! From here on the two coupled options
diff --git a/src/ice_EVP.F90 b/src/ice_EVP.F90
index 5a14e175e..21ed55fe2 100755
--- a/src/ice_EVP.F90
+++ b/src/ice_EVP.F90
@@ -225,13 +225,16 @@ subroutine stress2rhs(ice, partit, mesh)
 #else
 !$ACC END PARALLEL LOOP
 #endif
-#if !defined(DISABLE_OPENACC_ATOMICS)
+
+#ifndef ENABLE_OPENACC
+!$OMP DO
+#else
 !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT)
+#if !defined(DISABLE_OPENACC_ATOMICS)
+!$ACC ATOMIC UPDATE
 #else
 !$ACC UPDATE SELF(u_rhs_ice, v_rhs_ice, sigma11, sigma12, sigma22)
 #endif
-#ifndef ENABLE_OPENACC
-!$OMP DO
 #endif
 do el=1,myDim_elem2D
    ! ===== Skip if ice is absent
@@ -251,16 +254,20 @@ subroutine stress2rhs(ice, partit, mesh)
 !$OMP ORDERED
 #endif
 #endif
+#ifdef ENABLE_OPENACC
 #if !defined(DISABLE_OPENACC_ATOMICS)
 !$ACC ATOMIC UPDATE
+#endif
 #endif
       U_rhs_ice(elem2D_nodes(k,el)) = U_rhs_ice(elem2D_nodes(k,el)) &
           - elem_area(el) * &
           (sigma11(el)*gradient_sca(k,el) + sigma12(el)*gradient_sca(k+3,el) &
          +sigma12(el)*val3*metric_factor(el)) !metrics
+#ifdef ENABLE_OPENACC
 #if !defined(DISABLE_OPENACC_ATOMICS)
 !$ACC ATOMIC UPDATE
+#endif
 #endif
       V_rhs_ice(elem2D_nodes(k,el)) = V_rhs_ice(elem2D_nodes(k,el)) &
           - elem_area(el) * &
@@ -279,14 +286,14 @@ subroutine stress2rhs(ice, partit, mesh)
    end do
 #ifdef ENABLE_OPENACC
 #if !defined(DISABLE_OPENACC_ATOMICS)
-    !$ACC END PARALLEL LOOP
-#else
 !$ACC UPDATE DEVICE(u_rhs_ice, v_rhs_ice)
 #endif
-#endif
+#endif
 
 #ifndef ENABLE_OPENACC
 !$OMP END DO
+#else
+    !$ACC END PARALLEL LOOP
 #endif
 
 #ifndef ENABLE_OPENACC
@@ -824,4 +831,4 @@ subroutine EVPdynamics(ice, partit, mesh)
    !endif
 END DO !--> do shortstep=1, ice%evp_rheol_steps
-end subroutine EVPdynamics
+end subroutine EVPdynamics
\ No newline at end of file
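
Note on why the atomics are gated: with atomics enabled, many gangs add into U_rhs_ice and V_rhs_ice concurrently, so the order of the floating-point additions varies between runs, and floating-point addition is not associative. A tiny self-contained demonstration of the underlying effect (not FESOM code):

program fp_order_demo
   ! Summation order changes the rounded result in finite precision,
   ! which is why concurrent atomic scatter-adds are not bitwise reproducible.
   implicit none
   real :: a, b, c
   a = 1.0e8
   b = -1.0e8
   c = 1.0
   print *, '(a + b) + c =', (a + b) + c   ! prints 1.0
   print *, 'a + (b + c) =', a + (b + c)   ! prints 0.0 in single precision
end program fp_order_demo

Building with DISABLE_OPENACC_ATOMICS=ON (the new default) therefore buys run-to-run reproducibility at the cost of host/device transfers around the affected loops.
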
diff --git a/src/ice_fct.F90 b/src/ice_fct.F90
index 31071f09b..ce4564fa6 100755
--- a/src/ice_fct.F90
+++ b/src/ice_fct.F90
@@ -570,15 +570,19 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
 #endif
     ! Auxiliary elemental operator (mass matrix- lumped mass matrix)
-    !$ACC KERNELS
+    ! do we need to make the entire array of icoef equal to 1 ?
+    ! if so, we have to write another loop for that. For now, I am running it on cpu.
     icoef = 1
-    !$ACC END KERNELS
+#ifdef ENABLE_OPENACC
     !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT)
+#endif
     do n=1,3   ! three upper nodes
        ! Cycle over rows
        row=elnodes(n)
        icoef(n,n)=-2
     end do
+#ifdef ENABLE_OPENACC
     !$ACC END PARALLEL LOOP
+#endif
 
 #ifndef ENABLE_OPENACC
@@ -1128,7 +1132,9 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
     call exchange_nod(ice_temp, partit, luse_g2g = .true.)
 #endif
 
+#ifdef ENABLE_OPENACC
 !$ACC END DATA
+#endif
 !$OMP BARRIER
 end subroutine ice_fem_fct
@@ -1616,4 +1622,4 @@ subroutine ice_update_for_div(ice, partit, mesh)
 !$ACC END PARALLEL LOOP
 #endif
 end subroutine ice_update_for_div
-! =============================================================
+! =============================================================
\ No newline at end of file
diff --git a/src/oce_adv_tra_driver.F90 b/src/oce_adv_tra_driver.F90
index ac80a3092..9b9606ea8 100644
--- a/src/oce_adv_tra_driver.F90
+++ b/src/oce_adv_tra_driver.F90
@@ -243,7 +243,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
 #ifndef ENABLE_OPENACC
 !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
 #else
-    !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+    !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 #endif
 !$OMP DO
     do n=1, myDim_edge2D
@@ -254,8 +254,12 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
         !$ACC END LOOP
     end do
 !$OMP END DO
+#ifndef ENABLE_OPENACC
+#else
     !$ACC END PARALLEL LOOP
-    !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+#endif
+
+    !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 !$OMP DO
     do n=1, myDim_nod2D
         !$ACC LOOP VECTOR
@@ -351,7 +355,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
     !_______________________________________________________________________
     if (trim(tracers%data(tr_num)%tra_adv_lim)=='FCT') then
 !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
-        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 !$OMP DO
         do n=1, myDim_edge2D
             !$ACC LOOP VECTOR
@@ -365,7 +369,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
 !$OMP END DO
         !$ACC END PARALLEL LOOP
 
-        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 !$OMP DO
         do n=1, myDim_nod2D
             !$ACC LOOP VECTOR
@@ -384,7 +388,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
     !_______________________________________________________________________
     else
 !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n, nz)
-        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 !$OMP DO
         do n=1, myDim_edge2D
             !$ACC LOOP VECTOR
@@ -396,7 +400,7 @@ subroutine do_oce_adv_tra(dt, vel, w, wi, we, tr_num, dynamics, tracers, partit,
 !$OMP END DO
         !$ACC END PARALLEL LOOP
 
-        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
+        !$ACC PARALLEL LOOP GANG VECTOR DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
 !$OMP DO
         do n=1, myDim_nod2D
             !$ACC LOOP VECTOR
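
Note on the repeated one-line fixes above: the OpenACC specification allows a given clause such as DEFAULT(PRESENT) at most once per directive, so the duplicated clause was at best redundant and may be rejected outright under -acc=verystrict. The driver keeps one loop body with both an OpenMP and an OpenACC harness selected by the preprocessor; a minimal sketch of that dual-backend idiom, with placeholder names (not FESOM code):

subroutine zero_field_sketch(tr, nl1, n2d, acc_vl)
   ! Hypothetical routine mirroring the #ifdef ENABLE_OPENACC / !$OMP pattern.
   implicit none
   integer, intent(in)    :: nl1, n2d
   integer, intent(in)    :: acc_vl        ! tuned vector length, as in the driver
   real,    intent(inout) :: tr(nl1, n2d)  ! assumed present on the device
   integer :: n, nz
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz)
#else
   !$ACC PARALLEL LOOP GANG DEFAULT(PRESENT) VECTOR_LENGTH(acc_vl)
#endif
   do n = 1, n2d
      !$ACC LOOP VECTOR
      do nz = 1, nl1
         tr(nz, n) = 0.0
      end do
   end do
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
   !$ACC END PARALLEL LOOP
#endif
end subroutine zero_field_sketch

Without -acc the !$ACC lines compile as plain comments, so the same source serves both backends.
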
diff --git a/src/oce_tracer_mod.F90 b/src/oce_tracer_mod.F90
index b74a55d84..1a0fe05d4 100755
--- a/src/oce_tracer_mod.F90
+++ b/src/oce_tracer_mod.F90
@@ -25,7 +25,9 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
     type(t_tracer), intent(inout), target :: tracers
     integer :: n,nz
 
+#ifdef ENABLE_OPENACC
     !$ACC parallel loop collapse(2) default(present) !!!async(1)
+#endif
     do n=1, partit%myDim_nod2D+partit%eDim_nod2D
        do nz=1, mesh%nl-1
           ! del_ttf will contain all advection / diffusion contributions for this tracer. Set it to 0 at the beginning!
           tracers%work%del_ttf_advvert (nz, n) = 0.0_WP
        end do
     end do
+#ifdef ENABLE_OPENACC
     !$ACC end parallel loop
+#endif
 
 !$OMP PARALLEL DO
     do n=1, partit%myDim_nod2D+partit%eDim_nod2D
        ! AB interpolation
@@ -220,4 +224,4 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
 !$OMP END PARALLEL DO
 END IF
 END SUBROUTINE relax_to_clim
-END MODULE o_tracers
+END MODULE o_tracers
\ No newline at end of file
diff --git a/work/job_gpu_levante b/work/job_gpu_levante
new file mode 100755
index 000000000..04680163e
--- /dev/null
+++ b/work/job_gpu_levante
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH --job-name=fesom_gpu_test
+#SBATCH --partition=gpu
+#SBATCH --nodes=4                 # Specify number of nodes
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --gpus=16                 # 4 # 8 for 2 nodes
+#SBATCH --gpus-per-task=1         # specific case when tasks = gpus
+#SBATCH --exclusive
+#SBATCH --mem=0                   # Request all memory available on all nodes
+#SBATCH --time=00:20:00           # Set a limit on the total run time
+#SBATCH -o slurm.out
+#SBATCH -e slurm.err
+#SBATCH --account=ab0995
+
+set -e
+export SLURM_CPUS_PER_TASK=4
+
+source /sw/etc/profile.levante
+#source ../env/levante.dkrz.de/shell
+read -r USED_SHELL <../bin/current_shell_path
+source $USED_SHELL
+
+#source /work/ab0995/a270232/refactoring/fesom2/env/levante.dkrz.de/shell.nvhpc
+echo "using environment from" $USED_SHELL
+
+ulimit -s 204800 # https://docs.dkrz.de/doc/levante/running-jobs/runtime-settings.html
+
+echo Submitted job: $SLURM_JOB_ID
+squeue -u $USER
+
+# Check GPUs available for the job
+nvidia-smi
+
+# determine JOBID
+JOBID=$(echo $SLURM_JOB_ID | cut -d"." -f1)
+
+rm -f fesom.x
+ln -s ../bin/fesom.x .  # cp -n ../bin/fesom.x
+
+export OMP_NUM_THREADS=4
+cp -n ../config/namelist.config .
+cp -n ../config/namelist.forcing .
+cp -n ../config/namelist.oce .
+cp -n ../config/namelist.ice .
+cp -n ../config/namelist.icepack .
+cp -n ../config/namelist.tra .
+cp -n ../config/namelist.io .
+cp -n ../config/namelist.cvmix .
+cp -n ../config/namelist.dyn .
+
+## Levante-specific GPU environment (also used for ICON); without it the run segfaults
+export OMPI_MCA_pml=ucx            # Use UCX to support InfiniBand devices and CUDA [1]
+
+export OMPI_MCA_btl="self"         # Only use self transport to reduce overhead [2]
+
+export UCX_RNDV_SCHEME=put_zcopy   # Preferred communication scheme with Rendezvous protocol
+export UCX_RNDV_THRESH=16384       # Threshold when to switch transport from TCP to NVLINK [3]
+
+export UCX_IB_GPU_DIRECT_RDMA=yes  # Allow remote direct memory access from/to GPU
+
+export UCX_TLS=cma,rc,mm,cuda_ipc,cuda_copy,gdr_copy # Include cuda and gdr based transport layers for communication [4]
+
+export UCX_MEMTYPE_CACHE=n
+
+date
+srun -l fesom.x >fesom2.out 2>&1
+# srun -l nsys profile -t cuda,osrt,mpi fesom.x > fesom2.out 2>&1
+date
+
+# qstat -f $PBS_JOBID
+#export EXITSTATUS=$?
+#if [ ${EXITSTATUS} -eq 0 ] || [ ${EXITSTATUS} -eq 127 ] ; then
+#sbatch job_mistral
+#fi
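
Note on GPU binding in the job script: with --gpus-per-task=1, Slurm normally narrows CUDA_VISIBLE_DEVICES so each rank sees exactly one device. Where such binding is not in effect, a common fallback is to select the device from the node-local MPI rank; a generic sketch of that idiom follows (this is not FESOM's actual initialization):

subroutine bind_rank_to_gpu()
   ! Generic MPI + OpenACC device selection: round-robin the node-local
   ! ranks over the GPUs visible to each process.
   use mpi
   use openacc
   implicit none
   integer :: ierr, local_comm, local_rank, ndev
   ! Ranks that share a node get consecutive ranks in local_comm.
   call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, &
                            MPI_INFO_NULL, local_comm, ierr)
   call MPI_Comm_rank(local_comm, local_rank, ierr)
   ndev = acc_get_num_devices(acc_device_nvidia)
   if (ndev > 0) call acc_set_device_num(mod(local_rank, ndev), acc_device_nvidia)
   call MPI_Comm_free(local_comm, ierr)
end subroutine bind_rank_to_gpu

The UCX settings exported above complement this by letting MPI move data directly between device buffers (cuda_ipc within a node, GPUDirect RDMA across nodes).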