Update jobscript with findings from #9
Peter9192 committed Aug 15, 2024
1 parent 6c90d2a commit ca32205
Showing 1 changed file with 28 additions and 23 deletions.
workflows/snakemake/wrf.job (28 additions, 23 deletions)
@@ -1,37 +1,42 @@
#!/bin/bash
-#SBATCH --job-name=wrf_experiment # Job name
-#SBATCH --partition=rome # Request thin partition. Up to one hour goes to fast queue
-#SBATCH --time=00:05:00 # Maximum runtime (D-HH:MM:SS)
-#SBATCH --nodes=1 # Number of nodes (one thin node has 128 cpu cores)
-#SBATCH --ntasks=32 # Number of tasks per node / number of patches in the domain - parallelized with MPI / DMPAR / multiprocessing
-#SBATCH --cpus-per-task=4 # Number of CPU cores per task / number of tiles within each patch - parallelized with OpenMP / SMPAR / multithreading
-
-# Note: number cpus-per-task * ntasks should not exceed the total available cores on requested nodes
-# 8*16 = 128 exactly fits on one thin node.
-
-# Each process can do multithreading but limited to the number of cpu cores allocated to each process
-export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
-# export OMP_PLACES=cores
-# export OMP_PROC_BIND=close
-
-# https://journal.fluidnumerics.com/wrf-v4-on-google-cloud#h.vin1ct6ww426
-export OMP_PLACES=threads
-export OMP_PROC_BIND=true
-
#
# From WRF/run, submit as
# sbatch wrf.job
#
# or, from any other directory, submit as
# sbatch wrf.job /path/to/wrf.exe
-wrf_executable="${1:-wrf.exe}"
+#
+#
+# Number cpus-per-task * ntasks should not exceed the total available cores on requested nodes
+# 8*16 = 128 exactly fits on one rome node.
+# 8*24 = 192 exactly fits on one genoa node.
+#
+# For reference, see e.g.
+#
+# SURF docs: https://servicedesk.surf.nl/wiki/display/WIKI/Methods+of+parallelization
+# Nice guide on Hybrid MPI/OpenMP: https://nrel.github.io/HPC/blog/2021-06-18-srun/#6-hybrid-mpiopenmpi
+# Rome WRF benchmark: https://www.dell.com/support/kbdoc/en-us/000152654/wrf-performance-on-amd-rome-platform-multi-node-study
+# Genoa WRF benchmark: https://infohub.delltechnologies.com/en-us/p/hpc-application-performance-on-dell-poweredge-r6625-with-amd-epyc-genoa/
+#
+#
+#SBATCH --job-name=wrf_experiment # Job name
+#SBATCH --partition=genoa # Request genoa partition. Up to one hour goes to fast queue
+#SBATCH --time=5-00:00:00 # Maximum runtime (D-HH:MM:SS)
+#SBATCH --nodes=1 # Number of nodes (one genoa node has 192 cpu cores)
+#SBATCH --ntasks=24 # Number of tasks per node / number of patches in the domain - parallelized with MPI / DMPAR / multiprocessing
+#SBATCH --cpus-per-task=8 # Number of CPU cores per task / number of tiles within each patch - parallelized with OpenMP / SMPAR / multithreading

# Load dependencies
module load 2023
module load netCDF-Fortran/4.6.1-gompi-2023a # also loads gcc and gompi
export NETCDF=$(nf-config --prefix)

-# mpiexec -np ${SLURM_NTASKS} --map-by node:PE=$OMP_NUM_THREADS --rank-by core $wrf_executable
+# Configure OpenMP threads & core affinity
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+export OMP_PLACES=cores
+export OMP_PROC_BIND=close
+
-# https://journal.fluidnumerics.com/wrf-v4-on-google-cloud#h.vin1ct6ww426
-mpiexec -np ${SLURM_NTASKS} --map-by core --bind-to core $wrf_executable
+# If wrf executable not passed explicitly to script, then default to wrf.exe in working dir
+wrf_executable="${1:-wrf.exe}"

+srun $wrf_executable

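The new settings rely on the hybrid MPI/OpenMP layout described in the header comments: ntasks * cpus-per-task must fit on the requested node (24 * 8 = 192 cores on genoa), with srun launching the MPI ranks and the OMP_* variables pinning the threads inside each rank. Before spending a five-day allocation on WRF, a short throwaway job can confirm that the ranks actually land on distinct blocks of cores. The sketch below is not part of the commit; it reuses the same SBATCH request, uses a hypothetical job name layout_check, and assumes taskset (util-linux) is available on the compute nodes.

#!/bin/bash
#SBATCH --job-name=layout_check # Hypothetical check job, same resources as wrf.job
#SBATCH --partition=genoa
#SBATCH --time=00:05:00
#SBATCH --nodes=1
#SBATCH --ntasks=24
#SBATCH --cpus-per-task=8

# Each task prints its rank, the node it runs on, and the CPUs it may use;
# taskset (util-linux) reports the affinity list of the task's shell.
srun bash -c 'echo "rank ${SLURM_PROCID} on $(hostname): $(taskset -cp $$)"'

If the binding behaves as intended, each of the 24 ranks reports its own set of 8 cores in the slurm-<jobid>.out file; overlapping core lists would point at a mismatch between the SBATCH request and what srun passes on to the tasks.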