
Commit

lint
mattkappel committed Nov 1, 2023
1 parent ca0b18c commit 2c97b35
Showing 4 changed files with 29 additions and 40 deletions.
src/linker/configuration.py: 4 changes (1 addition & 3 deletions)

@@ -58,9 +58,7 @@ def get_resources(self) -> Dict[str, str]:
         }
 
     def get_spark_resources(self) -> Dict[str, str]:
-        return {
-            **self.environment["spark"]
-        }
+        return {**self.environment["spark"]}
 
 ####################
 # Helper Functions #
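
The collapsed return relies on dict unpacking; as a quick illustration of the behavior the one-liner preserves (the example environment mapping below is invented for illustration, not taken from the repository):

# Hypothetical stand-in for Config.environment as read from user configuration.
environment = {"spark": {"workers": 2, "mem_per_cpu": "8G"}}

# {**environment["spark"]} builds a new dict that is a shallow copy of the
# nested mapping, exactly like the multi-line version it replaces.
spark_resources = {**environment["spark"]}
assert spark_resources == {"workers": 2, "mem_per_cpu": "8G"}
assert spark_resources is not environment["spark"]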
src/linker/runner.py: 1 change (1 addition & 0 deletions)

@@ -10,6 +10,7 @@
 from linker.utilities.singularity_utils import run_with_singularity
 from linker.utilities.slurm_utils import get_slurm_drmaa, launch_slurm_job
 
+
 def main(
     config: Config,
     results_dir: Path,
src/linker/utilities/slurm_utils.py: 6 changes (2 additions & 4 deletions)

@@ -1,10 +1,8 @@
-
 import os
 import shutil
 import types
-from pathlib import Path
-
 from datetime import datetime
+from pathlib import Path
 from typing import Dict, List
 
 from loguru import logger

@@ -146,4 +144,4 @@ def submit_spark_cluster_job(
     # TODO: clean up if job failed?
     logger.info(f"Job {job_id} finished with status '{job_status}'")
     session.deleteJobTemplate(jt)
-    session.exit()
+    session.exit()
src/linker/utilities/spark_utils.py: 58 changes (25 additions & 33 deletions)

@@ -1,15 +1,17 @@
 import atexit
 import os
-from pathlib import Path
 import tempfile
-
+from pathlib import Path
 from typing import TextIO
 
 from linker.utilities.slurm_utils import submit_spark_cluster_job
 
-CONDA_PATH="/ihme/homes/mkappel/miniconda3/condabin/conda "# must be accessible within container
-CONDA_ENV="pvs_like_case_study_spark_node"
-SINGULARITY_IMG="docker://apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63"
+CONDA_PATH = (
+    "/ihme/homes/mkappel/miniconda3/condabin/conda "  # must be accessible within container
+)
+CONDA_ENV = "pvs_like_case_study_spark_node"
+SINGULARITY_IMG = "docker://apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63"
 
+
 def build_cluster():
     """Builds a Spark cluster.
@@ -24,16 +26,12 @@ def build_cluster():
 
     # submit job
 
-
     # grep log for spark master url or is there a better approach?
 
     return spark_master_url
 
 
-def build_cluster_launch_script(
-    worker_settings_file: Path,
-    worker_log_directory: Path,
-) -> TextIO:
+def build_cluster_launch_script() -> TextIO:
     """Generates a shell file that, on execution, spins up a Spark cluster."""
     launcher = tempfile.NamedTemporaryFile(
         mode="w",
@@ -48,58 +46,52 @@ def build_cluster_launch_script(
     launcher.write(
         f"""
 #!/bin/bash
 # start_spark_slurm.sh generated by PRL ecosystem tool linker
 unset SPARK_HOME
 CONDA_PATH=/opt/conda/condabin/conda # must be accessible within container
 CONDA_ENV=spark_cluster
 SINGULARITY_IMG=image.sif
 export sparkLogs=$HOME/.spark_temp/logs
 export SPARK_ROOT=/opt/spark # within the container
-export SPARK_WORKER_DIR=$sparkLogs
-export SPARK_LOCAL_DIRS=$sparkLogs
+export SPARK_WORKER_DIR=$HOME/.spark_temp/logs
+export SPARK_LOCAL_DIRS=$HOME/.spark_temp/logs
 export SPARK_MASTER_PORT=28508
 export SPARK_MASTER_WEBUI_PORT=28509
 export SPARK_WORKER_CORES=$SLURM_CPUS_PER_TASK
 # shellcheck disable=SC2004
 export SPARK_DAEMON_MEMORY=$(( $SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK / 2 ))m
 export SPARK_MEM=$SPARK_DAEMON_MEMORY
 # This section will be run when started by sbatch
 if [ "$1" != 'multi_job' ]; then
     this=$0
     # I experienced problems with some nodes not finding the script:
     #   slurmstepd: execve(): /var/spool/slurm/job123/slurm_script:
     #   No such file or directory
     # that's why this script is being copied to a shared location to which
     # all nodes have access:
-    mkdir -p $HOME/.spark_temp
-    script=$HOME/.spark_temp/${SLURM_JOBID}_$( basename -- "$0" )
+    mkdir -p "$HOME/.spark_temp"
+    script=$HOME/.spark_temp/${{SLURM_JOBID}}_$( basename -- "$0" )
     cp "$this" "$script"
-    srun $script 'multi_job'
+    srun "$script 'multi_job'"
 # If run by srun, then decide by $SLURM_PROCID whether we are master or worker
 else
     if [ "$SLURM_PROCID" -eq 0 ]; then
-        export SPARK_MASTER_IP="$(hostname).cluster.ihme.washington.edu"
-        MASTER_NODE=$( scontrol show hostname $SLURM_NODELIST | head -n 1 )
+        HOSTNAME=$(hostname)
+        # TODO: use fqdn from configuration
+        export SPARK_MASTER_IP="$HOSTNAME.cluster.ihme.washington.edu"
+        MASTER_NODE=$( scontrol show hostname "$SLURM_NODELIST "| head -n 1 )
-        mkdir -p /tmp/pvs_like_case_study_spark_local_$USER
-        singularity exec -B /mnt:/mnt,/tmp/pvs_like_case_study_spark_local_$USER:/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT"
+        mkdir -p "/tmp/pvs_like_case_study_spark_local_$USER"
+        singularity exec -B /mnt:/mnt,"/tmp/pvs_like_case_study_spark_local_$USER":/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT"
     else
         # $(scontrol show hostname) is used to convert e.g. host20[39-40]
         # to host2039.
         # TODO: This step assumes that SLURM_PROCID=0 corresponds to the first node in SLURM_NODELIST. Is this reasonable?
-        MASTER_NODE=spark://$( scontrol show hostname $SLURM_NODELIST | head -n 1 ):$SPARK_MASTER_PORT
+        MASTER_NODE=spark://$( scontrol show hostname "$SLURM_NODELIST" | head -n 1 ):"$SPARK_MASTER_PORT"
-        mkdir -p /tmp/pvs_like_case_study_spark_local_$USER
-        singularity exec -B /mnt:/mnt,/tmp/pvs_like_case_study_spark_local_$USER:/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker $MASTER_NODE
+        mkdir -p "/tmp/spark_cluster_$USER"
+        singularity exec -B /mnt:/mnt,"/tmp/spark_cluster_$USER":/tmp "$SINGULARITY_IMG" "$CONDA_PATH" run --no-capture-output -n "$CONDA_ENV" "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker "$MASTER_NODE"
     fi
 fi
-"""
+"""
     )
     launcher.close()
 
     atexit.register(lambda: os.remove(launcher.name))
     return launcher
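
For orientation, a minimal sketch of how the generated launcher might be consumed. The submission step is commented out because submit_spark_cluster_job's signature is not part of this diff, so the exact call is an assumption:

from linker.utilities.spark_utils import build_cluster_launch_script

# build_cluster_launch_script returns a closed NamedTemporaryFile whose contents
# are the start_spark_slurm.sh-style script above; the atexit hook registered in
# the function removes the file at interpreter exit (the file presumably persists
# until then, i.e. it is created with delete=False, since cleanup is explicit).
launcher = build_cluster_launch_script()
print(launcher.name)  # filesystem path of the generated shell script

# Hypothetical hand-off to Slurm; the real argument list may differ.
# submit_spark_cluster_job(launcher)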

