
Commit

lint
mattkappel committed Nov 1, 2023
1 parent ca0b18c commit 2c97b35
Showing 4 changed files with 29 additions and 40 deletions.
src/linker/configuration.py: 4 changes (1 addition & 3 deletions)

@@ -58,9 +58,7 @@ def get_resources(self) -> Dict[str, str]:
         }
 
     def get_spark_resources(self) -> Dict[str, str]:
-        return {
-            **self.environment["spark"]
-        }
+        return {**self.environment["spark"]}
 
 ####################
 # Helper Functions #
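
The collapsed return relies on dict unpacking; as a quick illustration of the behavior the one-liner preserves (the example environment mapping below is invented for illustration, not taken from the repository):

# Hypothetical stand-in for Config.environment as read from user configuration.
environment = {"spark": {"workers": 2, "mem_per_cpu": "8G"}}

# {**environment["spark"]} builds a new dict that is a shallow copy of the
# nested mapping, exactly like the multi-line version it replaces.
spark_resources = {**environment["spark"]}
assert spark_resources == {"workers": 2, "mem_per_cpu": "8G"}
assert spark_resources is not environment["spark"]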
src/linker/runner.py: 1 change (1 addition & 0 deletions)

@@ -10,6 +10,7 @@
 from linker.utilities.singularity_utils import run_with_singularity
 from linker.utilities.slurm_utils import get_slurm_drmaa, launch_slurm_job
 
+
 def main(
     config: Config,
     results_dir: Path,
src/linker/utilities/slurm_utils.py: 6 changes (2 additions & 4 deletions)

@@ -1,10 +1,8 @@
-
 import os
 import shutil
 import types
-from pathlib import Path
-
 from datetime import datetime
+from pathlib import Path
 from typing import Dict, List
 
 from loguru import logger

@@ -146,4 +144,4 @@ def submit_spark_cluster_job(
     # TODO: clean up if job failed?
     logger.info(f"Job {job_id} finished with status '{job_status}'")
     session.deleteJobTemplate(jt)
-    session.exit()
+    session.exit()
src/linker/utilities/spark_utils.py: 58 changes (25 additions & 33 deletions)

@@ -1,15 +1,17 @@
 import atexit
 import os
-from pathlib import Path
 import tempfile
-
+from pathlib import Path
 from typing import TextIO
 
 from linker.utilities.slurm_utils import submit_spark_cluster_job
 
-CONDA_PATH="/ihme/homes/mkappel/miniconda3/condabin/conda "# must be accessible within container
-CONDA_ENV="pvs_like_case_study_spark_node"
-SINGULARITY_IMG="docker://apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63"
+CONDA_PATH = (
+    "/ihme/homes/mkappel/miniconda3/condabin/conda "  # must be accessible within container
+)
+CONDA_ENV = "pvs_like_case_study_spark_node"
+SINGULARITY_IMG = "docker://apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63"
 
+
 def build_cluster():
     """Builds a Spark cluster.
@@ -24,16 +26,12 @@ def build_cluster():
 
     # submit job
 
-
     # grep log for spark master url or is there a better approach?
 
     return spark_master_url
 
 
-def build_cluster_launch_script(
-    worker_settings_file: Path,
-    worker_log_directory: Path,
-) -> TextIO:
+def build_cluster_launch_script() -> TextIO:
     """Generates a shell file that, on execution, spins up a Spark cluster."""
     launcher = tempfile.NamedTemporaryFile(
         mode="w",
@@ -48,58 +46,52 @@ def build_cluster_launch_script(
     launcher.write(
         f"""
 #!/bin/bash
 # start_spark_slurm.sh generated by PRL ecosystem tool linker
 unset SPARK_HOME
 CONDA_PATH=/opt/conda/condabin/conda # must be accessible within container
 CONDA_ENV=spark_cluster
 SINGULARITY_IMG=image.sif
 export sparkLogs=$HOME/.spark_temp/logs
 export SPARK_ROOT=/opt/spark # within the container
-export SPARK_WORKER_DIR=$sparkLogs
-export SPARK_LOCAL_DIRS=$sparkLogs
+export SPARK_WORKER_DIR=$HOME/.spark_temp/logs
+export SPARK_LOCAL_DIRS=$HOME/.spark_temp/logs
 export SPARK_MASTER_PORT=28508
 export SPARK_MASTER_WEBUI_PORT=28509
 export SPARK_WORKER_CORES=$SLURM_CPUS_PER_TASK
 # shellcheck disable=SC2004
 export SPARK_DAEMON_MEMORY=$(( $SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK / 2 ))m
 export SPARK_MEM=$SPARK_DAEMON_MEMORY
 # This section will be run when started by sbatch
 if [ "$1" != 'multi_job' ]; then
     this=$0
     # I experienced problems with some nodes not finding the script:
     #   slurmstepd: execve(): /var/spool/slurm/job123/slurm_script:
     #   No such file or directory
     # that's why this script is being copied to a shared location to which
     # all nodes have access:
-    mkdir -p $HOME/.spark_temp
-    script=$HOME/.spark_temp/${SLURM_JOBID}_$( basename -- "$0" )
+    mkdir -p "$HOME/.spark_temp"
+    script=$HOME/.spark_temp/${{SLURM_JOBID}}_$( basename -- "$0" )
     cp "$this" "$script"
-    srun $script 'multi_job'
+    srun "$script 'multi_job'"
 # If run by srun, then decide by $SLURM_PROCID whether we are master or worker
 else
     if [ "$SLURM_PROCID" -eq 0 ]; then
-        export SPARK_MASTER_IP="$(hostname).cluster.ihme.washington.edu"
-        MASTER_NODE=$( scontrol show hostname $SLURM_NODELIST | head -n 1 )
+        HOSTNAME=$(hostname)
+        # TODO: use fqdn from configuration
+        export SPARK_MASTER_IP="$HOSTNAME.cluster.ihme.washington.edu"
+        MASTER_NODE=$( scontrol show hostname "$SLURM_NODELIST "| head -n 1 )
-        mkdir -p /tmp/pvs_like_case_study_spark_local_$USER
-        singularity exec -B /mnt:/mnt,/tmp/pvs_like_case_study_spark_local_$USER:/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT"
+        mkdir -p "/tmp/pvs_like_case_study_spark_local_$USER"
+        singularity exec -B /mnt:/mnt,"/tmp/pvs_like_case_study_spark_local_$USER":/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT"
     else
         # $(scontrol show hostname) is used to convert e.g. host20[39-40]
         # to host2039.
         # TODO: This step assumes that SLURM_PROCID=0 corresponds to the first node in SLURM_NODELIST. Is this reasonable?
-        MASTER_NODE=spark://$( scontrol show hostname $SLURM_NODELIST | head -n 1 ):$SPARK_MASTER_PORT
+        MASTER_NODE=spark://$( scontrol show hostname "$SLURM_NODELIST" | head -n 1 ):"$SPARK_MASTER_PORT"
-        mkdir -p /tmp/pvs_like_case_study_spark_local_$USER
-        singularity exec -B /mnt:/mnt,/tmp/pvs_like_case_study_spark_local_$USER:/tmp $SINGULARITY_IMG $CONDA_PATH run --no-capture-output -n $CONDA_ENV "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker $MASTER_NODE
+        mkdir -p "/tmp/spark_cluster_$USER"
+        singularity exec -B /mnt:/mnt,"/tmp/spark_cluster_$USER":/tmp "$SINGULARITY_IMG" "$CONDA_PATH" run --no-capture-output -n "$CONDA_ENV" "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker "$MASTER_NODE"
     fi
 fi
-"""
+"""
     )
     launcher.close()
 
     atexit.register(lambda: os.remove(launcher.name))
     return launcher
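
For orientation, a minimal sketch of how the generated launcher might be consumed. The submission step is commented out because submit_spark_cluster_job's signature is not part of this diff, so the exact call is an assumption:

from linker.utilities.spark_utils import build_cluster_launch_script

# build_cluster_launch_script returns a closed NamedTemporaryFile whose contents
# are the start_spark_slurm.sh-style script above; the atexit hook registered in
# the function removes the file at interpreter exit (the file presumably persists
# until then, i.e. it is created with delete=False, since cleanup is explicit).
launcher = build_cluster_launch_script()
print(launcher.name)  # filesystem path of the generated shell script

# Hypothetical hand-off to Slurm; the real argument list may differ.
# submit_spark_cluster_job(launcher)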

