From 55079624b516dad6c8a54a7ac624f03e464d6126 Mon Sep 17 00:00:00 2001 From: Juan Esteban Arango Ossa Date: Tue, 9 Jul 2024 13:48:51 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A7=20slurm=20batch=20system=20improve?= =?UTF-8?q?ments=20(#62)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🔧 change bash env in slurm * 🔧 update slurm submission * 🔧 update file mode * 🔧 fix slurm efficiency printout job * ✅ update GH travis token --- .travis.yml | 2 +- isabl_cli/batch_systems/slurm.py | 37 ++++++++++++++++++++++---------- setup.py | 0 3 files changed, 27 insertions(+), 12 deletions(-) mode change 100644 => 100755 setup.py diff --git a/.travis.yml b/.travis.yml index 55ce9d9..c9c8a19 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,4 +19,4 @@ env: - secure: fkbh052mzzgDi7Bf6m/5kkSlypyLDkTYyNvRJH3O/jjVKB3b4zX1WDXnRL+/8MJEYm240g2JGCu/TdAph5t2+R9I1LmELBIjP/d/uMqKFCjA0TLSmLgxKZUg5c7cT2dFQh5z6pgcJPAgPsFrWITkTXqWmKfreA27icDqQdiFi1FksEAkrfeT3Y+WnkR0XNYzQrWnemS1QWh8wti/vGUO+znLIBw5mP8C3Da1KJiL/Uo6+zb11ddmsCJw3a27EqtfwtcGdUco59av0gVLLNwyc2DQYiOQpCRlf+z44JC0n4buq9H1FuwIpuWRGbqLn0HupfXpt9yfV79y2MfCqibc9MQFeoC50fqzTbyQGtz8cROsQazZ3N3cmEBf00uPmxCJffQedwnn1z9QmrZhFHnJGZc1crYUFOvaAqpmg78BE6xv8j7wg51oqx7eVRjJJY/EGMCQoaSZoH0FGfLyADEp/eKpbIqphxCoyPDaIuLFBEonXMaQqgyAWw94aWmv1lB6V/gGJpnBG6zkczEPup5+1LF6UTrmogVeoGgMd4+g9pWp9OUJQAnAQ/m+PG7LEmmEnHxFFshZ+EOmmFYIgWX9BqXd+ajr7DjO0zrcAF5Zmm1Ja7xVA595l+X3WvHjuNzkyFLEDZrn3mLwGeKxk+4xbVa50dGKCi4NICXOAxbv/bA= - secure: hgGml3e2YgUkgM76iQaDkeY5k/rk7+lr1beqYkwmfqn3SmNShrJ5Q3PQpqiRi+FkYVbNgbsWMP49/MzTn6K26S9JBG0cXrnHbvJvKOYyomE/jgumwyMj5zNjqHjeDfozYRWvR6K3DtC9eafaP4NTs/EQu3gsVFWwepFkltJXNsfP3hMao/C1dy/ObxVlMTkr7RsXTYaCmSKs53Vf3TDwoP5hh2hWbgCudKk0C8hJ2znSz+sQ+3HcRklQWaG5qPnBXPGjE/gTTBV2aL9ZjNsHK1B9n4+zskFYvSZt9OAC4LfERHGm6FstEJgof3+HeOuQanxryUsb9ZPtsWv85T5vjVu27PNYmXful7qVD4iWyhsV6G6reU3YZcu/DIAnVsQAyqhdZwCq4KIh3kFJFPiSi/3E7uMBhUont5+aV+FihlK3ENNc8NdjMe0C4SEhMDZ5i470CienAIUGrAdhl4HDmoWTzh0qTuGhGJwoMTOzHAHTO8enTFbFjmSt/B2zHAkYBfI0OBX9bsOu6JUKmMWm86GItdpQfSY8mlR7wa06Gqz0/sE7GoF78tb9wgUv5/yxIOinmhDHlwHFkCr+W98q9UZsWepEMjHPe6el0puegvdc5NYnc7T7+Fu2B66yMCJbmDbcB2Hn6JWKpTk7l/pM3tFco01Go3wsKebdCIj5ASk= - secure: dXbqdNJZzG06SsWBHTAXSlqBqKnuyLVaM0Oath9Ni/i5ZbAqCC1I750WZ3pmsQwpjHLSWVPUH7DgQ5MH0frIeaBj5ehrFBJXRmWcYQK2x0S+SIc5kVsJxIZXbLZ7wTsZS5QsrTVe8sRcdJ1XUruIzUH23ZFmBnVSQofXS0VQVEsdL94DEjh6GxGEuJtprqNKkXIAjOA9uFwNqQluJKdJI0GvkZdyTXUYwsFh5b1n9tiq0JeFs74MZet5viW+KRddBnckZBjV7MYzG7Zx4AlPrVmB+ug8+zTyNHKa2rnzehS6tbtIK01LWxK8g76uBP/i4miy/EKdBWYwcZf8RgkLqwmIYkL9NYiDk7rXqYx3q+5JKaGxkDhwjF8OSzuRbmit0/O5HWGSsDYxqXXkDJFkZhJBWBkFYDrK4vuYF+fafRn2Dn3B1Iv6Gj1GWY8LTmTkI0QtlorrEJ4kKyNAlxcnvM1G2xg6ZXOfDsdO/y0Sjl8sTdJad3XJaSttTj4HqdOJnyUzmzCcT6t8QV4Q/6qWVoCs842inYeudFpnmF4wMJADx3zzfJ5zB9VMQg/CPPy4SMaF1BITtCI+yCshbzNUqhUK2y6/Rs6AeRMOoO/FuyPSMV+GqBy2/qKEG7SSerxPYPniVizgY1NoceviWP0CkINHtuYo9TybRvpk6xP3/mE= - - secure: gs6SipJHOWK48rZzW9WDJwJqFfrB+zO7TtZ5+W5JYE9JNxpWJjG2QK89Kd16ex+n6v3bN6h0EPAUNZV3HwZwb/deoHuOHvLiH51kSYHgloFvMUBbeyKRGGGfxL7w+seFulh6IIcafjTk5ugdAF2mW5o69GArfC3fa/Bms8uIc0Pp5UGubiLFbn6oiGwFjf1o0UPVh369rEVlMjFNyRe8zSFNzroKnn6TmBvyhmPL0uPhty4GxtJ2R51wTNvOzKDm8vix3FHk5d4NZrdQwBIzSu0r4ocRISu/KhXm+L7Db5bSXVIdeeWvHJNjMFZ8wt2WkhbRCi3r9HtiI1z51+YMdTDLjmJXggXVdmMCNfD1lkaRz0gXdj8tI84Nj/gfrtRAtQxHTOt/Aa0Sm/iUL2NWkQbzU34SFWvs07mGosVcNhfVLBqSvCQowCeiHNt7yfbjZgFwuSyzLCMFTuQ+6GIclsL+E1TQT7wF3nDlfezhQecdw3+GJicBTFBY5FM98Ael+5fq+PY2vG+zuei1Ia4Eo3wD3+BEijk5i/U87Ll18iZUeRylhlVlVjGDTntTpbtGKyD3YqVcimS7tZ6zYgR/uonuH9R/aiWuSWeqlMrdT4SCwL15WujPQJJhB8gOKGHg7vkUgF7c8Fj/UMGayYewca4assAY8CMNneMLnR+eyW8= + - secure: VeRd+1WNVHeYJ59fvgmllQskwbtyITrR2v7loUCc6nagoAeSmwZfoe5/7lySIMO6/uzIJfkiaxNbzf3vatYKhDYqFd/V5Swrdu3RUxT7xGgvGpOzaOrhrHi+CPR/1dk58/bgGHmy8q8S0uesId/45LeRELEbcCRiCtlWZFz4UuHFKq0qRJv6LdFZvWFk6F/BcWq7n8fC7SGT+Krn0lf0HR3zX4UyUNPrWKe15l0h+Z05RYcwOLi5cK+sqFx0os5SU44DfYkO2jIsRhqDMJc+ipC/YZfpYxC+V/AT4CieDIcq/JFB2DHx/bnknm8OE62a1IvZJ+hbl6Uv/Khx/QnoGf3wgVZ3guKWju+SnzeZTLU2aTQcyArKmTKVD0me/M95TzjbCLECwwCrNdTGpXuqTOlnLIrjXUvpJ1Q+5EVk+8ULrYbCeYhVqJ/+iYvDUjehmBd+yW59Mgvt0eCc4IeabYpNBgiZ44UCHSt57GfH95xbZQmjkW4k8Lm1uoyRypzD8tcUgi2bkwoiK/jNaNQ562AoxUqIEPnbzw+2EkadULkRr4nianbvAgACaMZsavLBqUpchi4x6vnRaWGUiz71h9FCHS5e8eRV/UFe2AS37CDt4ycuAyumaFfTd0eQdF6w4WomYkRz+U+q7AhkZvsBorQjbGS6jI0LPUuozFi2Yno= \ No newline at end of file diff --git a/isabl_cli/batch_systems/slurm.py b/isabl_cli/batch_systems/slurm.py index 15c0818..d0e1615 100644 --- a/isabl_cli/batch_systems/slurm.py +++ b/isabl_cli/batch_systems/slurm.py @@ -87,9 +87,9 @@ def submit_slurm_array( Arguments: commands (list): of (path to bash script, on exit command) tuples. - requirements (str): string of LSF requirements. + requirements (str): string of SLURM requirements. jobname (str): slurm array jobname. - extra_args (str): extra LSF args. + extra_args (str): extra SLURM args. throttle_by (int): max number of jobs running at same time. wait (bool): if true, wait until clean command finishes. @@ -122,41 +122,56 @@ def submit_slurm_array( # submit a dependency job on failure # important when the scheduler kills the head job dependency = "${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}" - afternotok = ( + after_not_ok_job = ( f"sbatch {extra_args} --depend=afternotok:{dependency} --kill-on-invalid-dep yes " - f'-o {join(rundir, "head_job.exit")} -J "EXIT: {dependency}" ' + f'--export=TMP,TMPDIR,TMP_DIR -o {join(rundir, "head_job.exit")} -J "EXIT: {dependency}" ' f"<< EOF\n#!/bin/bash\n{exit_command}\nEOF\n" ) # use random sleep to avoid parallel API hits f.write( - f"#!/bin/sh\nsleep {random.uniform(0, 10):.3} && " - f"({afternotok}) && bash {command}" + f"#!/bin/bash\n\n" + f"sleep {random.uniform(0, 10):.3} && " + f"({after_not_ok_job}) && bash {command}" ) - for j in "log", "err", "exit": + for j in "log", "err", "exit", "slurm": src = join(rundir, f"head_job.{j}") dst = join(root, f"{j}.{index}") open(src, "w").close() utils.force_symlink(src, dst) with open(join(root, "in.sh"), "w") as f: - f.write(f"#!/bin/sh\nbash {root}/in.$SLURM_ARRAY_TASK_ID") + f.write(f"#!/bin/bash\nbash {root}/in.$SLURM_ARRAY_TASK_ID") with open(join(root, "clean.sh"), "w") as f: - f.write(f"#!/bin/sh\nrm -rf {root}") + f.write(f"#!/bin/bash\nrm -rf {root}") + # Main job array cmd = ( f"sbatch {requirements} {extra_args} --array 1-{total}%{throttle_by} " f"-o '{root}/log.%a' -e '{root}/err.%a' " f'-J "ISABL: {jobname}" --parsable {root}/in.sh' ) - jobid = subprocess.check_output(cmd, shell=True).decode("utf-8").strip() + # Job to print out slurm job metrics upon main job completion + seff_jobids = [] + for i in range(1, total + 1): + seff_cmd = ( + f"sbatch {extra_args} --kill-on-invalid-dep=yes " + f"--dependency=afterany:{jobid}_{i} -o '{root}/slurm.{i}' -J 'SEFF: {jobname}' " + f"--wrap='seff {jobid}_{i}'" + ) + seff_jobid = subprocess.check_output(seff_cmd, shell=True).decode("utf-8").strip() + seff_jobids.append(seff_jobid.split()[-1]) + + # Job to clean job array rundir + with open(join(root, "clean.sh"), "w") as f: + f.write(f"#!/bin/bash\nrm -rf {root}") cmd = ( f"sbatch {extra_args} -J 'CLEAN: {jobname}' {wait} --kill-on-invalid-dep yes " - f"-o /dev/null -e /dev/null --depend=afterany:{jobid} --parsable {root}/clean.sh" + f"-o /dev/null -e /dev/null --depend=afterany:{':'.join(seff_jobids)} --parsable {root}/clean.sh" ) return subprocess.check_output(cmd, shell=True).decode("utf-8").strip() diff --git a/setup.py b/setup.py old mode 100644 new mode 100755