Skip to content

Commit

Permalink
Merge pull request #4572 from jedwards4b/jedwards/fix_dry_run_and_resolve_subgroup
Browse files Browse the repository at this point in the history

fix issue getting correct parameters for job subgroups
Merge without fixing e3sm test - seems to be a container issue.
  • Loading branch information
jedwards4b authored Feb 1, 2024
2 parents 6490e2e + da717c3 commit 3d961a9
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 42 deletions.
45 changes: 22 additions & 23 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,7 @@ def submit_jobs(
dry_run=dry_run,
workflow=workflow,
)

batch_job_id = str(alljobs.index(job)) if dry_run else result
depid[job] = batch_job_id
jobcmds.append((job, result))
Expand Down Expand Up @@ -921,35 +922,33 @@ def _submit_single_job(
logger.info("Starting job script {}".format(job))
function_name = job.replace(".", "_")
job_name = "." + job
if not dry_run:
args = self._build_run_args(
job,
True,
skip_pnl=skip_pnl,
set_continue_run=resubmit_immediate,
submit_resubmits=workflow and not resubmit_immediate,
)
try:
if hasattr(case, function_name):
getattr(case, function_name)(
**{k: v for k, (v, _) in args.items()}
)
args = self._build_run_args(
job,
True,
skip_pnl=skip_pnl,
set_continue_run=resubmit_immediate,
submit_resubmits=workflow and not resubmit_immediate,
)
try:
if hasattr(case, function_name):
getattr(case, function_name)(**{k: v for k, (v, _) in args.items()})
else:
expect(
os.path.isfile(job_name),
"Could not find file {}".format(job_name),
)
if dry_run:
return os.path.join(self._caseroot, job_name)
else:
expect(
os.path.isfile(job_name),
"Could not find file {}".format(job_name),
)
run_cmd_no_fail(
os.path.join(self._caseroot, job_name),
combine_output=True,
verbose=True,
from_dir=self._caseroot,
)
except Exception as e:
# We don't want exception from the run phases getting into submit phase
logger.warning(
"Exception from {}: {}".format(function_name, str(e))
)
except Exception as e:
# We don't want exception from the run phases getting into submit phase
logger.warning("Exception from {}: {}".format(function_name, str(e)))

return

Expand Down Expand Up @@ -1088,10 +1087,10 @@ def _submit_single_job(
# add ' before cd $CASEROOT and at end of command
submitcmd = submitcmd.replace("cd $CASEROOT", "'cd $CASEROOT") + "'"

submitcmd = case.get_resolved_value(submitcmd, subgroup=job)
if dry_run:
return submitcmd
else:
submitcmd = case.get_resolved_value(submitcmd)
logger.info("Submitting job script {}".format(submitcmd))
output = run_cmd_no_fail(submitcmd, combine_output=True)
jobid = self.get_job_id(output)
Expand Down
8 changes: 7 additions & 1 deletion CIME/XML/env_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,13 @@ def get_job_specs(self, case, job):
if ngpus_per_node > max_gpus_per_node:
ngpus_per_node = max_gpus_per_node

return task_count, num_nodes, tasks_per_node, thread_count, ngpus_per_node
return (
task_count,
num_nodes,
tasks_per_node,
thread_count,
ngpus_per_node,
)

# pylint: disable=arguments-differ
def get_value(self, item, attribute=None, resolved=True, subgroup="PRIMARY"):
Expand Down
7 changes: 5 additions & 2 deletions CIME/XML/generic_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,9 @@ def set_value(

return value if valnodes else None

def get_resolved_value(self, raw_value, allow_unresolved_envvars=False):
def get_resolved_value(
self, raw_value, allow_unresolved_envvars=False, subgroup=None
):
"""
A value in the xml file may contain references to other xml
variables or to environment variables. These are referred to in
Expand Down Expand Up @@ -662,7 +664,8 @@ def get_resolved_value(self, raw_value, allow_unresolved_envvars=False):
logger.debug("find: {}".format(var))
# The overridden versions of this method do not simply return None
# so the pylint should not be flagging this
ref = self.get_value(var) # pylint: disable=assignment-from-none
# pylint: disable=assignment-from-none
ref = self.get_value(var, subgroup=subgroup)

if ref is not None:
logger.debug("resolve: " + str(ref))
Expand Down
11 changes: 8 additions & 3 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def get_value(self, item, attribute=None, resolved=True, subgroup=None):

if result is not None:
if resolved and isinstance(result, str):
result = self.get_resolved_value(result)
result = self.get_resolved_value(result, subgroup=subgroup)
vtype = env_file.get_type_info(item)
if vtype is not None and vtype != "char":
result = convert_to_type(result, vtype, item)
Expand Down Expand Up @@ -548,13 +548,17 @@ def get_type_info(self, item):

return result

def get_resolved_value(self, item, recurse=0, allow_unresolved_envvars=False):
def get_resolved_value(
self, item, recurse=0, allow_unresolved_envvars=False, subgroup=None
):
num_unresolved = item.count("$") if item else 0
recurse_limit = 10
if num_unresolved > 0 and recurse < recurse_limit:
for env_file in self._env_entryid_files:
item = env_file.get_resolved_value(
item, allow_unresolved_envvars=allow_unresolved_envvars
item,
allow_unresolved_envvars=allow_unresolved_envvars,
subgroup=subgroup,
)
if "$" not in item:
return item
Expand All @@ -563,6 +567,7 @@ def get_resolved_value(self, item, recurse=0, allow_unresolved_envvars=False):
item,
recurse=recurse + 1,
allow_unresolved_envvars=allow_unresolved_envvars,
subgroup=subgroup,
)

return item
Expand Down
26 changes: 14 additions & 12 deletions CIME/case/case_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
from CIME.locked_files import unlock_file, lock_file
from CIME.test_status import *

import socket

logger = logging.getLogger(__name__)


Expand All @@ -39,6 +37,7 @@ def _submit(
batch_args=None,
workflow=True,
chksum=False,
dryrun=False,
):
if job is None:
job = case.get_first_job()
Expand Down Expand Up @@ -164,9 +163,6 @@ def _submit(
case.check_case(skip_pnl=skip_pnl, chksum=chksum)
if job == case.get_primary_job():
case.check_DA_settings()
if case.get_value("MACH") == "mira":
with open(".original_host", "w") as fd:
fd.write(socket.gethostname())

# Load Modules
case.load_env()
Expand All @@ -185,16 +181,20 @@ def _submit(
mail_type=mail_type,
batch_args=batch_args,
workflow=workflow,
dry_run=dryrun,
)

xml_jobids = []
for jobname, jobid in job_ids.items():
logger.info("Submitted job {} with id {}".format(jobname, jobid))
if jobid:
xml_jobids.append("{}:{}".format(jobname, jobid))
if dryrun:
for job in job_ids:
xml_jobids.append("{}:{}".format(job[0], job[1]))
else:
for jobname, jobid in job_ids.items():
logger.info("Submitted job {} with id {}".format(jobname, jobid))
if jobid:
xml_jobids.append("{}:{}".format(jobname, jobid))

xml_jobid_text = ", ".join(xml_jobids)
if xml_jobid_text:
if xml_jobid_text and not dryrun:
case.set_value("JOB_IDS", xml_jobid_text)

return xml_jobid_text
Expand All @@ -214,6 +214,7 @@ def submit(
batch_args=None,
workflow=True,
chksum=False,
dryrun=False,
):
if resubmit_immediate and self.get_value("MACH") in ["mira", "cetus"]:
logger.warning(
Expand Down Expand Up @@ -266,6 +267,7 @@ def submit(
batch_args=batch_args,
workflow=workflow,
chksum=chksum,
dryrun=dryrun,
)
run_and_log_case_status(
functor,
Expand Down Expand Up @@ -353,7 +355,7 @@ def check_case(self, skip_pnl=False, chksum=False):

expect(
self.get_value("BUILD_COMPLETE"),
"Build complete is " "not True please rebuild the model by calling case.build",
"Build complete is not True please rebuild the model by calling case.build",
)
logger.info("Check case OK")

Expand Down
1 change: 1 addition & 0 deletions CIME/tests/test_unit_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def test_submit(
batch_args=None,
workflow=True,
chksum=True,
dryrun=False,
)


Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ RUN curl -L -k -o "${PWD}/pnetcdf.tar.gz" \
make install && \
rm -rf "${PWD}/pnetcdf"

RUN mkdir /root/.cime
RUN mkdir -p /root/.cime /storage/timings

COPY config_machines.xml /root/.cime/
COPY docker.cmake /root/.cime/
Expand Down

0 comments on commit 3d961a9

Please sign in to comment.