From 59f51d7a65890822cb727c8665cab45e983e60a2 Mon Sep 17 00:00:00 2001 From: Christopher Harrop Date: Thu, 30 May 2019 14:43:41 +0000 Subject: [PATCH] Fix bug in slurm jobid recovery code. A missing return statement caused incorrect text to be assigned to the jobid. --- lib/workflowmgr/slurmbatchsystem.rb | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/workflowmgr/slurmbatchsystem.rb b/lib/workflowmgr/slurmbatchsystem.rb index 7005247..de1853e 100644 --- a/lib/workflowmgr/slurmbatchsystem.rb +++ b/lib/workflowmgr/slurmbatchsystem.rb @@ -289,6 +289,10 @@ def submit(task) queued_jobs="" errors="" exit_status=0 + + # Wait a few seconds for job information to propagate before checking whether the job was submitted anyway + sleep(5) + begin # Get the username of this process @@ -316,9 +320,9 @@ def submit(task) # Look for a job that matches the randomID we inserted into the comment queued_jobs.split("\n").each { |job| - # Skip headers - next if job=~/CLUSTER/ - next if job=~/JOBID/ + # Skip headings + next if job[0..4] == 'JOBID' + next if job[0..7] == 'CLUSTER:' # Extract job id jobid=job[0..39].strip @@ -331,6 +335,10 @@ def submit(task) end } + WorkflowMgr.stderr("WARNING: Unable to retrieve jobid after sbatch failed with socket time out when submitting #{task.attributes[:name]}",1) + + return nil,output + else WorkflowMgr.stderr("WARNING: job submission failed: #{output}", 1) return nil,output