From 547aa0c5896de61a9ccbbc270804e5228451358d Mon Sep 17 00:00:00 2001
From: Alex Domingo
Date: Wed, 14 Dec 2022 16:37:33 +0100
Subject: [PATCH 1/2] poll job status while waiting for single-user server to be reachable

---
 batchspawner/batchspawner.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/batchspawner/batchspawner.py b/batchspawner/batchspawner.py
index 7c2836c..4e81228 100644
--- a/batchspawner/batchspawner.py
+++ b/batchspawner/batchspawner.py
@@ -448,6 +448,13 @@ async def start(self):
             # don't actually run the single-user server yet.
             if hasattr(self, "mock_port"):
                 self.port = self.mock_port
+            # Check if job is still running
+            status = await self.poll()
+            if status:
+                raise RuntimeError(
+                    "The Jupyter batch job started"
+                    " but died before launching the single-user server."
+                )
 
         self.db.commit()
         self.log.info(
From 451cf0a40ec73a8d3489b94d11606682bbf15133 Mon Sep 17 00:00:00 2001
From: Min RK
Date: Tue, 19 Mar 2024 09:35:15 +0100
Subject: [PATCH 2/2] update tests for early poll

- add expected additional poll call during start
- add test for new behavior when poll stops early
---
 batchspawner/tests/test_spawners.py | 64 ++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/batchspawner/tests/test_spawners.py b/batchspawner/tests/test_spawners.py
index bd378c5..e5d43c0 100644
--- a/batchspawner/tests/test_spawners.py
+++ b/batchspawner/tests/test_spawners.py
@@ -349,20 +349,19 @@ async def test_torque(db, event_loop):
         re.compile(r"ppn=5"),
         re.compile(r"^#PBS some_option_asdf", re.M),
     ]
+    poll_running = (
+        re.compile(r"sudo.*qstat"),
+        f"R{testhost}/1",
+    )
     script = [
         (re.compile(r"sudo.*qsub"), str(testjob)),
         (
             re.compile(r"sudo.*qstat"),
             "Q",
         ),  # pending
-        (
-            re.compile(r"sudo.*qstat"),
-            f"R{testhost}/1",
-        ),  # running
-        (
-            re.compile(r"sudo.*qstat"),
-            f"R{testhost}/1",
-        ),  # running
+        poll_running,
+        poll_running,
+        poll_running,
         (re.compile(r"sudo.*qdel"), "STOP"),
         (re.compile(r"sudo.*qstat"), ""),
     ]
@@ -394,17 +393,16 @@ async def test_moab(db, event_loop):
         re.compile(r"ppn=5"),
         re.compile(r"^#PBS some_option_asdf", re.M),
     ]
+    poll_running = (
+        re.compile(r"sudo.*mdiag"),
+        f'State="Running" AllocNodeList="{testhost}"',
+    )
     script = [
         (re.compile(r"sudo.*msub"), str(testjob)),
         (re.compile(r"sudo.*mdiag"), 'State="Idle"'),  # pending
-        (
-            re.compile(r"sudo.*mdiag"),
-            f'State="Running" AllocNodeList="{testhost}"',
-        ),  # running
-        (
-            re.compile(r"sudo.*mdiag"),
-            f'State="Running" AllocNodeList="{testhost}"',
-        ),  # running
+        poll_running,
+        poll_running,
+        poll_running,
         (re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
         (re.compile(r"sudo.*mdiag"), ""),
     ]
@@ -436,17 +434,16 @@ async def test_pbs(db, event_loop):
         re.compile(r"@some_pbs_admin_node"),
         re.compile(r"^#PBS some_option_asdf", re.M),
     ]
+    poll_running = (
+        re.compile(r"sudo.*qstat"),
+        f"job_state = R\nexec_host = {testhost}/2*1",
+    )
     script = [
         (re.compile(r"sudo.*qsub"), str(testjob)),
         (re.compile(r"sudo.*qstat"), "job_state = Q"),  # pending
-        (
-            re.compile(r"sudo.*qstat"),
-            f"job_state = R\nexec_host = {testhost}/2*1",
-        ),  # running
-        (
-            re.compile(r"sudo.*qstat"),
-            f"job_state = R\nexec_host = {testhost}/2*1",
-        ),  # running
+        poll_running,
+        poll_running,
+        poll_running,
         (re.compile(r"sudo.*qdel"), "STOP"),
         (re.compile(r"sudo.*qstat"), ""),
     ]
@@ -504,6 +501,7 @@ async def test_slurm(db, event_loop):
         ),  # unknown
         (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),  # running
         (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
+        (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
         (re.compile(r"sudo.*scancel"), "STOP"),
         (re.compile(r"sudo.*squeue"), ""),
     ]
@@ -573,6 +571,7 @@ async def test_condor(db, event_loop):
         (re.compile(r"sudo.*condor_q"), "1,"),  # pending
         (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),  # runing
         (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
+        (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
         (re.compile(r"sudo.*condor_rm"), "STOP"),
         (re.compile(r"sudo.*condor_q"), ""),
     ]
@@ -611,6 +610,7 @@ async def test_lfs(db, event_loop):
         (re.compile(r"sudo.*bjobs"), "PEND "),  # pending
         (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),  # running
         (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
+        (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
         (re.compile(r"sudo.*bkill"), "STOP"),
         (re.compile(r"sudo.*bjobs"), ""),
     ]
@@ -652,3 +652,19 @@ async def test_keepvars(db, event_loop):
         spawner_kwargs=spawner_kwargs,
         batch_script_re_list=batch_script_re_list,
     )
+
+
+async def test_early_stop(db, event_loop):
+    script = [
+        (re.compile(r"sudo.*sbatch"), str(testjob)),
+        (re.compile(r"sudo.*squeue"), "PENDING "),  # pending
+        (
+            re.compile(r"sudo.*squeue"),
+            "slurm_load_jobs error: Unable to contact slurm controller",
+        ),  # unknown
+        # job exits early during start
+        (re.compile(r"sudo.*squeue"), ""),
+        (re.compile(r"sudo.*scancel"), "STOP"),
+    ]
+    with pytest.raises(RuntimeError, match="job has disappeared"):
+        await run_spawner_script(db, SlurmSpawner, script)