From cde68f9608fb402844dd14d8a56ba0179defbcaf Mon Sep 17 00:00:00 2001
From: Jim Garlick
Date: Fri, 1 Nov 2024 08:06:32 -0700
Subject: [PATCH] job-exec: send SIGUSR1 to the IMP, not SIGKILL

Problem: RFC 15 states that the IMP handles SIGUSR1 by sending
SIGKILL to the entire cgroup.

For multi-user, send the IMP SIGUSR1 rather than SIGKILL after
shell signaling mechanisms have failed to clean up.
---
 src/modules/job-exec/job-exec.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/modules/job-exec/job-exec.c b/src/modules/job-exec/job-exec.c
index e71a5f4393ff..646acd13ef4e 100644
--- a/src/modules/job-exec/job-exec.c
+++ b/src/modules/job-exec/job-exec.c
@@ -438,13 +438,21 @@ static void kill_shell_timer_cb (flux_reactor_t *r,
 {
     struct jobinfo *job = arg;
     struct idset *active_ranks;
+    int actual_kill_signal = kill_signal;
+
+    /* RFC 15 states that the IMP handles SIGUSR1 by sending SIGKILL to
+     * the entire cgroup. Sending SIGKILL to the IMP is not productive.
+     */
+    if (job->multiuser)
+        actual_kill_signal = SIGUSR1;
 
     flux_log (job->h,
               LOG_DEBUG,
-              "Sending %s to job shell for job %s",
-              sigutil_signame (kill_signal),
+              "Sending %s to %s for job %s",
+              sigutil_signame (actual_kill_signal),
+              job->multiuser ? "IMP" : "job shell",
               idf58 (job->id));
-    (*job->impl->kill) (job, kill_signal);
+    (*job->impl->kill) (job, actual_kill_signal);
     job->kill_shell_count++;
 
     /* Since we've transitioned to killing the shell directly, stop the