From ec3ce12b89914e8f7cc51d7c12064273fea54d41 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 1 Nov 2024 17:36:13 +0000 Subject: [PATCH 1/3] imp: signals: do not use the SA_RESTART flag Problem: The SA_RESTART flag is used when establishing signal handlers in the IMP, but this is not necessary (affected system calls like waitpid(2) already check for EINTR). It may also be convenient in future code to allow blocking system calls to be interrupted by signals to force the recheck of a condition, such as checking the contents of cgroup.procs. Drop the use of the SA_RESTART flag. --- src/imp/signals.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imp/signals.c b/src/imp/signals.c index d43ad79..670398c 100644 --- a/src/imp/signals.c +++ b/src/imp/signals.c @@ -93,7 +93,7 @@ void imp_setup_signal_forwarding (struct imp_state *imp) memset(&sa, 0, sizeof(sa)); sa.sa_handler = fwd_signal; - sa.sa_flags = SA_RESTART; + sa.sa_flags = 0; sigemptyset(&sa.sa_mask); sigfillset (&mask); From 3feae11157a56d44da3efc802e3e025c8674ee13 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 1 Nov 2024 18:35:45 +0000 Subject: [PATCH 2/3] imp: exec: wait for empty cgroup before exiting Problem: The IMP immediately exits after waitpid(2) returns for its immediate child, but this could leave extra processes running in the job's cgroup. Wait for the job cgroup to be empty before allowing the IMP to exit. An empty cgroup is considered one in which the only process is cgroup.procs is the IMP itself. --- src/imp/cgroup.c | 23 +++++++++++++++++++++++ src/imp/cgroup.h | 4 ++++ src/imp/exec/exec.c | 3 +++ 3 files changed, 30 insertions(+) diff --git a/src/imp/cgroup.c b/src/imp/cgroup.c index 8591164..9375ec2 100644 --- a/src/imp/cgroup.c +++ b/src/imp/cgroup.c @@ -241,5 +241,28 @@ int cgroup_kill (struct cgroup_info *cgroup, int sig) return count; } +int cgroup_wait_for_empty (struct cgroup_info *cgroup) +{ + int n; + + /* Only wait for empty cgroup if cgroup kill is enabled. + */ + if (!cgroup->use_cgroup_kill) + return 0; + + while ((n = cgroup_kill (cgroup, 0)) > 0) { + /* Note: inotify/poll() do not work on the cgroup.procs virtual + * file. Therefore, wait at most 1s and check to see if the cgroup + * is empty again. If the job execution system requests a signal to + * be delivered then the sleep will be interrupted, in which case a + * a small delay is added in hopes that any terminated processes + * will have been removed from cgroup.procs by then. + */ + if (usleep (1e6) < 0 && errno == EINTR) + usleep (2000); + } + return 0; +} + /* vi: ts=4 sw=4 expandtab */ diff --git a/src/imp/cgroup.h b/src/imp/cgroup.h index 0c9bdc5..2c0169e 100644 --- a/src/imp/cgroup.h +++ b/src/imp/cgroup.h @@ -27,4 +27,8 @@ void cgroup_info_destroy (struct cgroup_info *cgroup); */ int cgroup_kill (struct cgroup_info *cgroup, int sig); +/* Wait for all processes in cgroup (except this one) to exit. + */ +int cgroup_wait_for_empty (struct cgroup_info *cgroup); + #endif /* !HAVE_IMP_CGROUP_H */ diff --git a/src/imp/exec/exec.c b/src/imp/exec/exec.c index 005b38f..c880995 100644 --- a/src/imp/exec/exec.c +++ b/src/imp/exec/exec.c @@ -293,6 +293,9 @@ int imp_exec_privileged (struct imp_state *imp, struct kv *kv) imp_die (1, "waitpid: %s", strerror (errno)); } + if (cgroup_wait_for_empty (exec->imp->cgroup) < 0) + imp_warn ("error waiting for processes in job cgroup"); + #if HAVE_PAM /* Call privliged IMP plugins/containment finalization */ if (imp_supports_pam (exec)) From 1e23343506ebcb4116ce83ad04d64cecd7c3260c Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 1 Nov 2024 18:37:57 +0000 Subject: [PATCH 3/3] testsuite: add test that ensures the IMP waits for an empty cgroup Problem: No test ensures that imp exec waits for the job cgroup to be empty before exiting. Add a test that sends SIGTERM to an IMP managing a process tree. This kills the immediate child of the IMP, and the IMP must wait until SIGUSR1 is sent, which terminates the cgroup with SIGKILL. --- t/t2000-imp-exec.t | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/t/t2000-imp-exec.t b/t/t2000-imp-exec.t index 98a5435..0cfd3a4 100755 --- a/t/t2000-imp-exec.t +++ b/t/t2000-imp-exec.t @@ -329,6 +329,22 @@ test_expect_success SUDO,CGROUPFS,NO_CHAIN_LINT \ test_must_be_empty ${CGROUP_PATH}/cgroup.procs ' +test_expect_success SUDO,CGROUPFS,NO_CHAIN_LINT \ + 'flux-imp exec: SIGUSR1 waits for cgroup to be empty' ' + fake_input_sign_none | \ + $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ + ./run-in-cgroup.sh "$CGROUP_PATH" \ + $flux_imp exec $(pwd)/sleeper.sh 15 & + imp_pid=$! && + test_when_finished "rm -f sleeper.pid" && + wait_for_file sleeper.pid && + kill -TERM $(cat sleeper.pid) && + sleep .5 && + kill -USR1 $(cat sleeper.pid) && + test_expect_code 143 wait $imp_pid && + test_must_be_empty ${CGROUP_PATH}/cgroup.procs +' + $flux_imp version | grep -q pam || test_set_prereq NO_PAM test_expect_success NO_PAM,SUDO 'flux-imp exec: fails if not built with PAM but pam-support=true' ' ( export FLUX_IMP_CONFIG_PATTERN=pam-test.toml &&