From 649e65fa15b65f45ab35c56d3bf509769cda4d27 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Mon, 4 Nov 2024 05:55:10 -0800 Subject: [PATCH 1/7] build: require flux-security >= 0.13.0 Problem: flux core now requires the IMP signal forwarding features of flux-security 0.13.0, but configure only checks for >= 0.9.0. Modify configure to require >= 0.13.0. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 23af5a3289c4..7eb756f813e7 100644 --- a/configure.ac +++ b/configure.ac @@ -399,7 +399,7 @@ AS_IF([test x$enable_code_coverage = xyes], [ AC_ARG_WITH([flux-security], AS_HELP_STRING([--with-flux-security], [Build with flux-security])) AS_IF([test "x$with_flux_security" = "xyes"], [ - PKG_CHECK_MODULES([FLUX_SECURITY], [flux-security >= 0.9.0], + PKG_CHECK_MODULES([FLUX_SECURITY], [flux-security >= 0.13.0], [flux_sec_incdir=`$PKG_CONFIG --variable=includedir flux-security`], [flux_sec_incdir=;]) AS_IF([test "x$flux_sec_incdir" = x], From ec7098cff81489fe8a18585d30d38abcf066a81a Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Mon, 4 Nov 2024 05:59:29 -0800 Subject: [PATCH 2/7] ci: specify flux-security-0.13.0 Problem: flux core now requires the IMP signal forwarding features of flux-security 0.13.0, but CI specifies requires 0.11.0. Modify docker-run-checks.sh to require the newer version. --- src/test/docker/docker-run-checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/docker/docker-run-checks.sh b/src/test/docker/docker-run-checks.sh index d8ad6806b209..566ac9586100 100755 --- a/src/test/docker/docker-run-checks.sh +++ b/src/test/docker/docker-run-checks.sh @@ -16,7 +16,7 @@ JOBS=2 MOUNT_HOME_ARGS="--volume=$HOME:$HOME -e HOME" if test "$PROJECT" = "flux-core"; then - FLUX_SECURITY_VERSION=0.11.0 + FLUX_SECURITY_VERSION=0.13.0 POISON=t fi From ab6a1acacd35f563f1147b764bff99f7575baa06 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 1 Nov 2024 07:55:07 -0700 Subject: [PATCH 3/7] job-exec: don't use flux imp kill Problem: job-exec uses 'flux imp kill' to deliver signals to multi-user jobs but that command is deprecated. Don't call bulk_exec_set_imp_path(). --- src/modules/job-exec/exec.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/modules/job-exec/exec.c b/src/modules/job-exec/exec.c index 6d0f054f51a8..149716027c72 100644 --- a/src/modules/job-exec/exec.c +++ b/src/modules/job-exec/exec.c @@ -560,10 +560,6 @@ static int exec_init (struct jobinfo *job) flux_log_error (job->h, "exec_init: bulk_exec_create"); goto err; } - if (job->multiuser && bulk_exec_set_imp_path (exec, imp_path)) { - flux_log_error (job->h, "exec_ctx_create: bulk_exec_set_imp_path"); - goto err; - } if (!(ctx = exec_ctx_create (job, ranks, &error))) { flux_log (job->h, LOG_ERR, "exec_ctx_create: %s", error.text); goto err; From a891285656325e20b4a18e3fcae07c4d753973bd Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 1 Nov 2024 08:06:32 -0700 Subject: [PATCH 4/7] job-exec: send SIGUSR1 to the IMP, not SIGKILL Problem: RFC 15 states that the IMP handles SIGUSR1 by sending SIGKILL to the entire cgroup. For multi-user, send the IMP SIGUSR1 rather than SIGKILL after shell signaling mechanisms have failed to clean up. Update test faux imp shell script used in test. --- src/modules/job-exec/job-exec.c | 14 +++++++++++--- t/job-exec/imp-fail.sh | 10 ---------- t/t2404-job-exec-multiuser.t | 5 ++--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/modules/job-exec/job-exec.c b/src/modules/job-exec/job-exec.c index e71a5f4393ff..646acd13ef4e 100644 --- a/src/modules/job-exec/job-exec.c +++ b/src/modules/job-exec/job-exec.c @@ -438,13 +438,21 @@ static void kill_shell_timer_cb (flux_reactor_t *r, { struct jobinfo *job = arg; struct idset *active_ranks; + int actual_kill_signal = kill_signal; + + /* RFC 15 states that the IMP handles SIGUSR1 by sending SIGKILL to + * the entire cgroup. Sending SIGKILL to the IMP is not productive. + */ + if (job->multiuser) + actual_kill_signal = SIGUSR1; flux_log (job->h, LOG_DEBUG, - "Sending %s to job shell for job %s", - sigutil_signame (kill_signal), + "Sending %s to %s for job %s", + sigutil_signame (actual_kill_signal), + job->multiuser ? "IMP" : "job shell", idf58 (job->id)); - (*job->impl->kill) (job, kill_signal); + (*job->impl->kill) (job, actual_kill_signal); job->kill_shell_count++; /* Since we've transitioned to killing the shell directly, stop the diff --git a/t/job-exec/imp-fail.sh b/t/job-exec/imp-fail.sh index 4ab0fc921aeb..c29dcbdfef57 100755 --- a/t/job-exec/imp-fail.sh +++ b/t/job-exec/imp-fail.sh @@ -14,16 +14,6 @@ case "$cmd" in printf "test-imp: Going to fail on rank 1\n" >&2 if test $(flux getattr rank) = 1; then exit 0; fi exec "$@" ;; - kill) - # Note: kill must be implemented in test since job-exec - # module will run `flux-imp kill PID`. - # - signal=$2; - pid=$3; - printf "test-imp: kill -$signal $pid\n" >&2 - shift 3; - printf "test-imp: Kill pid $pid signal $signal\n" >&2 - kill -$signal $pid ;; *) printf "test-imp: Fatal: Unknown cmd=$cmd\n" >&2; exit 1 ;; esac diff --git a/t/t2404-job-exec-multiuser.t b/t/t2404-job-exec-multiuser.t index 7a7ee1c1d524..aeea73dfb546 100755 --- a/t/t2404-job-exec-multiuser.t +++ b/t/t2404-job-exec-multiuser.t @@ -81,7 +81,7 @@ test_expect_success 'job-exec: reconfig and reload module' ' flux config reload && flux module reload -f job-exec ' -test_expect_success NO_ASAN 'job-exec: kill multiuser job uses the IMP' ' +test_expect_success NO_ASAN 'job-exec: kill multiuser job works' ' FAKE_USERID=42 && flux run --dry-run -n2 -N2 sleep 1000 | \ flux python ${SIGN_AS} ${FAKE_USERID} > sleep-job.signed && @@ -91,8 +91,7 @@ test_expect_success NO_ASAN 'job-exec: kill multiuser job uses the IMP' ' jq -e ".userid == 42" < ${id}.json && flux job wait-event -p exec -vt 30 ${id} shell.start && flux cancel ${id} && - test_expect_code 143 run_timeout 30 flux job status -v ${id} && - flux dmesg | grep "test-imp: Kill .*signal 15" + test_expect_code 143 run_timeout 30 flux job status -v ${id} ' # Configure failing IMP From 9fdfebe849ecfd91edf4bc4b0d40a506a80c7917 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 1 Nov 2024 11:00:56 -0700 Subject: [PATCH 5/7] job-manager: don't use flux imp kill Problem: housekeeping and perilog use 'flux imp kill' to send signals to housekeeping and prolog/epilog processes, but the IMP will now forward signals and 'flux imp kill' is deprecated. Don't call bulk_exec_set_imp_path() in housekeeping and perilog. Fixes #6409 --- src/modules/job-manager/housekeeping.c | 2 -- src/modules/job-manager/plugins/perilog.c | 9 --------- 2 files changed, 11 deletions(-) diff --git a/src/modules/job-manager/housekeeping.c b/src/modules/job-manager/housekeeping.c index dde91f617ab0..9d1de0e4e5ae 100644 --- a/src/modules/job-manager/housekeeping.c +++ b/src/modules/job-manager/housekeeping.c @@ -190,8 +190,6 @@ static struct allocation *allocation_create (struct housekeeping *hk, id, "housekeeping", a)) - || (hk->imp_path - && bulk_exec_set_imp_path (a->bulk_exec, hk->imp_path) < 0) || update_cmd_env (hk->cmd, id, userid) < 0 || bulk_exec_push_cmd (a->bulk_exec, a->pending, hk->cmd, 0) < 0) { allocation_destroy (a); diff --git a/src/modules/job-manager/plugins/perilog.c b/src/modules/job-manager/plugins/perilog.c index 4130cd27d05c..7f626870c44c 100644 --- a/src/modules/job-manager/plugins/perilog.c +++ b/src/modules/job-manager/plugins/perilog.c @@ -827,15 +827,6 @@ static struct perilog_proc *procdesc_run (flux_t *h, idf58 (id)); goto error; } - /* If using IMP, push path to IMP into bulk_exec for IMP kill support: - */ - if (pd->uses_imp - && bulk_exec_set_imp_path (bulk_exec, perilog_config.imp_path) < 0) { - flux_log_error (h, - "%s: failed to set IMP path", - perilog_proc_name (proc)); - goto error; - } if (bulk_exec_start (h, bulk_exec) < 0) { flux_log_error (h, "%s: bulk_exec_start", perilog_proc_name (proc)); goto error; From cef76a7e08b42bde3d2f2410008b1f11307974d8 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Mon, 4 Nov 2024 06:54:12 -0800 Subject: [PATCH 6/7] flux-exec: don't use flux imp kill Problem: flux-exec uses 'flux imp kill' to send signals to remote process if the IMP started them, but the IMP will now forward signals and 'flux imp kill' is deprecated. Don't call 'flux imp kill'. When the IMP starts a process, translate SIGKILL to SIGUSR1 per RFC 15. Drop note about 'flux imp kill' from the flux-exec(1) man page. --- doc/man1/flux-exec.rst | 6 +--- src/cmd/flux-exec.c | 67 ++++++++---------------------------------- 2 files changed, 14 insertions(+), 59 deletions(-) diff --git a/doc/man1/flux-exec.rst b/doc/man1/flux-exec.rst index 8d8af5a02711..ec42202fca5e 100644 --- a/doc/man1/flux-exec.rst +++ b/doc/man1/flux-exec.rst @@ -95,11 +95,7 @@ OPTIONS Prepend the full path to :program:`flux-imp run` to *COMMAND*. This option is mostly meant for testing or as a convenience to execute a configured - ``prolog`` or ``epilog`` command under the IMP. Note: When this option is - used, or if :program:`flux-imp` is detected as the first argument of - *COMMAND*, :program:`flux exec` will use :program:`flux-imp kill` to - signal remote commands instead of the normal builtin subprocess signaling - mechanism. + ``prolog`` or ``epilog`` command under the IMP. CAVEATS ======= diff --git a/src/cmd/flux-exec.c b/src/cmd/flux-exec.c index 173f127bb107..6e29e9cddd85 100644 --- a/src/cmd/flux-exec.c +++ b/src/cmd/flux-exec.c @@ -335,68 +335,27 @@ static void stdin_cb (flux_reactor_t *r, } } -static void kill_completion_cb (flux_subprocess_t *p) -{ - flux_subprocess_destroy (p); -} - -static flux_subprocess_t *imp_kill (flux_subprocess_t *p, int signum) -{ - flux_cmd_t *cmd; - flux_subprocess_ops_t ops = { - .on_completion = kill_completion_cb, - .on_state_change = NULL, - .on_channel_out = NULL, - .on_stdout = output_cb, - .on_stderr = output_cb, - }; - - pid_t pid = flux_subprocess_pid (p); - int rank = flux_subprocess_rank (p); - - if (!(cmd = flux_cmd_create (0, NULL, environ)) - || flux_cmd_argv_append (cmd, imp_path) < 0 - || flux_cmd_argv_append (cmd, "kill") < 0 - || flux_cmd_argv_appendf (cmd, "%d", signum) < 0 - || flux_cmd_argv_appendf (cmd, "-%ld", (long) pid) < 0) { - fprintf (stderr, - "Failed to create flux-imp kill command for rank %d pid %d\n", - rank, pid); - return NULL; - } - /* Note: subprocess object destroyed in completion callback - */ - return flux_rexec (flux_handle, - rank, - FLUX_SUBPROCESS_FLAGS_LOCAL_UNBUF, - cmd, - &ops); -} - static void killall (zlistx_t *l, int signum) { flux_subprocess_t *p = zlistx_first (l); while (p) { if (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING) { - if (use_imp) { - if (!imp_kill (p, signum)) + /* RFC 15 states that the IMP will treat SIGUSR1 as a surrogate + * for SIGKILL. + */ + if (use_imp && signum == SIGKILL) + signum = SIGUSR1; + + flux_future_t *f = flux_subprocess_kill (p, signum); + if (!f) { + if (optparse_getopt (opts, "verbose", NULL) > 0) fprintf (stderr, "failed to signal rank %d: %s\n", - flux_subprocess_rank (p), - strerror (errno)); - } - else { - flux_future_t *f = flux_subprocess_kill (p, signum); - if (!f) { - if (optparse_getopt (opts, "verbose", NULL) > 0) - fprintf (stderr, - "failed to signal rank %d: %s\n", - flux_subprocess_rank (p), - strerror (errno)); - } - /* don't care about response */ - flux_future_destroy (f); + flux_subprocess_rank (p), + strerror (errno)); } + /* don't care about response */ + flux_future_destroy (f); } p = zlistx_next (l); } From f6cf51d1850456dbdb696cee3ea8b2f3f09ec539 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Sun, 3 Nov 2024 14:30:06 -0800 Subject: [PATCH 7/7] libsubprocess: drop bulk_exec_set_imp_path() Problem: bulk_exec_set_imp_path() and associated code no longer has any users. Remove it. Update unit test. --- src/common/libsubprocess/bulk-exec.c | 182 +----------------- src/common/libsubprocess/bulk-exec.h | 5 - .../libsubprocess/test/bulk-exec-einval.c | 4 - 3 files changed, 4 insertions(+), 187 deletions(-) diff --git a/src/common/libsubprocess/bulk-exec.c b/src/common/libsubprocess/bulk-exec.c index 016f91263441..f4e7c7c4dd2a 100644 --- a/src/common/libsubprocess/bulk-exec.c +++ b/src/common/libsubprocess/bulk-exec.c @@ -37,7 +37,6 @@ struct bulk_exec { char *service; flux_jobid_t id; char *name; - char *imp_path; /* Path to IMP. If set, use IMP for kill */ struct aux_item *aux; @@ -504,7 +503,6 @@ void bulk_exec_destroy (struct bulk_exec *exec) aux_destroy (&exec->aux); free (exec->name); free (exec->service); - free (exec->imp_path); free (exec); errno = saved_errno; } @@ -553,19 +551,6 @@ int bulk_exec_set_max_per_loop (struct bulk_exec *exec, int max) return 0; } -int bulk_exec_set_imp_path (struct bulk_exec *exec, - const char *imp_path) -{ - if (!exec || !imp_path) { - errno = EINVAL; - return -1; - } - free (exec->imp_path); - if (!(exec->imp_path = strdup (imp_path))) - return -1; - return 0; -} - int bulk_exec_push_cmd (struct bulk_exec *exec, const struct idset *ranks, flux_cmd_t *cmd, @@ -683,14 +668,14 @@ void bulk_exec_kill_log_error (flux_future_t *f, flux_jobid_t id) } } -static flux_future_t *bulk_exec_simple_kill (struct bulk_exec *exec, - const struct idset *ranks, - int signum) +flux_future_t *bulk_exec_kill (struct bulk_exec *exec, + const struct idset *ranks, + int signum) { flux_subprocess_t *p; flux_future_t *cf; - if (!exec) { + if (!exec || signum < 0) { errno = EINVAL; return NULL; } @@ -738,165 +723,6 @@ static flux_future_t *bulk_exec_simple_kill (struct bulk_exec *exec, return cf; } -static void imp_kill_output (struct bulk_exec *kill, - flux_subprocess_t *p, - const char *stream, - const char *data, - int len, - void *arg) -{ - int rank = flux_subprocess_rank (p); - flux_log (kill->h, - LOG_INFO, - "%s (rank %d): imp kill: %.*s", - flux_get_hostbyrank (kill->h, rank), - rank, - len, - data); -} - -static void imp_kill_complete (struct bulk_exec *kill, void *arg) -{ - flux_future_t *f = arg; - if (bulk_exec_rc (kill) < 0) - flux_future_fulfill_error (f, 0, NULL); - else - flux_future_fulfill (f, NULL, NULL); -} - -static void imp_kill_error (struct bulk_exec *kill, - flux_subprocess_t *p, - void *arg) -{ - int rank = flux_subprocess_rank (p); - flux_log_error (kill->h, - "imp kill on %s (rank %d) failed", - flux_get_hostbyrank (kill->h, rank), - rank); -} - - -struct bulk_exec_ops imp_kill_ops = { - .on_output = imp_kill_output, - .on_error = imp_kill_error, - .on_complete = imp_kill_complete, -}; - -static int bulk_exec_push_one (struct bulk_exec *exec, - int rank, - flux_cmd_t *cmd, - int flags) -{ - int rc = -1; - struct idset *ids = idset_create (0, IDSET_FLAG_AUTOGROW); - if (!ids || idset_set (ids, rank) < 0) - return -1; - rc = bulk_exec_push_cmd (exec, ids, cmd, flags); - idset_destroy (ids); - return rc; -} - -/* Kill all currently executing processes in bulk-exec object `exec` - * using "flux-imp kill" helper for processes potentially running - * under a different userid. - * - * Spawns "flux-imp kill " on each rank. - */ -flux_future_t *bulk_exec_imp_kill (struct bulk_exec *exec, - const char *imp_path, - const struct idset *ranks, - int signum) -{ - struct bulk_exec *killcmd = NULL; - flux_subprocess_t *p = NULL; - flux_future_t *f = NULL; - int count = 0; - - if (!exec || !imp_path) { - errno = EINVAL; - return NULL; - } - - /* Empty future for return value - */ - if (!(f = flux_future_create (NULL, NULL))) { - flux_log_error (exec->h, "bulk_exec_imp_kill: future_create"); - goto err; - } - flux_future_set_flux (f, exec->h); - - if (!(killcmd = bulk_exec_create (&imp_kill_ops, - exec->service, - exec->id, - "imp-kill", - f))) - return NULL; - - /* Tie bulk exec object destruction to future */ - flux_future_aux_set (f, NULL, killcmd, (flux_free_f) bulk_exec_destroy); - - p = zlist_first (exec->processes); - while (p) { - if ((!ranks || idset_test (ranks, flux_subprocess_rank (p))) - && (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING - || flux_subprocess_state (p) == FLUX_SUBPROCESS_INIT)) { - - pid_t pid = flux_subprocess_pid (p); - int rank = flux_subprocess_rank (p); - flux_cmd_t *cmd = flux_cmd_create (0, NULL, environ); - - if (!cmd - || flux_cmd_argv_append (cmd, imp_path) < 0 - || flux_cmd_argv_append (cmd, "kill") < 0 - || flux_cmd_argv_appendf (cmd, "%d", signum) < 0 - || flux_cmd_argv_appendf (cmd, "%ld", (long) pid)) { - flux_log_error (exec->h, - "bulk_exec_imp_kill: flux_cmd_argv_append"); - goto err; - } - - if (bulk_exec_push_one (killcmd, rank, cmd, 0) < 0) { - flux_log_error (exec->h, "bulk_exec_imp_kill: push_cmd"); - goto err; - } - - count++; - flux_cmd_destroy (cmd); - } - p = zlist_next (exec->processes); - } - - if (count == 0) { - errno = ENOENT; - goto err; - } - - bulk_exec_aux_set (killcmd, "future", f, NULL); - - if (bulk_exec_start (exec->h, killcmd) < 0) { - flux_log_error (exec->h, "bulk_exec_start"); - goto err; - } - - return f; -err: - flux_future_destroy (f); - return NULL; -} - -flux_future_t *bulk_exec_kill (struct bulk_exec *exec, - const struct idset *ranks, - int signum) -{ - if (!exec || signum < 0) { - errno = EINVAL; - return NULL; - } - if (exec->imp_path) - return bulk_exec_imp_kill (exec, exec->imp_path, ranks, signum); - return bulk_exec_simple_kill (exec, ranks, signum); -} - int bulk_exec_aux_set (struct bulk_exec *exec, const char *key, void *val, diff --git a/src/common/libsubprocess/bulk-exec.h b/src/common/libsubprocess/bulk-exec.h index a08eba15ef99..417f3c54a507 100644 --- a/src/common/libsubprocess/bulk-exec.h +++ b/src/common/libsubprocess/bulk-exec.h @@ -60,11 +60,6 @@ int bulk_exec_aux_set (struct bulk_exec *exec, */ int bulk_exec_set_max_per_loop (struct bulk_exec *exec, int max); -/* Set path to an IMP to use for bulk_exec_kill(). - */ -int bulk_exec_set_imp_path (struct bulk_exec *exec, - const char *imp_path); - void bulk_exec_destroy (struct bulk_exec *exec); int bulk_exec_push_cmd (struct bulk_exec *exec, diff --git a/src/common/libsubprocess/test/bulk-exec-einval.c b/src/common/libsubprocess/test/bulk-exec-einval.c index 831269b568c5..0667aea96527 100644 --- a/src/common/libsubprocess/test/bulk-exec-einval.c +++ b/src/common/libsubprocess/test/bulk-exec-einval.c @@ -27,16 +27,12 @@ int main (int argc, char *argv[]) "bulk_exec_aux_set (NULL, ..) returns EINVAL"); ok (bulk_exec_set_max_per_loop (NULL, 1) < 0 && errno == EINVAL, "bulk_exec_set_max_per_loop (NULL, 1) returns EINVAL"); - ok (bulk_exec_set_imp_path (NULL, NULL) < 0 && errno == EINVAL, - "bulk_exec_set_imp_path (NULL, NULL) returns EINVAL"); ok (bulk_exec_push_cmd (NULL, NULL, NULL, 0) < 0 && errno == EINVAL, "bulk_exec_push_cmd (NULL, ...) returns EINVAL"); ok (bulk_exec_start (NULL, NULL) < 0 && errno == EINVAL, "bulk_exec_start (NULL, NULL) returns EINVAL"); ok (bulk_exec_kill (NULL, NULL, 0) == NULL && errno == EINVAL, "bulk_exec_kill (NULL, NULL, 0) returns EINVAL"); - ok (bulk_exec_imp_kill (NULL, NULL, NULL, 0) == NULL && errno == EINVAL, - "bulk_exec_imp_kill (NULL, NULL, NULL, 0) returns EINVAL"); ok (bulk_exec_cancel (NULL) < 0 && errno == EINVAL, "bulk_exec_cancel (NULL) returns EINVAL"); ok (bulk_exec_rc (NULL) < 0 && errno == EINVAL,