From 35eb5e5a0aa7cc27e8bfe0bdf3607bfc6f24e9d3 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Oct 2024 15:06:54 +0000 Subject: [PATCH 01/11] imp: split signal handling out of exec implementation Problem: Signal forwarding is currently implemented directly in the IMP exec subcommand which makes it impossible to reuse the code in other IMP subcommands. Move signal handling to imp/signals.[ch] for better code reuse. Pass the imp state object into the signal handler for possible future use, otherwise behavior of the IMP exec command should be identical to before this change. --- src/imp/Makefile.am | 2 + src/imp/exec/exec.c | 77 +++++---------------------------------- src/imp/signals.c | 89 +++++++++++++++++++++++++++++++++++++++++++++ src/imp/signals.h | 29 +++++++++++++++ 4 files changed, 130 insertions(+), 67 deletions(-) create mode 100644 src/imp/signals.c create mode 100644 src/imp/signals.h diff --git a/src/imp/Makefile.am b/src/imp/Makefile.am index a9b5e1e2..f3919312 100644 --- a/src/imp/Makefile.am +++ b/src/imp/Makefile.am @@ -58,6 +58,8 @@ IMP_SOURCES = \ passwd.h \ pidinfo.c \ pidinfo.h \ + signals.c \ + signals.h \ kill.c \ run.c \ exec/user.h \ diff --git a/src/imp/exec/exec.c b/src/imp/exec/exec.c index acb33ab2..49419c19 100644 --- a/src/imp/exec/exec.c +++ b/src/imp/exec/exec.c @@ -46,6 +46,7 @@ #include "passwd.h" #include "user.h" #include "safe_popen.h" +#include "signals.h" #if HAVE_PAM #include "pam.h" @@ -67,8 +68,6 @@ struct imp_exec { int specsz; }; -static pid_t imp_child = (pid_t) -1; - extern const char *imp_get_security_config_pattern (void); extern int imp_get_security_flags (void); @@ -225,68 +224,10 @@ static void __attribute__((noreturn)) imp_exec (struct imp_exec *exec) imp_die (exit_code, "%s: %s", exec->shell, strerror (errno)); } -static void fwd_signal (int signal) -{ - if (imp_child > 0) - kill (imp_child, signal); -} - -/* Setup signal handlers in the IMP for common signals which - * we want to forward to any child process. - */ -static void setup_signal_forwarding (void) -{ - struct sigaction sa; - sigset_t mask; - int i; - int signals[] = { - SIGTERM, - SIGINT, - SIGHUP, - SIGCONT, - SIGALRM, - SIGWINCH, - SIGTTIN, - SIGTTOU, - }; - int nsignals = sizeof (signals) / sizeof (signals[0]); - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = fwd_signal; - sa.sa_flags = SA_RESTART; - sigemptyset(&sa.sa_mask); - - sigfillset (&mask); - for (i = 0; i < nsignals; i++) { - sigdelset (&mask, signals[i]); - if (sigaction(signals[i], &sa, NULL) < 0) - imp_warn ("sigaction (signal=%d): %s", - signals[i], - strerror (errno)); - } - if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) - imp_die (1, "failed to block signals: %s", strerror (errno)); -} - -static void sigblock_all (void) -{ - sigset_t mask; - sigfillset (&mask); - if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) - imp_die (1, "failed to block signals: %s", strerror (errno)); -} - -static void sigunblock_all (void) -{ - sigset_t mask; - sigemptyset (&mask); - if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) - imp_die (1, "failed to unblock signals: %s", strerror (errno)); -} - int imp_exec_privileged (struct imp_state *imp, struct kv *kv) { int status; + pid_t child; struct imp_exec *exec = imp_exec_create (imp); if (!exec) imp_die (1, "exec: failed to initialize state"); @@ -322,15 +263,17 @@ int imp_exec_privileged (struct imp_state *imp, struct kv *kv) } /* Block signals so parent IMP isn't unduly terminated */ - sigblock_all (); + imp_sigblock_all (); - if ((imp_child = fork ()) < 0) + if ((child = fork ()) < 0) imp_die (1, "exec: fork: %s", strerror (errno)); - if (imp_child == 0) { + imp_set_signal_child (child); + + if (child == 0) { /* unblock all signals */ - sigunblock_all (); + imp_sigunblock_all (); /* Irreversibly switch to user */ imp_switch_user (exec->user_pwd->pw_uid); @@ -342,10 +285,10 @@ int imp_exec_privileged (struct imp_state *imp, struct kv *kv) /* Ensure common signals received by this IMP are forwarded to * the child process */ - setup_signal_forwarding (); + imp_setup_signal_forwarding (imp); /* Parent: wait for child to exit */ - while (waitpid (imp_child, &status, 0) != imp_child) { + while (waitpid (child, &status, 0) != child) { if (errno != EINTR) imp_die (1, "waitpid: %s", strerror (errno)); } diff --git a/src/imp/signals.c b/src/imp/signals.c new file mode 100644 index 00000000..bc198ee2 --- /dev/null +++ b/src/imp/signals.c @@ -0,0 +1,89 @@ +/************************************************************\ + * Copyright 2024 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +#include "signals.h" +#include "imp_log.h" + +static const struct imp_state *imp_state = NULL; +static pid_t imp_child = (pid_t) -1; + +void imp_set_signal_child (pid_t child) +{ + imp_child = child; +} + +void imp_sigblock_all (void) +{ + sigset_t mask; + sigfillset (&mask); + if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) + imp_die (1, "failed to block signals: %s", strerror (errno)); +} + +void imp_sigunblock_all (void) +{ + sigset_t mask; + sigemptyset (&mask); + if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) + imp_die (1, "failed to unblock signals: %s", strerror (errno)); +} + +static void fwd_signal (int signum) +{ + if (imp_child > 0) + kill (imp_child, signum); +} + +void imp_setup_signal_forwarding (struct imp_state *imp) +{ + struct sigaction sa; + sigset_t mask; + int i; + int signals[] = { + SIGTERM, + SIGINT, + SIGHUP, + SIGCONT, + SIGALRM, + SIGWINCH, + SIGTTIN, + SIGTTOU, + }; + int nsignals = sizeof (signals) / sizeof (signals[0]); + + imp_state = imp; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = fwd_signal; + sa.sa_flags = SA_RESTART; + sigemptyset(&sa.sa_mask); + + sigfillset (&mask); + for (i = 0; i < nsignals; i++) { + sigdelset (&mask, signals[i]); + if (sigaction(signals[i], &sa, NULL) < 0) + imp_warn ("sigaction (signal=%d): %s", + signals[i], + strerror (errno)); + } + if (sigprocmask (SIG_SETMASK, &mask, NULL) < 0) + imp_die (1, "failed to block signals: %s", strerror (errno)); +} + +/* vi: ts=4 sw=4 expandtab + */ diff --git a/src/imp/signals.h b/src/imp/signals.h new file mode 100644 index 00000000..a97c76f9 --- /dev/null +++ b/src/imp/signals.h @@ -0,0 +1,29 @@ +/************************************************************\ + * Copyright 2024 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#ifndef HAVE_IMP_SIGNALS_H +#define HAVE_IMP_SIGNALS_H 1 + +#include +#include "imp_state.h" + +/* Set the target of IMP signal forwarding + */ +void imp_set_signal_child (pid_t child); + +/* Setup RFC 15 standard IMP signal forwarding + */ +void imp_setup_signal_forwarding (struct imp_state *imp); + +void imp_sigblock_all (void); + +void imp_sigunblock_all (void); + +#endif /* !HAVE_IMP_SIGNALS_H */ From 2dd3aebe23750ebfc561579b58d2e251b43b14c9 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Oct 2024 16:40:53 +0000 Subject: [PATCH 02/11] imp: discover current cgroup information at IMP startup Problem: To comply with RFC 15, the IMP needs to know attributes of its current cgroup, but this information is not currently available. Add a cgroup_info structure to the imp_state object, and populate it at IMP startup. This moves and modifies some internal code from pidinfo.c. In the future, this duplicate code in pidinfo.c will be removed. --- src/imp/Makefile.am | 2 + src/imp/cgroup.c | 193 ++++++++++++++++++++++++++++++++++++++++++++ src/imp/cgroup.h | 25 ++++++ src/imp/imp.c | 5 ++ src/imp/imp_state.h | 2 + 5 files changed, 227 insertions(+) create mode 100644 src/imp/cgroup.c create mode 100644 src/imp/cgroup.h diff --git a/src/imp/Makefile.am b/src/imp/Makefile.am index f3919312..31ac9761 100644 --- a/src/imp/Makefile.am +++ b/src/imp/Makefile.am @@ -60,6 +60,8 @@ IMP_SOURCES = \ pidinfo.h \ signals.c \ signals.h \ + cgroup.c \ + cgroup.h \ kill.c \ run.c \ exec/user.h \ diff --git a/src/imp/cgroup.c b/src/imp/cgroup.c new file mode 100644 index 00000000..40f98999 --- /dev/null +++ b/src/imp/cgroup.c @@ -0,0 +1,193 @@ +/************************************************************\ + * Copyright 2024 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_LINUX_MAGIC_H +#include +#endif +#ifndef TMPFS_MAGIC +#define TMPFS_MAGIC 0x01021994 /* from linux/magic.h */ +#endif +#ifndef CGROUP_SUPER_MAGIC +#define CGROUP_SUPER_MAGIC 0x27e0eb +#endif + +#include "src/libutil/strlcpy.h" + +#include "cgroup.h" +#include "imp_log.h" + +/* + * Look up the current cgroup relative path from /proc/self/cgroup. + * + * If cgroup->unified is true, then look for the first entry where + * 'subsys' is an empty string. + * + * Otherwise, use the `name=systemd` line. + * + * See NOTES: /proc/[pid]/cgroup in cgroups(7). + */ +static int cgroup_init_path (struct cgroup_info *cgroup) +{ + int rc = -1; + int n; + FILE *fp; + size_t size = 0; + char *line = NULL; + int saved_errno; + + if (!(fp = fopen ("/proc/self/cgroup", "r"))) + return -1; + + while ((n = getline (&line, &size, fp)) >= 0) { + char *nl; + char *relpath = NULL; + char *subsys = strchr (line, ':'); + if ((nl = strchr (line, '\n'))) + *nl = '\0'; + if (subsys == NULL + || *(++subsys) == '\0' + || !(relpath = strchr (subsys, ':'))) + continue; + + /* Nullify subsys, relpath is already nul-terminated at newline */ + *(relpath++) = '\0'; + + /* If unified cgroups are being used, then stop when we find + * subsys="". Otherwise stop at subsys="name=systemd": + */ + if ((cgroup->unified && subsys[0] == '\0') + || (!cgroup->unified && strcmp (subsys, "name=systemd") == 0)) { + int len = sizeof (cgroup->path); + if (snprintf (cgroup->path, + len, + "%s%s", + cgroup->mount_dir, + relpath) < len) + rc = 0; + break; + } + } + if (rc < 0) + errno = ENOENT; + + saved_errno = errno; + free (line); + fclose (fp); + errno = saved_errno; + return rc; +} + +/* Determine if this system is using the unified (v2) or legacy (v1) + * cgroups hierarchy (See https://systemd.io/CGROUP_DELEGATION/) + * and mount point for systemd managed cgroups. + */ +static int cgroup_init_mount_dir_and_type (struct cgroup_info *cg) +{ + struct statfs fs; + + /* Assume unified unless we discover otherwise + */ + cg->unified = true; + + /* Check if either /sys/fs/cgroup or /sys/fs/cgroup/unified + * are mounted as type cgroup2. If so, use this as the mount dir + * (Note: these paths are guaranteed to fit in cg->mount_dir, so + * no need to check for truncation) + */ + (void) strlcpy (cg->mount_dir, "/sys/fs/cgroup", sizeof (cg->mount_dir)); + if (statfs (cg->mount_dir, &fs) < 0) + return -1; + +#ifdef CGROUP2_SUPER_MAGIC + /* if cgroup2 fs mounted: unified hierarchy for all users of cgroupfs + */ + if (fs.f_type == CGROUP2_SUPER_MAGIC) + return 0; + + /* O/w, check if cgroup2 unified hierarchy mounted at + * /sys/fs/cgroup/unified + */ + (void) strlcpy (cg->mount_dir, + "/sys/fs/cgroup/unified", + sizeof (cg->mount_dir)); + if (statfs (cg->mount_dir, &fs) < 0) + return -1; + + if (fs.f_type == CGROUP2_SUPER_MAGIC) + return 0; + +#endif /* CGROUP2_SUPER_MAGIC */ + /* O/w, if /sys/fs/cgroup is mounted as tmpfs, we need to check + * for /sys/fs/cgroup/systemd mounted as cgroupfs (legacy). + */ + if (fs.f_type == TMPFS_MAGIC) { + + (void) strlcpy (cg->mount_dir, + "/sys/fs/cgroup/systemd", + sizeof (cg->mount_dir)); + if (statfs (cg->mount_dir, &fs) == 0 + && fs.f_type == CGROUP_SUPER_MAGIC) { + cg->unified = false; + return 0; + } + } + + /* Unable to determine cgroup mount point and/or unified vs legacy */ + return -1; +} + +void cgroup_info_destroy (struct cgroup_info *cg) +{ + if (cg) { + int saved_errno = errno; + free (cg); + errno = saved_errno; + } +} + +struct cgroup_info *cgroup_info_create (void) +{ + struct cgroup_info *cgroup = calloc (1, sizeof (*cgroup)); + if (!cgroup) + return NULL; + + if (cgroup_init_mount_dir_and_type (cgroup) < 0 + || cgroup_init_path (cgroup) < 0) { + cgroup_info_destroy (cgroup); + return NULL; + } + /* Note: GNU basename(3) never modifies its argument. (_GNU_SOURCE + * is defined in config.h.) + */ + if (strncmp (basename (cgroup->path), "imp-shell", 9) == 0) + cgroup->use_cgroup_kill = true; + + return cgroup; +} + +/* vi: ts=4 sw=4 expandtab + */ diff --git a/src/imp/cgroup.h b/src/imp/cgroup.h new file mode 100644 index 00000000..485afe8f --- /dev/null +++ b/src/imp/cgroup.h @@ -0,0 +1,25 @@ +/************************************************************\ + * Copyright 2024 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#ifndef HAVE_IMP_CGROUP_H +#define HAVE_IMP_CGROUP_H 1 + +struct cgroup_info { + char mount_dir[PATH_MAX + 1]; + char path[PATH_MAX + 1]; + bool unified; + bool use_cgroup_kill; +}; + +struct cgroup_info *cgroup_info_create (void); + +void cgroup_info_destroy (struct cgroup_info *cgroup); + +#endif /* !HAVE_IMP_CGROUP_H */ diff --git a/src/imp/imp.c b/src/imp/imp.c index 8463b1ae..23c9873c 100644 --- a/src/imp/imp.c +++ b/src/imp/imp.c @@ -59,6 +59,11 @@ int main (int argc, char *argv[]) if (!(imp.conf = imp_conf_load (imp_get_config_pattern ()))) imp_die (1, "Failed to load configuration"); + /* Get current IMP cgroup information: + */ + if (!(imp.cgroup = cgroup_info_create ())) + imp_die (1, "Failed to get current cgroup info"); + /* Audit subsystem initialization */ // Skip. diff --git a/src/imp/imp_state.h b/src/imp/imp_state.h index 9347968c..107028f0 100644 --- a/src/imp/imp_state.h +++ b/src/imp/imp_state.h @@ -13,12 +13,14 @@ #include "src/libutil/cf.h" #include "privsep.h" +#include "cgroup.h" struct imp_state { int argc; char **argv; /* cmdline arguments from main() */ cf_t *conf; /* IMP configuration */ privsep_t *ps; /* Privilege separation handle */ + struct cgroup_info *cgroup; }; #endif /* !HAVE_IMP_STATE_H */ From 7446e9bab7ffd15dceb877a1297f45ff966970a1 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Oct 2024 20:42:11 +0000 Subject: [PATCH 03/11] imp: support SIGUSR1 as surrogate for SIGKILL in imp exec Problem: RFC 15 specifies that the IMP should handle SIGUSR1 as a surrogate for SIGKILL, but this is not currently supported. Add cgroup_kill() which will send a signal to all member's of the IMP's cgroup (besides the IMP's own pid). Add SIGUSR1 to the set of signals handled by the signal forwarding implementation of the IMP. In the case of SIGUSR1, deliver SIGKILL to all members of the current cgroup if use_cgroup_kill is true, otherwise fallback to pid_kill_children(). --- src/imp/cgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ src/imp/cgroup.h | 5 +++++ src/imp/signals.c | 24 +++++++++++++++++++++++- 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/imp/cgroup.c b/src/imp/cgroup.c index 40f98999..7e1f5152 100644 --- a/src/imp/cgroup.c +++ b/src/imp/cgroup.c @@ -34,6 +34,7 @@ #ifndef CGROUP_SUPER_MAGIC #define CGROUP_SUPER_MAGIC 0x27e0eb #endif +#include #include "src/libutil/strlcpy.h" @@ -189,5 +190,43 @@ struct cgroup_info *cgroup_info_create (void) return cgroup; } +int cgroup_kill (struct cgroup_info *cgroup, int sig) +{ + int count = 0; + int rc = 0; + int saved_errno = 0; + char path [PATH_MAX+14]; /* cgroup->path[PATH_MAX] + "/cgroup.procs" */ + FILE *fp; + unsigned long child; + pid_t current_pid = getpid (); + + /* Note: path is guaranteed to have enough space to append "/cgroup.procs" + */ + (void) snprintf (path, sizeof (path), "%s/cgroup.procs", cgroup->path); + + if (!(fp = fopen (path, "r"))) + return -1; + while (fscanf (fp, "%lu", &child) == 1) { + pid_t pid = child; + if (pid == current_pid) + continue; + if (kill (pid, sig) < 0) { + saved_errno = errno; + rc = -1; + imp_warn ("Failed to send signal %d to pid %lu", + sig, + child); + continue; + } + count++; + } + fclose (fp); + if (rc < 0 && count == 0) { + count = -1; + errno = saved_errno; + } + return count; +} + /* vi: ts=4 sw=4 expandtab */ diff --git a/src/imp/cgroup.h b/src/imp/cgroup.h index 485afe8f..0c9bdc5d 100644 --- a/src/imp/cgroup.h +++ b/src/imp/cgroup.h @@ -22,4 +22,9 @@ struct cgroup_info *cgroup_info_create (void); void cgroup_info_destroy (struct cgroup_info *cgroup); +/* Send signal to all pids (excluding the current pid) in the + * current cgroup. + */ +int cgroup_kill (struct cgroup_info *cgroup, int sig); + #endif /* !HAVE_IMP_CGROUP_H */ diff --git a/src/imp/signals.c b/src/imp/signals.c index bc198ee2..516c878f 100644 --- a/src/imp/signals.c +++ b/src/imp/signals.c @@ -17,6 +17,7 @@ #include #include "signals.h" +#include "pidinfo.h" #include "imp_log.h" static const struct imp_state *imp_state = NULL; @@ -45,7 +46,27 @@ void imp_sigunblock_all (void) static void fwd_signal (int signum) { - if (imp_child > 0) + if (signum == SIGUSR1) { + int count = -1; + + /* RFC 15 specifies that SIGUSR1 is a surrogate for SIGKILL, and + * that the IMP SHALL deliver the signal to all processes in the + * job's container (using cgroup.procs if able). + */ + if (imp_state->cgroup->use_cgroup_kill) + count = cgroup_kill (imp_state->cgroup, SIGKILL); + + /* If cgroup wasn't used or fails, try with pid_kill_children + */ + if (count < 0) + count = pid_kill_children (getpid (), SIGKILL); + + /* O/w, log an error, not much more to do + */ + if (count < 0) + imp_warn ("Failed to forward SIGKILL: %s", strerror (errno)); + } + else if (imp_child > 0) kill (imp_child, signum); } @@ -63,6 +84,7 @@ void imp_setup_signal_forwarding (struct imp_state *imp) SIGWINCH, SIGTTIN, SIGTTOU, + SIGUSR1, }; int nsignals = sizeof (signals) / sizeof (signals[0]); From 08f3750b3419a09bc0ec6b8527364cea0b29d8b1 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Oct 2024 18:58:51 +0000 Subject: [PATCH 04/11] imp: exec: raise signal if child was terminated by one Problem: The lingering IMP exits with 127+signo if the invoked child was terminated by a signal during `flux-imp exec`, but ideally the IMP would exit with the exact wait status of the child. Add a convenience function, `imp_raise()`, which raises a signal after setting the handler to the default disposition for the signal in question. Use imp_raise() to deliver the same signal which terminated the child in `flux-imp exec` if the child is detected to have exited due to a signal. --- src/imp/exec/exec.c | 2 +- src/imp/signals.c | 12 ++++++++++++ src/imp/signals.h | 5 +++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/imp/exec/exec.c b/src/imp/exec/exec.c index 49419c19..005b38f0 100644 --- a/src/imp/exec/exec.c +++ b/src/imp/exec/exec.c @@ -303,7 +303,7 @@ int imp_exec_privileged (struct imp_state *imp, struct kv *kv) if (WIFEXITED (status)) exit (WEXITSTATUS (status)); else if (WIFSIGNALED (status)) - exit (WTERMSIG (status) + 128); + imp_raise (WTERMSIG (status)); else exit (1); diff --git a/src/imp/signals.c b/src/imp/signals.c index 516c878f..d43ad79e 100644 --- a/src/imp/signals.c +++ b/src/imp/signals.c @@ -12,6 +12,7 @@ #include #endif +#include #include #include #include @@ -107,5 +108,16 @@ void imp_setup_signal_forwarding (struct imp_state *imp) imp_die (1, "failed to block signals: %s", strerror (errno)); } +void imp_raise (int sig) +{ + signal (sig, SIG_DFL); + if (raise (sig) == 0) + pause (); + /* If we get here, either raise(3) failed or for some reason signal + * failed to kill the IMP during pause(). Exit with standard 128+sig. + */ + imp_die (128 + sig, "failed to raise signal %d", sig); +} + /* vi: ts=4 sw=4 expandtab */ diff --git a/src/imp/signals.h b/src/imp/signals.h index a97c76f9..0bf0208a 100644 --- a/src/imp/signals.h +++ b/src/imp/signals.h @@ -26,4 +26,9 @@ void imp_sigblock_all (void); void imp_sigunblock_all (void); +/* Set default signal disposition and then raise signal 'sig'. + * If raise fails for any reason, then exit with standard 128+sig. + */ +void imp_raise (int sig) __attribute__ ((noreturn));; + #endif /* !HAVE_IMP_SIGNALS_H */ From 87f2c7f8dcb22eff56a7d1a3aa33722e258c36c6 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 30 Oct 2024 02:34:13 +0000 Subject: [PATCH 05/11] docker: mount cgroups rw Problem: Some tests in the flux-security testsuite may need write access to cgroups, but docker-run-checks.sh explicitly mounts cgroups as read-only. Mount cgroups read-write for testing purposes. --- src/test/docker/docker-run-checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/docker/docker-run-checks.sh b/src/test/docker/docker-run-checks.sh index 5476678f..b917ad6f 100755 --- a/src/test/docker/docker-run-checks.sh +++ b/src/test/docker/docker-run-checks.sh @@ -177,7 +177,7 @@ else docker run --rm \ --workdir=$WORKDIR \ --volume=$TOP:$WORKDIR \ - -v /sys/fs/cgroup:/sys/fs/cgroup:ro \ + -v /sys/fs/cgroup:/sys/fs/cgroup:rw \ ${PLATFORM} \ $MOUNT_HOME_ARGS \ -e CC \ From 1b8fcd676bacc3d95ec16cb1221bd3103637a8d5 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Oct 2024 16:45:03 +0000 Subject: [PATCH 06/11] testsuite: simplify sudo/sign-none tests in t2000-imp-exec.t Problem: The IMP exec tests that use sign-none.toml and sudo are unnecessarily complicated because the IMP has to be run in a subshell, sometimes in the background. This not only duplicates a lot of code, but makes it impossible to get the exit code of the IMP when it is run in the background. Add a function to run `flux-imp exec` under sudo with the sign-none.toml config. While we're at it, create the `sleeper.sh` helper at the beginning of the tests, instead of as part of another test. Update callers to use `imp_exec_sign_none()` where possible. --- t/t2000-imp-exec.t | 72 ++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/t/t2000-imp-exec.t b/t/t2000-imp-exec.t index 3441cd9a..4b5637a7 100755 --- a/t/t2000-imp-exec.t +++ b/t/t2000-imp-exec.t @@ -18,6 +18,27 @@ echo "# Using ${flux_imp}" fake_imp_input() { printf '{"J":"%s"}' $(echo $1 | $sign) } +# Some shells do not support passing environment variable in the same +# line as the invocation. Therefore add a sign-none.toml version of +# the fake_imp_input() helper function: (sign-none.toml is defined later) +fake_input_sign_none() { + printf '{"J":"%s"}' \ + $(echo foo | env FLUX_IMP_CONFIG_PATTERN=sign-none.toml $sign) +} +wait_for_file() { + count=0 && + while ! test -f $1; do + sleep 0.1 + count=$((count+1)) + test $count -gt 20 && break + test_debug "echo retrying count=${count}" + done +} +imp_exec_sign_none() { + fake_input_sign_none | \ + $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ + $flux_imp exec $@ +} cat <helper.sh #!/bin/sh @@ -26,6 +47,13 @@ test -z "\$IMP_HELPER_FAIL" EOF chmod +x helper.sh +cat <sleeper.sh +#!/bin/sh +printf "\$PPID\n" >$(pwd)/sleeper.pid +exec /bin/sleep "\$@" +EOF +chmod +x sleeper.sh + test_expect_success 'create config allowing current user to exec imp' ' cat <<-EOF >imp-test.toml allow-sudo = true @@ -192,61 +220,35 @@ test_expect_success 'flux-imp exec: shell must be in allowed users' ' grep -i "not in allowed-users list" nousers.log ' test_expect_success SUDO 'flux-imp exec: user must be in allowed-shells' ' - ( export FLUX_IMP_CONFIG_PATTERN=sign-none.toml && - fake_imp_input foo | \ - test_must_fail $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ - $flux_imp exec printf good >badshell.log 2>&1 - ) && + test_must_fail imp_exec_sign_none printf good >badshell.log 2>&1 && + test_debug "cat badshell.log" && grep -i "not in allowed-shells" badshell.log ' test_expect_success SUDO 'flux-imp exec works under sudo' ' - ( export FLUX_IMP_CONFIG_PATTERN=sign-none.toml && - fake_imp_input foo | \ - $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ - $flux_imp exec id -u >id-sudo.out - ) && + imp_exec_sign_none id -u >id-sudo.out && test_debug "echo expecting uid=$(id -u), got $(cat id-sudo.out)" && id -u > id-sudo.expected && test_cmp id-sudo.expected id-sudo.out ' test_expect_success SUDO 'flux-imp exec passes more than one argument to shell' ' - ( export FLUX_IMP_CONFIG_PATTERN=sign-none.toml && - fake_imp_input foo | \ - $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ - $flux_imp exec id --zero --user >id-sudo2.out - ) && + imp_exec_sign_none id --zero --user >id-sudo2.out && test_debug "echo expecting uid=$(id -u), got $(cat -v id-sudo2.out)" && id --zero --user > id-sudo2.expected && test_cmp id-sudo2.expected id-sudo2.out ' test "$chain_lint" = "t" || test_set_prereq NO_CHAIN_LINT test_expect_success SUDO,NO_CHAIN_LINT 'flux-imp exec: setuid IMP lingers' ' - cat <<-EOF >sleeper.sh && - #!/bin/sh - printf "\$PPID\n" >$(pwd)/sleeper.pid - exec /bin/sleep "\$@" - EOF - chmod +x sleeper.sh && - ( export FLUX_IMP_CONFIG_PATTERN=sign-none.toml && \ - exec <&- 1>&- 2>&- && \ - fake_imp_input foo | \ - $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ - $flux_imp exec $(pwd)/sleeper.sh 15 >linger.out 2>&1 & \ - ) && - count=0 && - while ! test -f sleeper.pid; do - sleep 0.1 - count=$((count+1)) - test $count -gt 20 && break - test_debug "echo retrying count=${count}" - done && + imp_exec_sign_none $(pwd)/sleeper.sh 15 >linger.out 2>&1 & + imp_pid=$! && + test_when_finished "rm -f sleeper.pid" && + wait_for_file sleeper.pid && test_debug "cat sleeper.pid" test -f sleeper.pid && pid=$(cat sleeper.pid) && test_debug "pstree -lup $pid" && test $(ps --no-header -o comm -p ${pid}) = "flux-imp" && kill -TERM $pid && - wait + test_expect_code 143 wait $imp_pid ' $flux_imp version | grep -q pam || test_set_prereq NO_PAM test_expect_success NO_PAM,SUDO 'flux-imp exec: fails if not built with PAM but pam-support=true' ' From 5bb34b8b2cecf22abb6c9422fca0a8d414ecf1d0 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Oct 2024 21:28:10 +0000 Subject: [PATCH 07/11] testsuite: test IMP SIGUSR1 handling Problem: There are no tests the ensure sending SIGUSR1 to the IMP causes it to send SIGKILL to its children. Add a couple tests that ensure SIGKILL is sent when the IMP is running both in and out of an "imp-shell" cgroup. --- t/t2000-imp-exec.t | 81 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/t/t2000-imp-exec.t b/t/t2000-imp-exec.t index 4b5637a7..a7317028 100755 --- a/t/t2000-imp-exec.t +++ b/t/t2000-imp-exec.t @@ -250,6 +250,87 @@ test_expect_success SUDO,NO_CHAIN_LINT 'flux-imp exec: setuid IMP lingers' ' kill -TERM $pid && test_expect_code 143 wait $imp_pid ' +test_expect_success SUDO,NO_CHAIN_LINT 'flux-imp exec: SIGUSR1 sends SIGKILL' ' + imp_exec_sign_none $(pwd)/sleeper.sh 15 >sigkill.out 2>&1 & + imp_pid=$! && + test_when_finished "rm -f sleeper.pid" && + wait_for_file sleeper.pid && + test -f sleeper.pid && + pid=$(cat sleeper.pid) && + test_debug "echo pid=$pid; pstree -Tplu $imp_pid" && + kill -USR1 $pid && + test_expect_code 137 wait $imp_pid +' +# Notes about cgroup kill testing below: +# - systemd-run is used as a convenient way to execute the IMP inside a +# cgroup named with imp-shell prefix defined in RFC 15. +# - systemd-run is run with --collect so that a failure shouldn't leave +# a stale transient unit on the system +# - systemd-run is run under sudo instead of `systemd-run sudo flux-imp` +# because the IMP will send SIGKILL to all processes except itself in the +# cgroup.procs file. If run directly under sudo, then sudo is a parent of +# the IMP in the cgroup and gets SIGKILLed. +# +command -v systemd-run >/dev/null 2>&1 \ + && test_have_prereq SUDO \ + && $SUDO systemd-run --collect /bin/true >/dev/null 2>&1 \ + && test_set_prereq SYSTEMD_RUN + +test_expect_success SUDO,NO_CHAIN_LINT,SYSTEMD_RUN \ + 'flux-imp exec: SIGURS1 sends SIGKILL using cgroup kill' ' + FLUX_IMP_CONFIG_PATTERN=$(pwd)/sign-none.toml \ + fake_imp_input foo | \ + $SUDO systemd-run --collect --scope \ + --unit=imp-shell-$$ \ + -E FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ + $flux_imp exec $(pwd)/sleeper.sh 5 & + pid=$! && + test_when_finished "rm -f sleeper.pid" && + wait_for_file sleeper.pid && + kill -USR1 $(cat sleeper.pid) && + test_expect_code 137 wait $pid +' + +CGROUP_PATH="$(awk '/^cgroup2/ {print $2}' /proc/self/mounts)/imp-shell.$$" +test_have_prereq SUDO && + $SUDO mkdir $CGROUP_PATH && + test_set_prereq CGROUPFS && + cleanup "$SUDO rmdir $CGROUP_PATH" + +cat <<'EOF' >run-in-cgroup.sh +#!/bin/sh +path=$1 +shift +test -d $path || mkdir -p $path +echo "moving pid $$ to $path" >&2 +echo $$ >${path}/cgroup.procs +echo "running $@ as $(id -u)" >&2 +exec "$@" +EOF +chmod +x run-in-cgroup.sh + +# Rewrite sleeper script so it results in a hierarchy of processes +cat <sleeper.sh +#!/bin/sh +printf "\$PPID\n" >$(pwd)/sleeper.pid +(/bin/sleep "\$@") +EOF +chmod +x sleeper.sh + +test_expect_success SUDO,CGROUPFS,NO_CHAIN_LINT \ + 'flux-imp exec: SIGUSR1 sends SIGKILL via cgroup kill' ' + fake_input_sign_none | \ + $SUDO FLUX_IMP_CONFIG_PATTERN=sign-none.toml \ + ./run-in-cgroup.sh "$CGROUP_PATH" \ + $flux_imp exec $(pwd)/sleeper.sh 15 & + imp_pid=$! && + test_when_finished "rm -f sleeper.pid" && + wait_for_file sleeper.pid && + kill -USR1 $(cat sleeper.pid) && + test_expect_code 137 wait $imp_pid && + test_must_be_empty ${CGROUP_PATH}/cgroup.procs +' + $flux_imp version | grep -q pam || test_set_prereq NO_PAM test_expect_success NO_PAM,SUDO 'flux-imp exec: fails if not built with PAM but pam-support=true' ' ( export FLUX_IMP_CONFIG_PATTERN=pam-test.toml && From 7e1093ebdd19cd0a67038ed2f4921135adf0d853 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 30 Oct 2024 02:17:51 +0000 Subject: [PATCH 08/11] testsuite: remove calls to pstree(1) Problem: pstree(1) is not available on all test platforms. Remove its use in debugging tests. --- t/t2000-imp-exec.t | 2 -- 1 file changed, 2 deletions(-) diff --git a/t/t2000-imp-exec.t b/t/t2000-imp-exec.t index a7317028..98a54355 100755 --- a/t/t2000-imp-exec.t +++ b/t/t2000-imp-exec.t @@ -245,7 +245,6 @@ test_expect_success SUDO,NO_CHAIN_LINT 'flux-imp exec: setuid IMP lingers' ' test_debug "cat sleeper.pid" test -f sleeper.pid && pid=$(cat sleeper.pid) && - test_debug "pstree -lup $pid" && test $(ps --no-header -o comm -p ${pid}) = "flux-imp" && kill -TERM $pid && test_expect_code 143 wait $imp_pid @@ -257,7 +256,6 @@ test_expect_success SUDO,NO_CHAIN_LINT 'flux-imp exec: SIGUSR1 sends SIGKILL' ' wait_for_file sleeper.pid && test -f sleeper.pid && pid=$(cat sleeper.pid) && - test_debug "echo pid=$pid; pstree -Tplu $imp_pid" && kill -USR1 $pid && test_expect_code 137 wait $imp_pid ' From 21aebccda2e5e36ed47877643cef9db26404a969 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 31 Oct 2024 18:21:02 +0000 Subject: [PATCH 09/11] doc: update readthedocs requirements Problem: The readthedocs build fails with sphinx.errors.VersionRequirementError: 5.0 Update doc/requirements.txt to match those in flux-core, including an updated sphinx version requirement. --- doc/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index a78a8086..4bd13474 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,5 +1,5 @@ -sphinx==3.4.3 +sphinx<6.0.0 sphinx-rtd-theme>=0.5.2 docutils>=0.14,<0.18 -Jinja2<3.1 urllib3<2 +jinja2<3.10 From 1b38809bbcd42344e6737e568f87ecadd1f26eb9 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 31 Oct 2024 23:57:10 +0000 Subject: [PATCH 10/11] imp: cgroup: copy definition of CGROUP2_SUPER_MAGIC Problem: Some distros don't seem to have CGROUP2_SUPER_MAGIC defined in linux/magic.h. Copy the magic value from a system that has it. Remove the ifndef CGROUP2_SUPER_MAGIC block since it will now always be defined. --- src/imp/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/imp/cgroup.c b/src/imp/cgroup.c index 7e1f5152..a86c2cba 100644 --- a/src/imp/cgroup.c +++ b/src/imp/cgroup.c @@ -34,6 +34,9 @@ #ifndef CGROUP_SUPER_MAGIC #define CGROUP_SUPER_MAGIC 0x27e0eb #endif +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 +#endif #include #include "src/libutil/strlcpy.h" @@ -123,7 +126,6 @@ static int cgroup_init_mount_dir_and_type (struct cgroup_info *cg) if (statfs (cg->mount_dir, &fs) < 0) return -1; -#ifdef CGROUP2_SUPER_MAGIC /* if cgroup2 fs mounted: unified hierarchy for all users of cgroupfs */ if (fs.f_type == CGROUP2_SUPER_MAGIC) @@ -141,7 +143,6 @@ static int cgroup_init_mount_dir_and_type (struct cgroup_info *cg) if (fs.f_type == CGROUP2_SUPER_MAGIC) return 0; -#endif /* CGROUP2_SUPER_MAGIC */ /* O/w, if /sys/fs/cgroup is mounted as tmpfs, we need to check * for /sys/fs/cgroup/systemd mounted as cgroupfs (legacy). */ From 35ec95d84f50f0f6b1326ee3a4fd720f6cfd1744 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 31 Oct 2024 23:59:45 +0000 Subject: [PATCH 11/11] imp: cgroup: fix relative cgroup paths in containers Problem: Relative paths for cgroups in /proc/self/cgroup may contain leading `/..` when the IMP is run inside some containers. This is because the path is relative to the container's mount point, not the actual mount point of the cgroupfs. Just strip the leading /.. from paths during discovery of the current cgroup path from `/proc/self/cgroup`. This should only apply in containers and thus CI. --- src/imp/cgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/imp/cgroup.c b/src/imp/cgroup.c index a86c2cba..8591164c 100644 --- a/src/imp/cgroup.c +++ b/src/imp/cgroup.c @@ -44,6 +44,13 @@ #include "cgroup.h" #include "imp_log.h" +static char *remove_leading_dotdot (char *relpath) +{ + while (strncmp (relpath, "/..", 3) == 0) + relpath += 3; + return relpath; +} + /* * Look up the current cgroup relative path from /proc/self/cgroup. * @@ -80,6 +87,11 @@ static int cgroup_init_path (struct cgroup_info *cgroup) /* Nullify subsys, relpath is already nul-terminated at newline */ *(relpath++) = '\0'; + /* Remove leading /.. in relpath. This could be due to cgroup + * mounted in a container. + */ + relpath = remove_leading_dotdot (relpath); + /* If unified cgroups are being used, then stop when we find * subsys="". Otherwise stop at subsys="name=systemd": */