From 92021490aebae22a71d816e10a65ecafe66a354f Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 21 Aug 2024 08:17:51 +0200 Subject: [PATCH 1/2] apt: Remove mounts argument from invoke() --- mkosi/distributions/debian.py | 28 ++++++++++++++-------------- mkosi/installer/apt.py | 3 +-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/mkosi/distributions/debian.py b/mkosi/distributions/debian.py index 25eb9240a..a2a3ddb2e 100644 --- a/mkosi/distributions/debian.py +++ b/mkosi/distributions/debian.py @@ -136,21 +136,21 @@ def install(cls, context: Context) -> None: # By configuring Debug::pkgDpkgPm=1, apt-get install will not actually execute any dpkg commands, so # all it does is download the essential debs and tell us their full in the apt cache without actually # installing them. - with tempfile.NamedTemporaryFile(mode="r") as f: - Apt.invoke( - context, - "install", - [ - "-oDebug::pkgDPkgPm=1", - f"-oDPkg::Pre-Install-Pkgs::=cat >{f.name}", - "?essential", - "?exact-name(usr-is-merged)", - "base-files", - ], - mounts=[Mount(f.name, f.name)], - ) + Apt.invoke( + context, + "install", + [ + "-oDebug::pkgDPkgPm=1", + # context.pkgmngr is always mounted writable to /etc so let's use that as a channel to get the list of + # essential packages out of the sandbox. + "-oDPkg::Pre-Install-Pkgs::=cat >/etc/apt/essential", + "?essential", + "?exact-name(usr-is-merged)", + "base-files", + ], + ) - essential = f.read().strip().splitlines() + essential = (context.pkgmngr / "etc/apt/essential").read_text().strip().splitlines() # Now, extract the debs to the chroot by first extracting the sources tar file out of the deb and # then extracting the tar file into the chroot. 
diff --git a/mkosi/installer/apt.py b/mkosi/installer/apt.py index 59effcb32..7bfdae7fc 100644 --- a/mkosi/installer/apt.py +++ b/mkosi/installer/apt.py @@ -208,7 +208,6 @@ def invoke( arguments: Sequence[str] = (), *, apivfs: bool = False, - mounts: Sequence[Mount] = (), stdout: _FILE = None, ) -> CompletedProcess: return run( @@ -218,7 +217,7 @@ def invoke( binary="apt-get", network=True, vartmp=True, - mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context), *mounts], + mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context)], extra=apivfs_cmd() if apivfs else [] ) ), From b3a3e7e7fcb2a4e8f364d68745b616acc2a9801f Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Fri, 16 Aug 2024 23:41:49 +0200 Subject: [PATCH 2/2] Introduce mkosi-sandbox and stop using subuids for image builds Over the last years, we've accumulated a rather nasty set of workarounds for various issues in bubblewrap: - We contributed setpgid to util-linux and use it if available because bubblewrap does not support making its child process the foreground process. - We added the innerpid logic to run() because bubblewrap does not forward signals to the separate child process it runs in the sandbox which meant they were getting SIGKILLed when we killed bubblewrap, preventing proper cleanup from happening. - bubblewrap does not provide a proper way to detect whether the command was found in the sandbox or not, which meant we had to execute command -v within the sandbox separately to check whether the command exists or not. - We had to add extra logic to make sure / was a mount in the initramfs to allow running mkosi in the initramfs as bubblewrap does not fall back to MS_MOVE if pivot_root() doesn't work. - We had to stitch together shell invocations after bubblewrap but before executing the actual command we want to run to make sure directories had the correct mode as bubblewrap creates everything with mode 0700 which was too restrictive in many cases for us. 
This was fixed with new --perms and --chmod options in bubblewrap 0.5 but we had to keep compat with 0.4 because that's what's shipped in CentOS Stream 9. - We had to figure out a shell hack to do overlayfs mounts as these are not supported by bubblewrap (even though a PR for the feature has been open for years). - We had to introduce a Mount struct to pass around mounts so we could deduplicate and sort them before passing them to bubblewrap as bubblewrap did not do this itself. - Debugging all the above was made all the harder by the fact that bubblewrap's source code is full of tech debt from its history of being a setuid tool instead of using user namespaces. Getting any fixes into upstream is almost impossible as the tool is practically unmaintained. Aside from bubblewrap, our other source of troubles has been newuidmap/newgidmap. Running as a user within the subuid range configured in /etc/sub{u,g}id has meant we're constantly fixing ownership and permissions issues where stuff needs to be chowned and chmodded everywhere to make sure the current user and the subuid user can access the proper files. Another unfortunate side effect is that users end up with many files owned by the subuid root user in their home directories when building images with mkosi. Let's fix all these issues at once by getting rid of bubblewrap and newuidmap/newgidmap. bubblewrap is replaced with a new tool mkosi-sandbox. It looks and behaves a lot like bubblewrap, except it's much less code and much more flexible to fit our needs, allowing us to get rid of all the hacks we've built up over the years to work around issues that didn't get fixed in bubblewrap. To get rid of newuidmap/newgidmap, a rework of our user namespacing was needed. The need to use newuidmap/newgidmap came from the assumption that we need a full 65k subuid range to do unprivileged image builds, as distributions ship packages containing files and directories that are not owned by the root user. 
After some investigation, it turns out that there are very few files and directories not owned by root in distribution packages if you ignore /var. If we could temporarily ignore the ownership on these files and directories until we can get distributions to only ship root-owned files in /usr and /etc of their packages, we could simply map the current user to root in a user namespace and get rid of the subuid range completely. Turns out that's possible with a seccomp filter. seccomp allows you to make all chown() syscalls succeed without actually doing anything. The files and directories end up owned by the root user instead. If we assume this is OK and are OK with instructing users to use tmpfiles to fix up the permissions on first boot if needed, a seccomp filter like this is sufficient to allow us to get rid of doing image builds within a subuid user namespace. It turns out we can go one step further. It turns out that for the majority of the image build, one doesn't actually need to be the root user. Only package managers and systemd-repart need the current user to be mapped to root to do their job correctly. The reason we did the entire build mapped to root until now was that we need to do a few mounts as part of the image build process and until now I was under the assumption that you needed to be root for that. It turns out that when you unshare a user namespace, you get a full set of capabilities regardless of whether you're root or some other uid in the user namespace. The only difference is that when you exec a subprocess as root, the capabilities aren't lost, whereas they are when you exec a subprocess as a non-root user. This can be avoided by adding the capabilities of the non-root user to the inheritable and ambient sets. Once that's done, any subprocess exec'd by a non-root user in the user namespace can mount as many bind and overlay mounts as they can think of. 
The above allows us to run most of the image build under the current user uid instead of root, only switching to root when running package managers, invoking systemd-repart or systemd-tmpfiles, or when chroot-ing into the image. This allows us to get rid of various hacks we had to look up the proper user name or home directory. Specifically, we can get rid of the following: - mkosi-as-caller can become a noop since we now by default run the build as the caller. - Lots of chmod()'s and chown()'s can be removed - All uses of INVOKING_USER.uid/gid can be removed, and most can be replaced with simple os.getuid()/os.getgid() - We can use /etc/passwd and /etc/group from the host instead of building our own - We can get rid of the Acl= option as the user will now be able to remove (almost) all files written by mkosi. - We don't have to rchown the package manager cache directory anymore after each build. Root user builds will now use the system cache instead of the per user cache. - We can get rid of the Mount struct as mkosi-sandbox dedups and sorts operations itself. One thing to note is that if we're invoked as root, none of the seccomp or capabilities stuff applies and it is all skipped as it's not required in that case. This means that when building as root it's still possible to have more than one user in the generated image unlike when building unprivileged. Also note that users can still be added to /etc/passwd and such, they just can't own any files or directories in the image itself until the image is booted. 
--- .gitignore | 1 + action.yaml | 1 - kernel-install/50-mkosi.install | 4 +- mkosi/__init__.py | 895 ++++++++++------------- mkosi/__main__.py | 3 - mkosi/archive.py | 17 +- mkosi/config.py | 45 +- mkosi/context.py | 32 +- mkosi/distributions/debian.py | 6 +- mkosi/distributions/opensuse.py | 9 +- mkosi/initrd/__main__.py | 4 +- mkosi/installer/__init__.py | 54 +- mkosi/installer/apt.py | 30 +- mkosi/installer/dnf.py | 19 +- mkosi/installer/pacman.py | 25 +- mkosi/installer/zypper.py | 19 +- mkosi/kmod.py | 24 +- mkosi/manifest.py | 7 +- mkosi/mounts.py | 123 +--- mkosi/partition.py | 5 +- mkosi/qemu.py | 210 +++--- mkosi/resources/mkosi.md | 55 +- mkosi/run.py | 374 +++++++--- mkosi/sandbox.py | 312 -------- mkosi/sandbox/__init__.py | 794 ++++++++++++++++++++ mkosi/sandbox/__main__.py | 6 + mkosi/sandbox/resources/mkosi-sandbox.md | 144 ++++ mkosi/tree.py | 37 +- mkosi/user.py | 118 +-- mkosi/util.py | 9 - mkosi/vmspawn.py | 7 +- pyproject.toml | 4 + tests/__init__.py | 18 +- tests/test_initrd.py | 25 +- tests/test_json.py | 2 - tools/do-a-release.sh | 4 +- tools/make-man-page.sh | 1 + 37 files changed, 1991 insertions(+), 1452 deletions(-) delete mode 100644 mkosi/sandbox.py create mode 100644 mkosi/sandbox/__init__.py create mode 100644 mkosi/sandbox/__main__.py create mode 100644 mkosi/sandbox/resources/mkosi-sandbox.md diff --git a/.gitignore b/.gitignore index e8b7eab25..4d1b379d3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ .cache .mkosi.1 .mkosi-initrd.1 +.mkosi-sandbox.1 .mypy_cache/ .project .pydevproject diff --git a/action.yaml b/action.yaml index fbcb92662..94dde3544 100644 --- a/action.yaml +++ b/action.yaml @@ -66,7 +66,6 @@ runs: sudo apt-get update sudo apt-get install --assume-yes --no-install-recommends \ archlinux-keyring \ - bubblewrap \ debian-archive-keyring \ dnf \ makepkg \ diff --git a/kernel-install/50-mkosi.install b/kernel-install/50-mkosi.install index 4d1360e58..d59f64f6f 100644 --- a/kernel-install/50-mkosi.install +++ 
b/kernel-install/50-mkosi.install @@ -12,11 +12,11 @@ from typing import Optional from mkosi import identify_cpu from mkosi.archive import make_cpio -from mkosi.config import OutputFormat, __version__ +from mkosi.config import OutputFormat from mkosi.log import die, log_setup from mkosi.run import run, uncaught_exception_handler +from mkosi.sandbox import __version__, umask from mkosi.types import PathString -from mkosi.util import umask @dataclasses.dataclass(frozen=True) diff --git a/mkosi/__init__.py b/mkosi/__init__.py index f9b37dd64..78389b6d3 100644 --- a/mkosi/__init__.py +++ b/mkosi/__init__.py @@ -3,6 +3,7 @@ import contextlib import dataclasses import datetime +import functools import hashlib import io import itertools @@ -47,7 +48,6 @@ ShimBootloader, Verb, Vmm, - __version__, format_bytes, parse_config, summary, @@ -66,14 +66,32 @@ from mkosi.partition import Partition, finalize_root, finalize_roothash from mkosi.qemu import KernelType, copy_ephemeral, run_qemu, run_ssh, start_journal_remote from mkosi.run import ( + chroot_cmd, + chroot_script_cmd, + finalize_passwd_mounts, find_binary, fork_and_wait, run, ) -from mkosi.sandbox import Mount, chroot_cmd, finalize_passwd_mounts +from mkosi.sandbox import ( + CLONE_NEWNS, + MOUNT_ATTR_NODEV, + MOUNT_ATTR_NOEXEC, + MOUNT_ATTR_NOSUID, + MOUNT_ATTR_RDONLY, + MS_REC, + MS_SLAVE, + __version__, + acquire_privileges, + mount, + mount_rbind, + umask, + unshare, + userns_has_single_user, +) from mkosi.tree import copy_tree, move_tree, rmtree from mkosi.types import PathString -from mkosi.user import CLONE_NEWNS, INVOKING_USER, become_root, unshare +from mkosi.user import INVOKING_USER from mkosi.util import ( flatten, flock, @@ -85,7 +103,6 @@ read_env_file, round_up, scopedenv, - umask, ) from mkosi.versioncomp import GenericVersion from mkosi.vmspawn import run_vmspawn @@ -118,7 +135,7 @@ def mount_base_trees(context: Context) -> Iterator[None]: else: die(f"Unsupported base tree source {path}") - 
stack.enter_context(mount_overlay(bases, context.root, context.root)) + stack.enter_context(mount_overlay(bases, context.root, upperdir=context.root)) yield @@ -131,7 +148,7 @@ def remove_files(context: Context) -> None: with complete_step("Removing files…"): remove = flatten(context.root.glob(pattern.lstrip("/")) for pattern in context.config.remove_files) - rmtree(*remove, sandbox=context.sandbox) + rmtree(*remove, context.root / "work", sandbox=context.sandbox) def install_distribution(context: Context) -> None: @@ -356,7 +373,7 @@ def mount_build_overlay(context: Context, volatile: bool = False) -> Iterator[Pa else: upper = d - stack.enter_context(mount_overlay(lower, upper, context.root)) + stack.enter_context(mount_overlay(lower, context.root, upperdir=upper)) yield context.root @@ -364,9 +381,6 @@ def mount_build_overlay(context: Context, volatile: bool = False) -> Iterator[Pa @contextlib.contextmanager def finalize_scripts(config: Config, scripts: Mapping[str, Sequence[PathString]]) -> Iterator[Path]: with tempfile.TemporaryDirectory(prefix="mkosi-scripts-") as d: - # Make sure than when mkosi-as-caller is used the scripts can still be accessed. - os.chmod(d, 0o755) - for name, script in scripts.items(): # Make sure we don't end up in a recursive loop when we name a script after the binary it execs # by removing the scripts directory from the PATH when we execute a script. @@ -387,7 +401,6 @@ def finalize_scripts(config: Config, scripts: Mapping[str, Sequence[PathString]] f.write(f'exec {shlex.join(str(s) for s in script)} "$@"\n') make_executable(Path(d) / name) - os.chmod(Path(d) / name, 0o755) os.utime(Path(d) / name, (0, 0)) yield Path(d) @@ -401,12 +414,8 @@ def finalize_scripts(config: Config, scripts: Mapping[str, Sequence[PathString]] def mkosi_as_caller() -> tuple[str, ...]: - return ( - "setpriv", - f"--reuid={INVOKING_USER.uid}", - f"--regid={INVOKING_USER.gid}", - "--clear-groups", - ) + # Kept for backwards compatibility. 
+ return ("env",) def finalize_host_scripts( @@ -447,8 +456,8 @@ def run_configure_scripts(config: Config) -> Config: QEMU_ARCHITECTURE=config.architecture.to_qemu(), DISTRIBUTION_ARCHITECTURE=config.distribution.architecture(config.architecture), SRCDIR="/work/src", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), ) if config.profile: @@ -463,8 +472,12 @@ def run_configure_scripts(config: Config) -> Config: sandbox=config.sandbox( binary=None, vartmp=True, - mounts=[*sources, Mount(script, "/work/configure", ro=True)], - options=["--dir", "/work/src", "--chdir", "/work/src"] + options=[ + "--dir", "/work/src", + "--chdir", "/work/src", + "--ro-bind", script, "/work/configure", + *sources, + ], ), input=config.to_json(indent=None), stdout=subprocess.PIPE, @@ -485,8 +498,8 @@ def run_sync_scripts(context: Context) -> None: ARCHITECTURE=str(context.config.architecture), DISTRIBUTION_ARCHITECTURE=context.config.distribution.architecture(context.config.architecture), SRCDIR="/work/src", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", CACHED=one_zero(have_cache(context.config)), ) @@ -503,21 +516,23 @@ def run_sync_scripts(context: Context) -> None: finalize_config_json(context.config) as json, ): for script in context.config.sync_scripts: - mounts = [ - *sources, + options = [ *finalize_crypto_mounts(context.config), - Mount(script, "/work/sync", ro=True), - Mount(json, "/work/config.json", ro=True), + "--ro-bind", script, "/work/sync", + "--ro-bind", json, "/work/config.json", + "--dir", "/work/src", + "--chdir", "/work/src", + *sources, ] if (p := INVOKING_USER.home()).exists() and p != Path("/"): # We use a writable mount here to keep git worktrees working which encode absolute paths to the parent # git repository and might need to modify the git config in the parent 
git repository when submodules # are in use as well. - mounts += [Mount(p, p)] + options += ["--bind", p, p] env["HOME"] = os.fspath(p) - if (p := Path(f"/run/user/{INVOKING_USER.uid}")).exists(): - mounts += [Mount(p, p, ro=True)] + if (p := Path(f"/run/user/{os.getuid()}")).exists(): + options += ["--ro-bind", p, p] with complete_step(f"Running sync script {script}…"): run( @@ -528,12 +543,50 @@ def run_sync_scripts(context: Context) -> None: binary=None, network=True, vartmp=True, - mounts=mounts, - options=["--dir", "/work/src", "--chdir", "/work/src"] + options=options, ), ) +@contextlib.contextmanager +def script_maybe_chroot_sandbox( + context: Context, + *, + script: Path, + options: Sequence[PathString], + network: bool, +) -> Iterator[list[PathString]]: + options = ["--dir", "/work/src", "--chdir", "/work/src", *options] + + helpers = { + "mkosi-chroot": chroot_script_cmd(tools=bool(context.config.tools_tree), network=network, work=True), + "mkosi-as-caller": mkosi_as_caller(), + **context.config.distribution.package_manager(context.config).scripts(context), + } + + with finalize_host_scripts(context, helpers) as hd: + if script.suffix != ".chroot": + with context.sandbox( + binary=None, + network=network, + vartmp=True, + options=[ + *options, + "--bind", context.root, "/buildroot", + *context.config.distribution.package_manager(context.config).mounts(context), + ], + scripts=hd, + ) as sandbox: + yield sandbox + else: + with chroot_cmd( + root=context.root, + network=network, + options=options, + ) as sandbox: + yield sandbox + + def run_prepare_scripts(context: Context, build: bool) -> None: if not context.config.prepare_scripts: return @@ -552,8 +605,8 @@ def run_prepare_scripts(context: Context, build: bool) -> None: ARTIFACTDIR="/work/artifacts", SCRIPT="/work/prepare", CHROOT_SCRIPT="/work/prepare", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), 
MKOSI_CONFIG="/work/config.json", WITH_DOCS=one_zero(context.config.with_docs), WITH_NETWORK=one_zero(context.config.with_network), @@ -567,9 +620,12 @@ def run_prepare_scripts(context: Context, build: bool) -> None: if context.config.build_dir is not None: env |= dict(BUILDDIR="/work/build") + env |= context.config.environment + with ( mount_build_overlay(context) if build else contextlib.nullcontext(), finalize_source_mounts(context.config, ephemeral=context.config.build_sources_ephemeral) as sources, + finalize_config_json(context.config) as json, ): if build: step_msg = "Running prepare script {} in build overlay…" @@ -579,45 +635,30 @@ def run_prepare_scripts(context: Context, build: bool) -> None: arg = "final" for script in context.config.prepare_scripts: - chroot = chroot_cmd(resolve=True, work=True) - - helpers = { - "mkosi-chroot": chroot, - "mkosi-as-caller": mkosi_as_caller(), - **context.config.distribution.package_manager(context.config).scripts(context), - } + with complete_step(step_msg.format(script)): + options: list[PathString] = [ + "--ro-bind", script, "/work/prepare", + "--ro-bind", json, "/work/config.json", + "--bind", context.artifacts, "/work/artifacts", + "--bind", context.package_dir, "/work/packages", + *( + ["--ro-bind", str(context.config.build_dir), "/work/build"] + if context.config.build_dir + else [] + ), + *sources, + ] - with ( - finalize_host_scripts(context, helpers) as hd, - finalize_config_json(context.config) as json, - complete_step(step_msg.format(script)), - ): run( ["/work/prepare", arg], - env=env | context.config.environment, + env=env, stdin=sys.stdin, - sandbox=context.sandbox( - binary=None, + sandbox=script_maybe_chroot_sandbox( + context, + script=script, + options=options, network=True, - vartmp=True, - mounts=[ - *sources, - Mount(script, "/work/prepare", ro=True), - Mount(json, "/work/config.json", ro=True), - Mount(context.root, "/buildroot"), - Mount(context.artifacts, "/work/artifacts"), - 
Mount(context.package_dir, "/work/packages"), - *( - [Mount(context.config.build_dir, "/work/build", ro=True)] - if context.config.build_dir - else [] - ), - *context.config.distribution.package_manager(context.config).mounts(context), - ], - options=["--dir", "/work/src", "--chdir", "/work/src"], - scripts=hd, - extra=chroot if script.suffix == ".chroot" else [], - ) + ), ) @@ -641,8 +682,8 @@ def run_build_scripts(context: Context) -> None: ARTIFACTDIR="/work/artifacts", SCRIPT="/work/build-script", CHROOT_SCRIPT="/work/build-script", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", WITH_DOCS=one_zero(context.config.with_docs), WITH_NETWORK=one_zero(context.config.with_network), @@ -659,53 +700,41 @@ def run_build_scripts(context: Context) -> None: CHROOT_BUILDDIR="/work/build", ) + env |= context.config.environment + with ( mount_build_overlay(context, volatile=True), finalize_source_mounts(context.config, ephemeral=context.config.build_sources_ephemeral) as sources, + finalize_config_json(context.config) as json, ): for script in context.config.build_scripts: - chroot = chroot_cmd(resolve=context.config.with_network, work=True) - - helpers = { - "mkosi-chroot": chroot, - "mkosi-as-caller": mkosi_as_caller(), - **context.config.distribution.package_manager(context.config).scripts(context), - } - cmdline = context.args.cmdline if context.args.verb == Verb.build else [] - with ( - finalize_host_scripts(context, helpers) as hd, - finalize_config_json(context.config) as json, - complete_step(f"Running build script {script}…"), - ): + with complete_step(f"Running build script {script}…"): + options: list[PathString] = [ + "--ro-bind", script, "/work/build-script", + "--ro-bind", json, "/work/config.json", + "--bind", context.install_dir, "/work/dest", + "--bind", context.staging, "/work/out", + "--bind", context.artifacts, "/work/artifacts", + 
"--bind", context.package_dir, "/work/packages", + *( + ["--bind", str(context.config.build_dir), "/work/build"] + if context.config.build_dir + else [] + ), + *sources, + ] + run( ["/work/build-script", *cmdline], - env=env | context.config.environment, + env=env, stdin=sys.stdin, - sandbox=context.sandbox( - binary=None, + sandbox=script_maybe_chroot_sandbox( + context, + script=script, + options=options, network=context.config.with_network, - vartmp=True, - mounts=[ - *sources, - Mount(script, "/work/build-script", ro=True), - Mount(json, "/work/config.json", ro=True), - Mount(context.root, "/buildroot"), - Mount(context.install_dir, "/work/dest"), - Mount(context.staging, "/work/out"), - Mount(context.artifacts, "/work/artifacts"), - Mount(context.package_dir, "/work/packages"), - *( - [Mount(context.config.build_dir, "/work/build")] - if context.config.build_dir - else [] - ), - *context.config.distribution.package_manager(context.config).mounts(context), - ], - options=["--dir", "/work/src", "--chdir", "/work/src"], - scripts=hd, - extra=chroot if script.suffix == ".chroot" else [], ), ) @@ -728,8 +757,8 @@ def run_postinst_scripts(context: Context) -> None: CHROOT_SRCDIR="/work/src", PACKAGEDIR="/work/packages", ARTIFACTDIR="/work/artifacts", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", WITH_NETWORK=one_zero(context.config.with_network), **GIT_ENV, @@ -741,49 +770,37 @@ def run_postinst_scripts(context: Context) -> None: if context.config.build_dir is not None: env |= dict(BUILDDIR="/work/build") + env |= context.config.environment + with ( finalize_source_mounts(context.config, ephemeral=context.config.build_sources_ephemeral) as sources, + finalize_config_json(context.config) as json, ): for script in context.config.postinst_scripts: - chroot = chroot_cmd(resolve=context.config.with_network, work=True) - - helpers = { - "mkosi-chroot": 
chroot, - "mkosi-as-caller": mkosi_as_caller(), - **context.config.distribution.package_manager(context.config).scripts(context), - } + with complete_step(f"Running postinstall script {script}…"): + options: list[PathString] = [ + "--ro-bind", script, "/work/postinst", + "--ro-bind", json, "/work/config.json", + "--bind", context.staging, "/work/out", + "--bind", context.artifacts, "/work/artifacts", + "--bind", context.package_dir, "/work/packages", + *( + ["--ro-bind", str(context.config.build_dir), "/work/build"] + if context.config.build_dir + else [] + ), + *sources, + ] - with ( - finalize_host_scripts(context, helpers) as hd, - finalize_config_json(context.config) as json, - complete_step(f"Running postinstall script {script}…"), - ): run( ["/work/postinst", "final"], - env=env | context.config.environment, + env=env, stdin=sys.stdin, - sandbox=context.sandbox( - binary=None, + sandbox=script_maybe_chroot_sandbox( + context, + script=script, + options=options, network=context.config.with_network, - vartmp=True, - mounts=[ - *sources, - Mount(script, "/work/postinst", ro=True), - Mount(json, "/work/config.json", ro=True), - Mount(context.root, "/buildroot"), - Mount(context.staging, "/work/out"), - Mount(context.artifacts, "/work/artifacts"), - Mount(context.package_dir, "/work/packages"), - *( - [Mount(context.config.build_dir, "/work/build", ro=True)] - if context.config.build_dir - else [] - ), - *context.config.distribution.package_manager(context.config).mounts(context), - ], - options=["--dir", "/work/src", "--chdir", "/work/src"], - scripts=hd, - extra=chroot if script.suffix == ".chroot" else [], ), ) @@ -806,8 +823,8 @@ def run_finalize_scripts(context: Context) -> None: ARTIFACTDIR="/work/artifacts", SCRIPT="/work/finalize", CHROOT_SCRIPT="/work/finalize", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", 
WITH_NETWORK=one_zero(context.config.with_network), **GIT_ENV, @@ -819,48 +836,38 @@ def run_finalize_scripts(context: Context) -> None: if context.config.build_dir is not None: env |= dict(BUILDDIR="/work/build") - with finalize_source_mounts(context.config, ephemeral=context.config.build_sources_ephemeral) as sources: - for script in context.config.finalize_scripts: - chroot = chroot_cmd(resolve=context.config.with_network, work=True) + env |= context.config.environment - helpers = { - "mkosi-chroot": chroot, - "mkosi-as-caller": mkosi_as_caller(), - **context.config.distribution.package_manager(context.config).scripts(context), - } + with ( + finalize_source_mounts(context.config, ephemeral=context.config.build_sources_ephemeral) as sources, + finalize_config_json(context.config) as json, + ): + for script in context.config.finalize_scripts: + with complete_step(f"Running finalize script {script}…"): + options: list[PathString] = [ + "--ro-bind", script, "/work/finalize", + "--ro-bind", json, "/work/config.json", + "--bind", context.staging, "/work/out", + "--bind", context.artifacts, "/work/artifacts", + "--bind", context.package_dir, "/work/packages", + *( + ["--ro-bind", str(context.config.build_dir), "/work/build"] + if context.config.build_dir + else [] + ), + *sources, + ] - with ( - finalize_host_scripts(context, helpers) as hd, - finalize_config_json(context.config) as json, - complete_step(f"Running finalize script {script}…"), - ): run( ["/work/finalize"], - env=env | context.config.environment, + env=env, stdin=sys.stdin, - sandbox=context.sandbox( - binary=None, + sandbox=script_maybe_chroot_sandbox( + context, + script=script, + options=options, network=context.config.with_network, - vartmp=True, - mounts=[ - *sources, - Mount(script, "/work/finalize", ro=True), - Mount(json, "/work/config.json", ro=True), - Mount(context.root, "/buildroot"), - Mount(context.staging, "/work/out"), - Mount(context.artifacts, "/work/artifacts"), - 
Mount(context.package_dir, "/work/packages"), - *( - [Mount(context.config.build_dir, "/work/build", ro=True)] - if context.config.build_dir - else [] - ), - *context.config.distribution.package_manager(context.config).mounts(context), - ], - options=["--dir", "/work/src", "--chdir", "/work/src"], - scripts=hd, - extra=chroot if script.suffix == ".chroot" else [], - ), + ) ) @@ -875,8 +882,8 @@ def run_postoutput_scripts(context: Context) -> None: DISTRIBUTION_ARCHITECTURE=context.config.distribution.architecture(context.config.architecture), SRCDIR="/work/src", OUTPUTDIR="/work/out", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", ) @@ -895,13 +902,18 @@ def run_postoutput_scripts(context: Context) -> None: sandbox=context.sandbox( binary=None, vartmp=True, - mounts=[ + # postoutput scripts should run as (fake) root so that file ownership is always recorded as if + # owned by root. 
+ options=[ + "--ro-bind", script, "/work/postoutput", + "--ro-bind", json, "/work/config.json", + "--bind", context.staging, "/work/out", + "--dir", "/work/src", + "--chdir", "/work/src", + "--dir", "/work/out", + "--become-root", *sources, - Mount(script, "/work/postoutput", ro=True), - Mount(json, "/work/config.json", ro=True), - Mount(context.staging, "/work/out"), - ], - options=["--dir", "/work/src", "--chdir", "/work/src", "--dir", "/work/out"] + ] ), stdin=sys.stdin, ) @@ -918,7 +930,7 @@ def certificate_common_name(context: Context, certificate: Path) -> str: "-in", certificate, ], stdout=subprocess.PIPE, - sandbox=context.sandbox(binary="openssl", mounts=[Mount(certificate, certificate, ro=True)]), + sandbox=context.sandbox(binary="openssl", options=["--ro-bind", certificate, certificate]), ).stdout for line in output.splitlines(): @@ -963,9 +975,9 @@ def pesign_prepare(context: Context) -> None: stdout=f, sandbox=context.sandbox( binary="openssl", - mounts=[ - Mount(context.config.secure_boot_key, context.config.secure_boot_key, ro=True), - Mount(context.config.secure_boot_certificate, context.config.secure_boot_certificate, ro=True), + options=[ + "--ro-bind", context.config.secure_boot_key, context.config.secure_boot_key, + "--ro-bind", context.config.secure_boot_certificate, context.config.secure_boot_certificate, ], ), ) @@ -982,9 +994,9 @@ def pesign_prepare(context: Context) -> None: ], sandbox=context.sandbox( binary="pk12util", - mounts=[ - Mount(context.workspace / "secure-boot.p12", context.workspace / "secure-boot.p12", ro=True), - Mount(context.workspace / "pesign", context.workspace / "pesign"), + options=[ + "--ro-bind", context.workspace / "secure-boot.p12", context.workspace / "secure-boot.p12", + "--ro-bind", context.workspace / "pesign", context.workspace / "pesign", ], ), ) @@ -1022,21 +1034,21 @@ def sign_efi_binary(context: Context, input: Path, output: Path) -> Path: "--cert", context.config.secure_boot_certificate, "--output", 
"/dev/stdout", ] - mounts = [ - Mount(context.config.secure_boot_certificate, context.config.secure_boot_certificate, ro=True), - Mount(input, input, ro=True), + options: list[PathString] = [ + "--ro-bind", context.config.secure_boot_certificate, context.config.secure_boot_certificate, + "--ro-bind", input, input, ] if context.config.secure_boot_key_source.type == KeySource.Type.engine: cmd += ["--engine", context.config.secure_boot_key_source.source] if context.config.secure_boot_key.exists(): - mounts += [Mount(context.config.secure_boot_key, context.config.secure_boot_key, ro=True)] + options += ["--ro-bind", context.config.secure_boot_key, context.config.secure_boot_key] cmd += [input] run( cmd, stdout=f, sandbox=context.sandbox( binary="sbsign", - mounts=mounts, + options=options, devices=context.config.secure_boot_key_source.type != KeySource.Type.file, ) ) @@ -1063,9 +1075,9 @@ def sign_efi_binary(context: Context, input: Path, output: Path) -> Path: stdout=f, sandbox=context.sandbox( binary="pesign", - mounts=[ - Mount(context.workspace / "pesign", context.workspace / "pesign", ro=True), - Mount(input, input, ro=True), + options=[ + "--ro-bind", context.workspace / "pesign", context.workspace / "pesign", + "--ro-bind", input, input, ] ), ) @@ -1110,7 +1122,7 @@ def install_systemd_boot(context: Context) -> None: run( ["bootctl", "install", "--root=/buildroot", "--all-architectures", "--no-variables"], env={"SYSTEMD_ESP_PATH": "/efi", "SYSTEMD_XBOOTLDR_PATH": "/boot"}, - sandbox=context.sandbox(binary="bootctl", mounts=[Mount(context.root, "/buildroot")]), + sandbox=context.sandbox(binary="bootctl", options=["--bind", context.root, "/buildroot"]), ) # TODO: Use --random-seed=no when we can depend on systemd 256. 
Path(context.root / "efi/loader/random-seed").unlink(missing_ok=True) @@ -1142,12 +1154,10 @@ def install_systemd_boot(context: Context) -> None: stdout=f, sandbox=context.sandbox( binary="openssl", - mounts=[ - Mount( - context.config.secure_boot_certificate, - context.config.secure_boot_certificate, - ro=True - ), + options=[ + "--ro-bind", + context.config.secure_boot_certificate, + context.config.secure_boot_certificate, ], ), ) @@ -1164,7 +1174,7 @@ def install_systemd_boot(context: Context) -> None: stdout=f, sandbox=context.sandbox( binary="sbsiglist", - mounts=[Mount(context.workspace / "mkosi.der", context.workspace / "mkosi.der", ro=True)] + options=["--ro-bind", context.workspace / "mkosi.der", context.workspace / "mkosi.der"] ), ) @@ -1179,25 +1189,21 @@ def install_systemd_boot(context: Context) -> None: "--cert", context.config.secure_boot_certificate, "--output", "/dev/stdout", ] - mounts = [ - Mount( - context.config.secure_boot_certificate, - context.config.secure_boot_certificate, - ro=True - ), - Mount(context.workspace / "mkosi.esl", context.workspace / "mkosi.esl", ro=True), + options: list[PathString] = [ + "--ro-bind", context.config.secure_boot_certificate, context.config.secure_boot_certificate, + "--ro-bind", context.workspace / "mkosi.esl", context.workspace / "mkosi.esl", ] if context.config.secure_boot_key_source.type == KeySource.Type.engine: cmd += ["--engine", context.config.secure_boot_key_source.source] if context.config.secure_boot_key.exists(): - mounts += [Mount(context.config.secure_boot_key, context.config.secure_boot_key, ro=True)] + options += ["--ro-bind", context.config.secure_boot_key, context.config.secure_boot_key] cmd += [db, context.workspace / "mkosi.esl"] run( cmd, stdout=f, sandbox=context.sandbox( binary="sbvarsign", - mounts=mounts, + options=options, devices=context.config.secure_boot_key_source.type != KeySource.Type.file, ), ) @@ -1465,11 +1471,11 @@ def grub_mkimage( ], sandbox=context.sandbox( 
binary=mkimage, - mounts=[ - Mount(directory, "/grub"), - Mount(earlyconfig.name, earlyconfig.name, ro=True), - *([Mount(output.parent, output.parent)] if output else []), - *([Mount(str(sbat), str(sbat), ro=True)] if sbat else []), + options=[ + "--bind", directory, "/grub", + "--ro-bind", earlyconfig.name, earlyconfig.name, + *(["--bind", str(output.parent), str(output.parent)] if output else []), + *(["--ro-bind", str(sbat), str(sbat)] if sbat else []), ], ), ) @@ -1565,8 +1571,6 @@ def grub_bios_setup(context: Context, partitions: Sequence[Partition]) -> None: mountinfo.write(f"1 0 1:1 / / - fat {context.staging / context.config.output_with_format}\n") mountinfo.flush() - # We don't setup the mountinfo bind mount with bwrap because we need to know the child process pid to - # be able to do the mount and we don't know the pid beforehand. run( [ setup, @@ -1575,12 +1579,11 @@ def grub_bios_setup(context: Context, partitions: Sequence[Partition]) -> None: ], sandbox=context.sandbox( binary=setup, - mounts=[ - Mount(directory, "/grub"), - Mount(context.staging, context.staging), - Mount(mountinfo.name, mountinfo.name), + options=[ + "--bind", directory, "/grub", + "--bind", context.staging, context.staging, + "--bind", mountinfo.name, "/proc/self/mountinfo", ], - extra=["sh", "-c", f"mount --bind {mountinfo.name} /proc/$$/mountinfo && exec $0 \"$@\""], ), ) @@ -1621,7 +1624,7 @@ def copy() -> None: binary="systemd-dissect", devices=True, network=True, - mounts=[Mount(src, src, ro=True), Mount(t.parent, t.parent)], + options=["--ro-bind", src, src, "--bind", t.parent, t.parent], ), ) else: @@ -1651,28 +1654,16 @@ def install_package_manager_trees(context: Context) -> None: # Ensure /etc exists in the package manager tree (context.pkgmngr / "etc").mkdir(exist_ok=True) - # Backwards compatibility symlink. - (context.pkgmngr / "etc/mtab").symlink_to("../proc/self/mounts") - # Required to be able to access certificates in the sandbox when running from nix. 
if Path("/etc/static").is_symlink(): (context.pkgmngr / "etc/static").symlink_to(Path("/etc/static").readlink()) (context.pkgmngr / "var/log").mkdir(parents=True) - with (context.pkgmngr / "etc/passwd").open("w") as passwd: - passwd.write("root:x:0:0:root:/root:/bin/sh\n") - if INVOKING_USER.uid != 0: - name = INVOKING_USER.name() - home = INVOKING_USER.home() - passwd.write(f"{name}:x:{INVOKING_USER.uid}:{INVOKING_USER.gid}:{name}:{home}:/bin/sh\n") - os.fchown(passwd.fileno(), INVOKING_USER.uid, INVOKING_USER.gid) - - with (context.pkgmngr / "etc/group").open("w") as group: - group.write("root:x:0:\n") - if INVOKING_USER.uid != 0: - group.write(f"{INVOKING_USER.name()}:x:{INVOKING_USER.gid}:\n") - os.fchown(group.fileno(), INVOKING_USER.uid, INVOKING_USER.gid) + if Path("/etc/passwd").exists(): + shutil.copy("/etc/passwd", context.pkgmngr / "etc/passwd") + if Path("/etc/group").exists(): + shutil.copy("/etc/group", context.pkgmngr / "etc/group") if (p := context.config.tools() / "etc/crypto-policies").exists(): copy_tree( @@ -1824,7 +1815,6 @@ def finalize_default_initrd( *(["--package-cache-dir", str(config.package_cache_dir)] if config.package_cache_dir else []), *(["--local-mirror", str(config.local_mirror)] if config.local_mirror else []), "--incremental", str(config.incremental), - "--acl", str(config.acl), *(f"--package={package}" for package in config.initrd_packages), *(f"--volatile-package={package}" for package in config.initrd_volatile_packages), *(f"--package-directory={d}" for d in config.package_directories), @@ -2017,7 +2007,6 @@ def build_kernel_modules_initrd(context: Context, kver: str) -> Path: host=context.config.kernel_modules_initrd_include_host, ), exclude=context.config.kernel_modules_initrd_exclude, - sandbox=context.sandbox, ), sandbox=context.sandbox, ) @@ -2068,7 +2057,7 @@ def python_binary(config: Config, *, binary: Optional[PathString]) -> str: # If there's no tools tree, prefer the interpreter from MKOSI_INTERPRETER.
If there is a tools # tree, just use the default python3 interpreter. - return "python3" if tools and config.tools_tree else os.getenv("MKOSI_INTERPRETER", "python3") + return "python3" if tools and config.tools_tree else sys.executable def extract_pe_section(context: Context, binary: Path, section: str, output: Path) -> Path: @@ -2098,8 +2087,8 @@ def extract_pe_section(context: Context, binary: Path, section: str, output: Pat stdout=f, sandbox=context.sandbox( binary=python_binary(context.config, binary=None), - mounts=[Mount(binary, binary, ro=True), - ]), + options=["--ro-bind", binary, binary], + ), success_exit_status=(0, 67), ) if result.returncode == 67: @@ -2149,11 +2138,11 @@ def build_uki( "--uname", kver, ] - mounts = [ - Mount(output.parent, output.parent), - Mount(context.workspace / "cmdline", context.workspace / "cmdline", ro=True), - Mount(context.root / "usr/lib/os-release", context.root / "usr/lib/os-release", ro=True), - Mount(stub, stub, ro=True), + options: list[PathString] = [ + "--bind", output.parent, output.parent, + "--ro-bind", context.workspace / "cmdline", context.workspace / "cmdline", + "--ro-bind", context.root / "usr/lib/os-release", context.root / "usr/lib/os-release", + "--ro-bind", stub, stub, ] if context.config.secure_boot: @@ -2170,13 +2159,13 @@ def build_uki( "--secureboot-certificate", context.config.secure_boot_certificate, ] - mounts += [ - Mount(context.config.secure_boot_certificate, context.config.secure_boot_certificate, ro=True), + options += [ + "--ro-bind", context.config.secure_boot_certificate, context.config.secure_boot_certificate, ] if context.config.secure_boot_key_source.type == KeySource.Type.engine: cmd += ["--signing-engine", context.config.secure_boot_key_source.source] if context.config.secure_boot_key.exists(): - mounts += [Mount(context.config.secure_boot_key, context.config.secure_boot_key, ro=True)] + options += ["--ro-bind", context.config.secure_boot_key, context.config.secure_boot_key] else: 
pesign_prepare(context) cmd += [ @@ -2186,7 +2175,7 @@ def build_uki( "--secureboot-certificate-name", certificate_common_name(context, context.config.secure_boot_certificate), ] - mounts += [Mount(context.workspace / "pesign", context.workspace / "pesign", ro=True)] + options += ["--ro-bind", context.workspace / "pesign", context.workspace / "pesign"] if want_signed_pcrs(context.config): cmd += [ @@ -2196,18 +2185,18 @@ def build_uki( "--pcr-banks", "sha256", ] if context.config.secure_boot_key.exists(): - mounts += [Mount(context.config.secure_boot_key, context.config.secure_boot_key)] + options += ["--bind", context.config.secure_boot_key, context.config.secure_boot_key] if context.config.secure_boot_key_source.type == KeySource.Type.engine: cmd += [ "--signing-engine", context.config.secure_boot_key_source.source, "--pcr-public-key", context.config.secure_boot_certificate, ] - mounts += [ - Mount(context.config.secure_boot_certificate, context.config.secure_boot_certificate, ro=True), + options += [ + "--ro-bind", context.config.secure_boot_certificate, context.config.secure_boot_certificate, ] cmd += ["build", "--linux", kimg] - mounts += [Mount(kimg, kimg, ro=True)] + options += ["--ro-bind", kimg, kimg] if microcodes: # new .ucode section support? 
@@ -2222,20 +2211,20 @@ def build_uki( ): for microcode in microcodes: cmd += ["--microcode", microcode] - mounts += [Mount(microcode, microcode, ro=True)] + options += ["--ro-bind", microcode, microcode] else: initrds = microcodes + initrds for initrd in initrds: cmd += ["--initrd", initrd] - mounts += [Mount(initrd, initrd, ro=True)] + options += ["--ro-bind", initrd, initrd] with complete_step(f"Generating unified kernel image for kernel version {kver}"): run( cmd, sandbox=context.sandbox( binary=ukify, - mounts=mounts, + options=options, devices=context.config.secure_boot_key_source.type != KeySource.Type.file, ), ) @@ -2320,7 +2309,7 @@ def find_entry_token(context: Context) -> str: output = json.loads( run( ["kernel-install", "--root=/buildroot", "--json=pretty", "inspect"], - sandbox=context.sandbox(binary="kernel-install", mounts=[Mount(context.root, "/buildroot", ro=True)]), + sandbox=context.sandbox(binary="kernel-install", options=["--ro-bind", context.root, "/buildroot"]), stdout=subprocess.PIPE, env={"BOOT_ROOT": "/boot"}, ).stdout @@ -2756,13 +2745,11 @@ def calculate_signature(context: Context) -> None: if sys.stderr.isatty(): env |= dict(GPGTTY=os.ttyname(sys.stderr.fileno())) - options: list[PathString] = ["--perms", "755", "--dir", home] - mounts = [Mount(home, home)] + options: list[PathString] = ["--bind", home, home] # gpg can communicate with smartcard readers via this socket so bind mount it in if it exists. if (p := Path("/run/pcscd/pcscd.comm")).exists(): - options += ["--perms", "755", "--dir", p.parent] - mounts += [Mount(p, p)] + options += ["--bind", p, p] with ( complete_step("Signing SHA256SUMS…"), @@ -2774,12 +2761,9 @@ def calculate_signature(context: Context) -> None: env=env, stdin=i, stdout=o, - # GPG messes with the user's home directory so we run it as the invoking user. 
sandbox=context.sandbox( binary="gpg", - mounts=mounts, options=options, - extra=["setpriv", f"--reuid={INVOKING_USER.uid}", f"--regid={INVOKING_USER.gid}", "--clear-groups"], ) ) @@ -2938,8 +2922,6 @@ def check_ukify( def check_tools(config: Config, verb: Verb) -> None: - check_tool(config, "bwrap", reason="execute sandboxed commands") - if verb == Verb.build: if config.bootable != ConfigFeature.disabled: check_tool(config, "depmod", reason="generate kernel module dependencies") @@ -3106,18 +3088,10 @@ def run_depmod(context: Context, *, cache: bool = False) -> None: host=context.config.kernel_modules_include_host, ), exclude=context.config.kernel_modules_exclude, - sandbox=context.sandbox, ) with complete_step(f"Running depmod for {kver}"): - run( - ["depmod", "--all", kver], - sandbox=context.sandbox( - binary=None, - mounts=[Mount(context.root, "/buildroot")], - extra=chroot_cmd(), - ) - ) + run(["depmod", "--all", kver], sandbox=chroot_cmd(root=context.root)) def run_sysusers(context: Context) -> None: @@ -3130,7 +3104,7 @@ def run_sysusers(context: Context) -> None: with complete_step("Generating system users"): run(["systemd-sysusers", "--root=/buildroot"], - sandbox=context.sandbox(binary="systemd-sysusers", mounts=[Mount(context.root, "/buildroot")])) + sandbox=context.sandbox(binary="systemd-sysusers", options=["--bind", context.root, "/buildroot"])) def run_tmpfiles(context: Context) -> None: @@ -3151,6 +3125,8 @@ def run_tmpfiles(context: Context) -> None: "--remove", # Exclude APIVFS and temporary files directories. 
*(f"--exclude-prefix={d}" for d in ("/tmp", "/var/tmp", "/run", "/proc", "/sys", "/dev")), + # Exclude /var if we're not invoked as root as all the chown()'s for daemon owned directories will fail + *(["--exclude-prefix=/var"] if os.getuid() != 0 or userns_has_single_user() else []), ], env={"SYSTEMD_TMPFILES_FORCE_SUBVOL": "0"}, # systemd-tmpfiles can exit with DATAERR or CANTCREAT in some cases which are handled as success by the @@ -3158,11 +3134,14 @@ def run_tmpfiles(context: Context) -> None: success_exit_status=(0, 65, 73), sandbox=context.sandbox( binary="systemd-tmpfiles", - mounts=[ - Mount(context.root, "/buildroot"), + options=[ + "--bind", context.root, "/buildroot", # systemd uses acl.h to parse ACLs in tmpfiles snippets which uses the host's passwd so we have to # mount the image's passwd over it to make ACL parsing work. - *finalize_passwd_mounts(context.root) + *finalize_passwd_mounts(context.root), + # Sometimes directories are configured to be owned by root in tmpfiles snippets so we want to make + # sure those chown()'s succeed by making ourselves the root user so that the root user exists. 
+ "--become-root", ], ), ) @@ -3178,9 +3157,9 @@ def run_preset(context: Context) -> None: with complete_step("Applying presets…"): run(["systemctl", "--root=/buildroot", "preset-all"], - sandbox=context.sandbox(binary="systemctl", mounts=[Mount(context.root, "/buildroot")])) + sandbox=context.sandbox(binary="systemctl", options=["--bind", context.root, "/buildroot"])) run(["systemctl", "--root=/buildroot", "--global", "preset-all"], - sandbox=context.sandbox(binary="systemctl", mounts=[Mount(context.root, "/buildroot")])) + sandbox=context.sandbox(binary="systemctl", options=["--bind", context.root, "/buildroot"])) def run_hwdb(context: Context) -> None: @@ -3193,7 +3172,7 @@ def run_hwdb(context: Context) -> None: with complete_step("Generating hardware database"): run(["systemd-hwdb", "--root=/buildroot", "--usr", "--strict", "update"], - sandbox=context.sandbox(binary="systemd-hwdb", mounts=[Mount(context.root, "/buildroot")])) + sandbox=context.sandbox(binary="systemd-hwdb", options=["--bind", context.root, "/buildroot"])) # Remove any existing hwdb in /etc in favor of the one we just put in /usr. (context.root / "etc/udev/hwdb.bin").unlink(missing_ok=True) @@ -3244,7 +3223,7 @@ def run_firstboot(context: Context) -> None: with complete_step("Applying first boot settings"): run(["systemd-firstboot", "--root=/buildroot", "--force", *options], - sandbox=context.sandbox(binary="systemd-firstboot", mounts=[Mount(context.root, "/buildroot")])) + sandbox=context.sandbox(binary="systemd-firstboot", options=["--bind", context.root, "/buildroot"])) # Initrds generally don't ship with only /usr so there's not much point in putting the credentials in # /usr/lib/credstore. 
@@ -3267,7 +3246,7 @@ def run_selinux_relabel(context: Context) -> None: with complete_step(f"Relabeling files using {policy} policy"): run([setfiles, "-mFr", "/buildroot", "-c", binpolicy, fc, "/buildroot"], - sandbox=context.sandbox(binary=setfiles, mounts=[Mount(context.root, "/buildroot")]), + sandbox=context.sandbox(binary=setfiles, options=["--bind", context.root, "/buildroot"]), check=context.config.selinux_relabel == ConfigFeature.enabled) @@ -3328,7 +3307,7 @@ def have_cache(config: Config) -> bool: logging.info("Cache manifest mismatch, not reusing cached images") if ARG_DEBUG.get(): run(["diff", manifest, "-"], input=new, check=False, - sandbox=config.sandbox(binary="diff", mounts=[Mount(manifest, manifest)])) + sandbox=config.sandbox(binary="diff", options=["--bind", manifest, manifest])) return False else: @@ -3404,27 +3383,32 @@ def make_image( "--seed", str(context.config.seed), context.staging / context.config.output_with_format, ] - mounts = [Mount(context.staging, context.staging)] + options: list[PathString] = [ + # Make sure we're root so that the mkfs tools invoked by systemd-repart think the files that go + # into the disk image are owned by root. 
+ "--become-root", + "--bind", context.staging, context.staging, + ] if root: cmdline += ["--root=/buildroot"] - mounts += [Mount(root, "/buildroot")] + options += ["--bind", root, "/buildroot"] if not context.config.architecture.is_native(): cmdline += ["--architecture", str(context.config.architecture)] if not (context.staging / context.config.output_with_format).exists(): cmdline += ["--empty=create"] if context.config.passphrase: cmdline += ["--key-file", context.config.passphrase] - mounts += [Mount(context.config.passphrase, context.config.passphrase, ro=True)] + options += ["--ro-bind", context.config.passphrase, context.config.passphrase] if context.config.verity_key: cmdline += ["--private-key", context.config.verity_key] if context.config.verity_key_source.type != KeySource.Type.file: cmdline += ["--private-key-source", str(context.config.verity_key_source)] if context.config.verity_key.exists(): - mounts += [Mount(context.config.verity_key, context.config.verity_key, ro=True)] + options += ["--ro-bind", context.config.verity_key, context.config.verity_key] if context.config.verity_certificate: cmdline += ["--certificate", context.config.verity_certificate] - mounts += [Mount(context.config.verity_certificate, context.config.verity_certificate, ro=True)] + options += ["--ro-bind", context.config.verity_certificate, context.config.verity_certificate] if skip: cmdline += ["--defer-partitions", ",".join(skip)] if split: @@ -3439,7 +3423,7 @@ def make_image( for d in definitions: cmdline += ["--definitions", d] - mounts += [Mount(d, d, ro=True)] + options += ["--ro-bind", d, d] with complete_step(msg): output = json.loads( @@ -3454,7 +3438,7 @@ def make_image( context.config.verity_key_source.type != KeySource.Type.file ), vartmp=True, - mounts=mounts, + options=options, ), ).stdout ) @@ -3696,26 +3680,29 @@ def make_extension_image(context: Context, output: Path) -> None: "--definitions", r, output, ] - mounts = [ - Mount(output.parent, output.parent), - 
Mount(context.root, "/buildroot", ro=True), - Mount(r, r, ro=True), + options: list[PathString] = [ + # Make sure we're root so that the mkfs tools invoked by systemd-repart think the files that go + # into the disk image are owned by root. + "--become-root", + "--bind", output.parent, output.parent, + "--ro-bind", context.root, "/buildroot", + "--ro-bind", r, r, ] if not context.config.architecture.is_native(): cmdline += ["--architecture", str(context.config.architecture)] if context.config.passphrase: cmdline += ["--key-file", context.config.passphrase] - mounts += [Mount(context.config.passphrase, context.config.passphrase, ro=True)] + options += ["--ro-bind", context.config.passphrase, context.config.passphrase] if context.config.verity_key: cmdline += ["--private-key", context.config.verity_key] if context.config.verity_key_source.type != KeySource.Type.file: cmdline += ["--private-key-source", str(context.config.verity_key_source)] if context.config.verity_key.exists(): - mounts += [Mount(context.config.verity_key, context.config.verity_key, ro=True)] + options += ["--ro-bind", context.config.verity_key, context.config.verity_key] if context.config.verity_certificate: cmdline += ["--certificate", context.config.verity_certificate] - mounts += [Mount(context.config.verity_certificate, context.config.verity_certificate, ro=True)] + options += ["--ro-bind", context.config.verity_certificate, context.config.verity_certificate] if context.config.sector_size: cmdline += ["--sector-size", str(context.config.sector_size)] if context.config.split_artifacts: @@ -3734,7 +3721,7 @@ def make_extension_image(context: Context, output: Path) -> None: context.config.verity_key_source.type != KeySource.Type.file ), vartmp=True, - mounts=mounts, + options=options, ), ).stdout ) @@ -3751,13 +3738,8 @@ def finalize_staging(context: Context) -> None: rmtree(*(context.config.output_dir_or_cwd() / f.name for f in context.staging.iterdir())) for f in context.staging.iterdir(): - # 
Make sure all build outputs that are not directories are owned by the user running mkosi. - if not f.is_dir(): - os.chown(f, INVOKING_USER.uid, INVOKING_USER.gid, follow_symlinks=False) - if f.is_symlink(): (context.config.output_dir_or_cwd() / f.name).symlink_to(f.readlink()) - os.chown(f, INVOKING_USER.uid, INVOKING_USER.gid, follow_symlinks=False) continue move_tree( @@ -3805,7 +3787,6 @@ def setup_workspace(args: Args, config: Config) -> Iterator[Path]: if args.debug_workspace: stack.pop_all() log_notice(f"Workspace: {workspace}") - workspace.chmod(0o755) raise @@ -3849,16 +3830,17 @@ def copy_repository_metadata(context: Context) -> None: # cp doesn't support excluding directories but we can imitate it by bind mounting an empty directory # over the directories we want to exclude. + exclude: list[PathString] if d == "cache": - exclude = [ - Mount(tmp, p, ro=True) + exclude = flatten( + ("--ro-bind", tmp, p) for p in context.config.distribution.package_manager(context.config).cache_subdirs(src) - ] + ) else: - exclude = [ - Mount(tmp, p, ro=True) + exclude = flatten( + ("--ro-bind", tmp, p) for p in context.config.distribution.package_manager(context.config).state_subdirs(src) - ] + ) dst = context.package_cache_dir / d / subdir with umask(~0o755): @@ -3868,10 +3850,9 @@ def sandbox( *, binary: Optional[PathString], vartmp: bool = False, - mounts: Sequence[Mount] = (), - extra: Sequence[PathString] = (), + options: Sequence[PathString] = (), ) -> AbstractContextManager[list[PathString]]: - return context.sandbox(binary=binary, vartmp=vartmp, mounts=[*mounts, *exclude], extra=extra) + return context.sandbox(binary=binary, vartmp=vartmp, options=[*options, *exclude]) copy_tree( src, dst, @@ -4041,84 +4022,6 @@ def build_image(context: Context) -> None: print_output_size(context.config.output_dir_or_cwd() / context.config.output_with_compression) -def setfacl(config: Config, root: Path, uid: int, allow: bool) -> None: - run( - [ - "setfacl", - "--physical", - 
"--modify" if allow else "--remove", - f"user:{uid}:rwx" if allow else f"user:{uid}", - "-", - ], - # Supply files via stdin so we don't clutter --debug run output too much - input="\n".join([str(root), *(os.fspath(p) for p in root.rglob("*") if p.is_dir())]), - sandbox=config.sandbox(binary="setfacl", mounts=[Mount(root, root)]), - ) - - -@contextlib.contextmanager -def acl_maybe_toggle(config: Config, root: Path, uid: int, *, always: bool) -> Iterator[None]: - if not config.acl: - yield - return - - # getfacl complains about absolute paths so make sure we pass a relative one. - if root.exists(): - sandbox = config.sandbox(binary="getfacl", mounts=[Mount(root, root)], options=["--chdir", root]) - has_acl = f"user:{uid}:rwx" in run(["getfacl", "-n", "."], sandbox=sandbox, stdout=subprocess.PIPE).stdout - - if not has_acl and not always: - yield - return - else: - has_acl = False - - try: - if has_acl: - with complete_step(f"Removing ACLs from {root}"): - setfacl(config, root, uid, allow=False) - - yield - finally: - if has_acl or always: - with complete_step(f"Adding ACLs to {root}"): - setfacl(config, root, uid, allow=True) - - -@contextlib.contextmanager -def acl_toggle_build(config: Config, uid: int) -> Iterator[None]: - if not config.acl: - yield - return - - extras = [t.source for t in config.extra_trees] - skeletons = [t.source for t in config.skeleton_trees] - - with contextlib.ExitStack() as stack: - for p in (*config.base_trees, *extras, *skeletons): - if p and p.is_dir(): - stack.enter_context(acl_maybe_toggle(config, p, uid, always=False)) - - for p in (config.cache_dir, config.build_dir): - if p: - stack.enter_context(acl_maybe_toggle(config, p, uid, always=True)) - - if config.output_format == OutputFormat.directory: - stack.enter_context(acl_maybe_toggle(config, config.output_dir_or_cwd() / config.output, uid, always=True)) - - yield - - -@contextlib.contextmanager -def acl_toggle_boot(config: Config, uid: int) -> Iterator[None]: - if not config.acl 
or config.output_format != OutputFormat.directory: - yield - return - - with acl_maybe_toggle(config, config.output_dir_or_cwd() / config.output, uid, always=False): - yield - - def run_shell(args: Args, config: Config) -> None: opname = "acquire shell in" if args.verb == Verb.shell else "boot" if config.output_format in (OutputFormat.tar, OutputFormat.cpio): @@ -4164,7 +4067,13 @@ def run_shell(args: Args, config: Config) -> None: stack.callback(lambda: (config.output_dir_or_cwd() / f"{name}.nspawn").unlink(missing_ok=True)) shutil.copy2(config.nspawn_settings, config.output_dir_or_cwd() / f"{name}.nspawn") - if config.ephemeral: + # If we're booting a directory image that wasn't built by root, we always make an ephemeral copy to avoid + # ending up with files not owned by the directory image owner in the directory image. + if config.ephemeral or ( + config.output_format == OutputFormat.directory and + args.verb == Verb.boot and + (config.output_dir_or_cwd() / config.output).stat().st_uid != 0 + ): fname = stack.enter_context(copy_ephemeral(config, config.output_dir_or_cwd() / config.output)) else: fname = stack.enter_context(flock_or_die(config.output_dir_or_cwd() / config.output)) @@ -4188,7 +4097,7 @@ def run_shell(args: Args, config: Config) -> None: network=True, devices=True, vartmp=True, - mounts=[Mount(fname, fname)], + options=["--bind", fname, fname], ), ) @@ -4197,15 +4106,22 @@ def run_shell(args: Args, config: Config) -> None: owner = os.stat(fname).st_uid if owner != 0: - cmdline += [f"--private-users={str(owner)}"] + # Let's allow running a shell in a non-ephemeral image but in that case only map a single user into the + # image so it can't get polluted with files or directories owned by other users.
+ if args.verb == Verb.shell and config.output_format == OutputFormat.directory and not config.ephemeral: + uid_range = 1 + else: + uid_range = 65536 + + cmdline += [f"--private-users={owner}:{uid_range}"] else: cmdline += ["--image", fname] if config.runtime_build_sources: - with finalize_source_mounts(config, ephemeral=False) as mounts: - for mount in mounts: - uidmap = "rootidmap" if Path(mount.src).stat().st_uid == INVOKING_USER.uid else "noidmap" - cmdline += ["--bind", f"{mount.src}:{mount.dst}:norbind,{uidmap}"] + for t in config.build_sources: + src, dst = t.with_prefix("/work/src") + uidmap = "rootidmap" if src.stat().st_uid != 0 else "noidmap" + cmdline += ["--bind", f"{src}:{dst}:norbind,{uidmap}"] if config.build_dir: cmdline += ["--bind", f"{config.build_dir}:/work/build:norbind,noidmap"] @@ -4217,7 +4133,7 @@ def run_shell(args: Args, config: Config) -> None: # source directory which would mean we'd be mounting the container root directory as a subdirectory in # itself which tends to lead to all kinds of weird issues, which we avoid by not doing a recursive mount # which means the container root directory mounts will be skipped.
- uidmap = "rootidmap" if tree.source.stat().st_uid == INVOKING_USER.uid else "noidmap" + uidmap = "rootidmap" if tree.source.stat().st_uid != 0 else "noidmap" cmdline += ["--bind", f"{tree.source}:{target}:norbind,{uidmap}"] if config.runtime_scratch == ConfigFeature.enabled or ( @@ -4314,7 +4230,6 @@ def run_systemd_tool(tool: str, args: Args, config: Config) -> None: stdout=sys.stdout, env=os.environ | config.environment, log=False, - preexec_fn=become_root if not config.forward_journal else None, sandbox=config.sandbox( binary=tool_path, network=True, @@ -4409,7 +4324,6 @@ def bump_image_version() -> None: logging.info(f"Bumping version: '{version}' → '{new_version}'") version_file.write_text(f"{new_version}\n") - os.chown(version_file, INVOKING_USER.uid, INVOKING_USER.gid) def show_docs(args: Args, *, resources: Path) -> None: @@ -4492,7 +4406,6 @@ def finalize_default_tools(args: Args, config: Config, *, resources: Path) -> Co *(["--cache-dir", str(config.cache_dir)] if config.cache_dir else []), *(["--package-cache-dir", str(config.package_cache_dir)] if config.package_cache_dir else []), "--incremental", str(config.incremental), - "--acl", str(config.acl), *([f"--package={package}" for package in config.tools_tree_packages]), "--output", f"{config.tools_tree_distribution}-tools", *(["--source-date-epoch", str(config.source_date_epoch)] if config.source_date_epoch is not None else []), @@ -4541,8 +4454,8 @@ def run_clean_scripts(config: Config) -> None: DISTRIBUTION_ARCHITECTURE=config.distribution.architecture(config.architecture), SRCDIR="/work/src", OUTPUTDIR="/work/out", - MKOSI_UID=str(INVOKING_USER.uid), - MKOSI_GID=str(INVOKING_USER.gid), + MKOSI_UID=str(os.getuid()), + MKOSI_GID=str(os.getgid()), MKOSI_CONFIG="/work/config.json", ) @@ -4562,13 +4475,15 @@ def run_clean_scripts(config: Config) -> None: binary=None, vartmp=True, tools=False, - mounts=[ + options=[ + "--dir", "/work/src", + "--chdir", "/work/src", + "--dir", "/work/out", + "--ro-bind", 
script, "/work/clean", + "--ro-bind", json, "/work/config.json", + *(["--bind", str(o), "/work/out"] if (o := config.output_dir_or_cwd()).exists() else []), *sources, - Mount(script, "/work/clean", ro=True), - Mount(json, "/work/config.json", ro=True), - *([Mount(o, "/work/out")] if (o := config.output_dir_or_cwd()).exists() else []), - ], - options=["--dir", "/work/src", "--chdir", "/work/src", "--dir", "/work/out"] + ] ), stdin=sys.stdin, ) @@ -4587,12 +4502,15 @@ def needs_clean(args: Args, config: Config, force: int = 1) -> bool: def run_clean(args: Args, config: Config, *, resources: Path) -> None: - become_root() - # We remove any cached images if either the user used --force twice, or he/she called "clean" with it # passed once. Let's also remove the downloaded package cache if the user specified one additional # "--force". + # We don't want to require a tools tree to run mkosi clean so we pass in a sandbox that disables use of the tools + # tree. We still need a sandbox as we need to acquire privileges to be able to remove various files from the + # rootfs. 
+ sandbox = functools.partial(config.sandbox, tools=False) + if args.verb == Verb.clean: remove_output_dir = config.output_format != OutputFormat.none remove_build_cache = args.force > 0 or args.wipe_build_dir @@ -4622,11 +4540,11 @@ def run_clean(args: Args, config: Config, *, resources: Path) -> None: if (config.output_dir_or_cwd() / config.output).exists() else contextlib.nullcontext() ): - rmtree(*outputs) + rmtree(*outputs, sandbox=sandbox) if remove_build_cache and config.build_dir and config.build_dir.exists() and any(config.build_dir.iterdir()): with complete_step(f"Clearing out build directory of {config.name()} image…"): - rmtree(*config.build_dir.iterdir()) + rmtree(*config.build_dir.iterdir(), sandbox=sandbox) if remove_image_cache and config.cache_dir: initrd = ( @@ -4637,7 +4555,7 @@ def run_clean(args: Args, config: Config, *, resources: Path) -> None: if any(p.exists() for p in itertools.chain(cache_tree_paths(config), initrd)): with complete_step(f"Removing cache entries of {config.name()} image…"): - rmtree(*(p for p in itertools.chain(cache_tree_paths(config), initrd) if p.exists())) + rmtree(*(p for p in itertools.chain(cache_tree_paths(config), initrd) if p.exists()), sandbox=sandbox) if remove_package_cache and any(config.package_cache_dir_or_default().glob("*")): subdir = config.distribution.package_manager(config).subdir(config) @@ -4651,23 +4569,12 @@ def run_clean(args: Args, config: Config, *, resources: Path) -> None: config.package_cache_dir_or_default() / d / subdir for d in ("cache", "lib") ), + sandbox=sandbox, ) run_clean_scripts(config) -@contextlib.contextmanager -def rchown_package_manager_dirs(config: Config) -> Iterator[None]: - try: - yield - finally: - if INVOKING_USER.is_regular_user(): - with complete_step("Fixing ownership of package manager cache directory"): - subdir = config.distribution.package_manager(config).subdir(config) - for d in ("cache", "lib"): - INVOKING_USER.rchown(config.package_cache_dir_or_default() / d / 
subdir) - - def sync_repository_metadata(context: Context) -> None: if ( context.config.cacheonly != Cacheonly.never and @@ -4686,11 +4593,6 @@ def sync_repository_metadata(context: Context) -> None: def run_sync(args: Args, config: Config, *, resources: Path) -> None: - if os.getuid() == 0: - os.setgroups(INVOKING_USER.extra_groups()) - os.setresgid(INVOKING_USER.gid, INVOKING_USER.gid, INVOKING_USER.gid) - os.setresuid(INVOKING_USER.uid, INVOKING_USER.uid, INVOKING_USER.uid) - if not (p := config.package_cache_dir_or_default()).exists(): p.mkdir(parents=True, exist_ok=True) @@ -4724,11 +4626,13 @@ def run_sync(args: Args, config: Config, *, resources: Path) -> None: def run_build(args: Args, config: Config, *, resources: Path, package_dir: Optional[Path] = None) -> None: - if (uid := os.getuid()) != 0: - become_root() + if os.getuid() != 0: + acquire_privileges() + unshare(CLONE_NEWNS) - if uid == 0: - run(["mount", "--make-rslave", "/"]) + + if os.getuid() == 0: + mount("", "/", "", MS_SLAVE|MS_REC, "") for p in ( config.output_dir, @@ -4741,67 +4645,37 @@ def run_build(args: Args, config: Config, *, resources: Path, package_dir: Optio continue p.mkdir(parents=True, exist_ok=True) - INVOKING_USER.chown(p) if config.build_dir: - # Make sure the build directory is owned by root (in the user namespace) so that the correct uid-mapping is - # applied if it is used in RuntimeTrees= - os.chown(config.build_dir, os.getuid(), os.getgid()) - # Discard setuid/setgid bits as these are inherited and can leak into the image. config.build_dir.chmod(stat.S_IMODE(config.build_dir.stat().st_mode) & ~(stat.S_ISGID|stat.S_ISUID)) # For extra safety when running as root, remount a bunch of stuff read-only. # Because some build systems use output directories in /usr, we only remount # /usr read-only if the output directory is not relative to it. 
- if INVOKING_USER.invoked_as_root: + if os.getuid() == 0: remount = ["/etc", "/opt", "/boot", "/efi", "/media"] if not config.output_dir_or_cwd().is_relative_to("/usr"): remount += ["/usr"] for d in remount: - if Path(d).exists(): - options = "ro" if d in ("/usr", "/opt") else "ro,nosuid,nodev,noexec" - run(["mount", "--rbind", d, d, "--options", options]) + if not Path(d).exists(): + continue + + attrs = MOUNT_ATTR_RDONLY + if d not in ("/usr", "/opt"): + attrs |= MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC + + mount_rbind(d, d, attrs) with ( complete_step(f"Building {config.name()} image"), prepend_to_environ_path(config), - acl_toggle_build(config, INVOKING_USER.uid), - rchown_package_manager_dirs(config), setup_workspace(args, config) as workspace, ): build_image(Context(args, config, workspace=workspace, resources=resources, package_dir=package_dir)) -def ensure_root_is_mountpoint() -> None: - """ - bubblewrap uses pivot_root() which doesn't work in the initramfs as pivot_root() requires / to be a mountpoint - which is not the case in the initramfs. So, to make sure mkosi works from within the initramfs, let's make / a - mountpoint by recursively bind-mounting / (the directory) to another location and then switching root into the bind - mount directory. 
- """ - fstype = run( - ["findmnt", "--target", "/", "--output", "FSTYPE", "--noheadings"], - stdout=subprocess.PIPE, - ).stdout.strip() - - if fstype != "rootfs": - return - - if os.getuid() != 0: - die("mkosi can only be run as root from the initramfs") - - unshare(CLONE_NEWNS) - run(["mount", "--make-rslave", "/"]) - mountpoint = Path("/run/mkosi/mkosi-root") - mountpoint.mkdir(parents=True, exist_ok=True) - run(["mount", "--rbind", "/", mountpoint]) - os.chdir(mountpoint) - run(["mount", "--move", ".", "/"]) - os.chroot(".") - - def run_verb(args: Args, images: Sequence[Config], *, resources: Path) -> None: images = list(images) @@ -4849,8 +4723,6 @@ def run_verb(args: Args, images: Sequence[Config], *, resources: Path) -> None: page(text, args.pager) return - ensure_root_is_mountpoint() - if args.verb in (Verb.journalctl, Verb.coredumpctl, Verb.ssh): # We don't use a tools tree for verbs that don't need an image build. last = dataclasses.replace(images[-1], tools_tree=None) @@ -4960,21 +4832,24 @@ def run_verb(args: Args, images: Sequence[Config], *, resources: Path) -> None: die(f"Image '{last.name()}' has not been built yet", hint="Make sure to build the image first with 'mkosi build' or use '--force'") - with prepend_to_environ_path(last): - with ( - acl_toggle_boot(last, INVOKING_USER.uid) - if args.verb in (Verb.shell, Verb.boot) - else contextlib.nullcontext() - ): - run_vm = { - Vmm.qemu: run_qemu, - Vmm.vmspawn: run_vmspawn, - }[last.vmm] + if ( + last.output_format == OutputFormat.directory and + (last.output_dir_or_cwd() / last.output).stat().st_uid == 0 and + os.getuid() != 0 + ): + die("Cannot operate on directory images built as root when running unprivileged", + hint="Clean the root owned image by running mkosi -ff clean as root and then rebuild the image") - { - Verb.shell: run_shell, - Verb.boot: run_shell, - Verb.qemu: run_vm, - Verb.serve: run_serve, - Verb.burn: run_burn, - }[args.verb](args, last) + with prepend_to_environ_path(last): + 
run_vm = { + Vmm.qemu: run_qemu, + Vmm.vmspawn: run_vmspawn, + }[last.vmm] + + { + Verb.shell: run_shell, + Verb.boot: run_shell, + Verb.qemu: run_vm, + Verb.serve: run_serve, + Verb.burn: run_burn, + }[args.verb](args, last) diff --git a/mkosi/__main__.py b/mkosi/__main__.py index 032519fe6..c91550483 100644 --- a/mkosi/__main__.py +++ b/mkosi/__main__.py @@ -12,7 +12,6 @@ from mkosi.config import parse_config from mkosi.log import log_setup from mkosi.run import find_binary, run, uncaught_exception_handler -from mkosi.user import INVOKING_USER from mkosi.util import resource_path @@ -26,8 +25,6 @@ def main() -> None: signal.signal(signal.SIGHUP, onsignal) log_setup() - # Ensure that the name and home of the user we are running as are resolved as early as possible. - INVOKING_USER.init() with resource_path(mkosi.resources) as resources: args, images = parse_config(sys.argv[1:], resources=resources) diff --git a/mkosi/archive.py b/mkosi/archive.py index 9e4d2d523..154a82c94 100644 --- a/mkosi/archive.py +++ b/mkosi/archive.py @@ -6,10 +6,10 @@ from typing import Optional from mkosi.log import log_step -from mkosi.run import run -from mkosi.sandbox import Mount, SandboxProtocol, finalize_passwd_mounts, nosandbox +from mkosi.run import SandboxProtocol, finalize_passwd_mounts, nosandbox, run +from mkosi.sandbox import umask from mkosi.types import PathString -from mkosi.util import chdir, umask +from mkosi.util import chdir def tar_exclude_apivfs_tmp() -> list[str]: @@ -42,12 +42,14 @@ def make_tar(src: Path, dst: Path, *, sandbox: SandboxProtocol = nosandbox) -> N "--pax-option=delete=atime,delete=ctime,delete=mtime", "--sparse", "--force-local", + *(["--owner=root:0"] if os.getuid() != 0 else []), + *(["--group=root:0"] if os.getuid() != 0 else []), *tar_exclude_apivfs_tmp(), ".", ], stdout=f, # Make sure tar uses user/group information from the root directory instead of the host. 
- sandbox=sandbox(binary="tar", mounts=[Mount(src, src, ro=True), *finalize_passwd_mounts(src)]), + sandbox=sandbox(binary="tar", options=["--ro-bind", src, src, *finalize_passwd_mounts(src)]), ) @@ -78,7 +80,7 @@ def extract_tar( "--keep-directory-symlink", "--no-overwrite-dir", "--same-permissions", - "--same-owner" if (dst / "etc/passwd").exists() else "--numeric-owner", + "--same-owner" if (dst / "etc/passwd").exists() and os.getuid() == 0 else "--numeric-owner", "--same-order", "--acls", "--selinux", @@ -90,7 +92,7 @@ def extract_tar( sandbox=sandbox( binary="tar", # Make sure tar uses user/group information from the root directory instead of the host. - mounts=[Mount(src, src, ro=True), Mount(dst, dst), *finalize_passwd_mounts(dst)] + options=["--ro-bind", src, src, "--bind", dst, dst, *finalize_passwd_mounts(dst)] ), ) @@ -120,8 +122,9 @@ def make_cpio( "--format=newc", "--quiet", "--directory", src, + *(["--owner=0:0"] if os.getuid() != 0 else []), ], input="\0".join(os.fspath(f) for f in files), stdout=f, - sandbox=sandbox(binary="cpio", mounts=[Mount(src, src, ro=True), *finalize_passwd_mounts(src)]), + sandbox=sandbox(binary="cpio", options=["--ro-bind", src, src, *finalize_passwd_mounts(src)]), ) diff --git a/mkosi/config.py b/mkosi/config.py index 2c6a4c92e..30d0f21bb 100644 --- a/mkosi/config.py +++ b/mkosi/config.py @@ -34,8 +34,8 @@ from mkosi.distributions import Distribution, detect_distribution from mkosi.log import ARG_DEBUG, ARG_DEBUG_SHELL, Style, die from mkosi.pager import page -from mkosi.run import find_binary, run -from mkosi.sandbox import Mount, SandboxProtocol, nosandbox, sandbox_cmd +from mkosi.run import SandboxProtocol, find_binary, nosandbox, run, sandbox_cmd +from mkosi.sandbox import __version__ from mkosi.types import PathString, SupportsRead from mkosi.user import INVOKING_USER from mkosi.util import ( @@ -48,8 +48,6 @@ ) from mkosi.versioncomp import GenericVersion -__version__ = "25~devel" - ConfigParseCallback = 
Callable[[Optional[str], Optional[Any]], Any] ConfigMatchCallback = Callable[[str, Any], bool] ConfigDefaultCallback = Callable[[argparse.Namespace], Any] @@ -131,8 +129,8 @@ class ConfigTree: source: Path target: Optional[Path] - def with_prefix(self, prefix: Path = Path("/")) -> tuple[Path, Path]: - return (self.source, prefix / os.fspath(self.target).lstrip("/") if self.target else prefix) + def with_prefix(self, prefix: PathString = "/") -> tuple[Path, Path]: + return (self.source, Path(prefix) / os.fspath(self.target).lstrip("/") if self.target else Path(prefix)) def __str__(self) -> str: return f"{self.source}:{self.target}" if self.target else f"{self.source}" @@ -512,8 +510,6 @@ def parse_path(value: str, path = Path(value) if expanduser: - if path.is_relative_to("~") and not INVOKING_USER.is_running_user(): - path = INVOKING_USER.home() / path.relative_to("~") path = path.expanduser() if required and not path.exists(): @@ -1542,7 +1538,6 @@ class Config: ephemeral: bool credentials: dict[str, str] kernel_command_line_extra: list[str] - acl: bool tools_tree: Optional[Path] tools_tree_distribution: Optional[Distribution] tools_tree_release: Optional[str] @@ -1767,16 +1762,15 @@ def sandbox( relaxed: bool = False, tools: bool = True, scripts: Optional[Path] = None, - mounts: Sequence[Mount] = (), + usroverlaydirs: Sequence[PathString] = (), options: Sequence[PathString] = (), setup: Sequence[PathString] = (), - extra: Sequence[PathString] = (), ) -> AbstractContextManager[list[PathString]]: - mounts = [ - *([Mount(p, "/proxy.cacert", ro=True)] if (p := self.proxy_peer_certificate) else []), - *([Mount(p, "/proxy.clientcert", ro=True)] if (p := self.proxy_client_certificate) else []), - *([Mount(p, "/proxy.clientkey", ro=True)] if (p := self.proxy_client_key) else []), - *mounts, + opt: list[PathString] = [ + *options, + *(["--ro-bind", str(p), "/proxy.cacert"] if (p := self.proxy_peer_certificate) else []), + *(["--ro-bind", str(p), "/proxy.clientcert"] if (p 
:= self.proxy_client_certificate) else []), + *(["--ro-bind", str(p), "/proxy.clientkey"] if (p := self.proxy_client_key) else []), ] if ( @@ -1785,7 +1779,7 @@ def sandbox( any(path.is_relative_to(d) for d in self.extra_search_paths) ): tools = False - mounts += [Mount(d, d, ro=True) for d in self.extra_search_paths if not relaxed] + opt += flatten(("--ro-bind", d, d) for d in self.extra_search_paths if not relaxed) return sandbox_cmd( network=network, @@ -1794,10 +1788,9 @@ def sandbox( relaxed=relaxed, scripts=scripts, tools=self.tools() if tools else Path("/"), - mounts=mounts, - options=options, + usroverlaydirs=usroverlaydirs, + options=opt, setup=setup, - extra=extra, ) @@ -2892,15 +2885,6 @@ def parse_ini(path: Path, only_sections: Collection[str] = ()) -> Iterator[tuple parse=config_make_list_parser(delimiter=" "), help="Append extra entries to the kernel command line when booting the image", ), - ConfigSetting( - dest="acl", - metavar="BOOL", - nargs="?", - section="Host", - parse=config_parse_boolean, - help="Set ACLs on generated directories to permit the user running mkosi to remove them", - scope=SettingScope.universal, - ), ConfigSetting( dest="tools_tree", metavar="PATH", @@ -4273,7 +4257,6 @@ def bold(s: Any) -> str: Ephemeral: {config.ephemeral} Credentials: {line_join_list(config.credentials.keys())} Extra Kernel Command Line: {line_join_list(config.kernel_command_line_extra)} - Use ACLs: {yes_no(config.acl)} Tools Tree: {config.tools_tree} Tools Tree Distribution: {none_to_none(config.tools_tree_distribution)} Tools Tree Release: {none_to_none(config.tools_tree_release)} @@ -4470,7 +4453,7 @@ def want_selinux_relabel(config: Config, root: Path, fatal: bool = True) -> Opti return None policy = run(["sh", "-c", f". 
{selinux} && echo $SELINUXTYPE"], - sandbox=config.sandbox(binary="sh", mounts=[Mount(selinux, selinux, ro=True)]), + sandbox=config.sandbox(binary="sh", options=["--ro-bind", selinux, selinux]), stdout=subprocess.PIPE).stdout.strip() if not policy: if fatal and config.selinux_relabel == ConfigFeature.enabled: diff --git a/mkosi/context.py b/mkosi/context.py index 2a4102084..852fe756f 100644 --- a/mkosi/context.py +++ b/mkosi/context.py @@ -6,10 +6,9 @@ from typing import Optional from mkosi.config import Args, Config -from mkosi.sandbox import Mount +from mkosi.sandbox import umask from mkosi.tree import make_tree from mkosi.types import PathString -from mkosi.util import umask class Context: @@ -83,38 +82,21 @@ def sandbox( devices: bool = False, vartmp: bool = False, scripts: Optional[Path] = None, - mounts: Sequence[Mount] = (), options: Sequence[PathString] = (), - extra: Sequence[PathString] = (), ) -> AbstractContextManager[list[PathString]]: - if (self.pkgmngr / "usr").exists(): - extra = [ - "sh", - "-c", - f"mount -t overlay -o lowerdir={self.pkgmngr / 'usr'}:/usr overlayfs /usr && exec $0 \"$@\"", - *extra, - ] - return self.config.sandbox( binary=binary, network=network, devices=devices, vartmp=vartmp, scripts=scripts, - mounts=[ - # This mount is writable so bubblewrap can create extra directories or symlinks inside of it as needed. - # This isn't a problem as the package manager directory is created by mkosi and thrown away when the - # build finishes. - Mount(self.pkgmngr / "etc", "/etc"), - Mount(self.pkgmngr / "var/log", "/var/log"), - *([Mount(p, p, ro=True)] if (p := self.pkgmngr / "usr").exists() else []), - *mounts, - ], + usroverlaydirs=[self.pkgmngr / "usr"] if (self.pkgmngr / "usr").exists() else [], options=[ - "--uid", "0", - "--gid", "0", - "--cap-add", "ALL", *options, + # This mount is writable so we can create extra directories or symlinks inside of it as needed. 
+ # This isn't a problem as the package manager directory is created by mkosi and thrown away when the + # build finishes. + "--bind", self.pkgmngr / "etc", "/etc", + "--bind", self.pkgmngr / "var/log", "/var/log", ], - extra=extra, ) diff --git a/mkosi/distributions/debian.py b/mkosi/distributions/debian.py index a2a3ddb2e..16eaebbb8 100644 --- a/mkosi/distributions/debian.py +++ b/mkosi/distributions/debian.py @@ -12,8 +12,8 @@ from mkosi.installer.apt import Apt, AptRepository from mkosi.log import die from mkosi.run import run -from mkosi.sandbox import Mount -from mkosi.util import listify, umask +from mkosi.sandbox import umask +from mkosi.util import listify class Installer(DistributionInstaller): @@ -297,6 +297,6 @@ def fixup_os_release(context: Context) -> None: "--divert", f"/{candidate}.dpkg", f"/{candidate}", - ], sandbox=context.sandbox(binary="dpkg-divert", mounts=[Mount(context.root, "/buildroot")])) + ], sandbox=context.sandbox(binary="dpkg-divert", options=["--bind", context.root, "/buildroot"])) newosrelease.rename(osrelease) diff --git a/mkosi/distributions/opensuse.py b/mkosi/distributions/opensuse.py index f089a97eb..0b4dc5ba6 100644 --- a/mkosi/distributions/opensuse.py +++ b/mkosi/distributions/opensuse.py @@ -15,7 +15,6 @@ from mkosi.log import die from mkosi.mounts import finalize_crypto_mounts from mkosi.run import run -from mkosi.sandbox import Mount from mkosi.util import listify, sort_packages @@ -109,9 +108,9 @@ def repositories(cls, context: Context) -> Iterable[RpmRepository]: ["rpm", "--root=/buildroot", "--import", *(key.removeprefix("file://") for key in gpgkeys)], sandbox=context.sandbox( binary="rpm", - mounts=[ - Mount(context.root, "/buildroot"), - *finalize_crypto_mounts(context.config) + options=[ + "--bind", context.root, "/buildroot", + *finalize_crypto_mounts(context.config), ], ) ) @@ -258,7 +257,7 @@ def fetch_gpgurls(context: Context, repourl: str) -> tuple[str, ...]: sandbox=context.sandbox( binary="curl", 
network=True, - mounts=[Mount(d, d), *finalize_crypto_mounts(context.config)], + options=["--bind", d, d, *finalize_crypto_mounts(context.config)], ), ) xml = (Path(d) / "repomd.xml").read_text() diff --git a/mkosi/initrd/__main__.py b/mkosi/initrd/__main__.py index 904083ba1..cc966bcd8 100644 --- a/mkosi/initrd/__main__.py +++ b/mkosi/initrd/__main__.py @@ -7,9 +7,10 @@ import tempfile from pathlib import Path -from mkosi.config import OutputFormat, __version__ +from mkosi.config import OutputFormat from mkosi.log import log_setup from mkosi.run import find_binary, run, uncaught_exception_handler +from mkosi.sandbox import __version__ from mkosi.types import PathString @@ -130,4 +131,3 @@ def main() -> None: if __name__ == "__main__": main() - diff --git a/mkosi/installer/__init__.py b/mkosi/installer/__init__.py index 511d8d6b6..da583fc72 100644 --- a/mkosi/installer/__init__.py +++ b/mkosi/installer/__init__.py @@ -1,15 +1,15 @@ # SPDX-License-Identifier: LGPL-2.1-or-later +from contextlib import AbstractContextManager from pathlib import Path from mkosi.config import Config, ConfigFeature, OutputFormat from mkosi.context import Context from mkosi.mounts import finalize_crypto_mounts -from mkosi.run import find_binary -from mkosi.sandbox import Mount +from mkosi.run import apivfs_options, apivfs_script_cmd, finalize_passwd_mounts, find_binary from mkosi.tree import copy_tree, rmtree from mkosi.types import PathString -from mkosi.util import startswith +from mkosi.util import flatten, startswith class PackageManager: @@ -58,27 +58,27 @@ def finalize_environment(cls, context: Context) -> dict[str, str]: "hostonly_l": "no", } - return env + return context.config.environment | env @classmethod def env_cmd(cls, context: Context) -> list[PathString]: return ["env", *([f"{k}={v}" for k, v in cls.finalize_environment(context).items()])] @classmethod - def mounts(cls, context: Context) -> list[Mount]: + def mounts(cls, context: Context) -> list[PathString]: mounts = [ 
*finalize_crypto_mounts(context.config), - Mount(context.repository, "/repository"), + "--bind", context.repository, "/repository", ] if context.config.local_mirror and (mirror := startswith(context.config.local_mirror, "file://")): - mounts += [Mount(mirror, mirror, ro=True)] + mounts += ["--ro-bind", mirror, mirror] subdir = context.config.distribution.package_manager(context.config).subdir(context.config) for d in ("cache", "lib"): src = context.package_cache_dir / d / subdir - mounts += [Mount(src, Path("/var") / d / subdir)] + mounts += ["--bind", src, Path("/var") / d / subdir] # If we're not operating on the configured package cache directory, we're operating on a snapshot of the # repository metadata in the image root directory. To make sure any downloaded packages are still cached in @@ -86,17 +86,49 @@ def mounts(cls, context: Context) -> list[Mount]: # configured package cache directory. if d == "cache" and context.package_cache_dir != context.config.package_cache_dir_or_default(): caches = context.config.distribution.package_manager(context.config).cache_subdirs(src) - mounts += [ - Mount( + mounts += flatten( + ( + "--bind", context.config.package_cache_dir_or_default() / d / subdir / p.relative_to(src), Path("/var") / d / subdir / p.relative_to(src), ) for p in caches if (context.config.package_cache_dir_or_default() / d / subdir / p.relative_to(src)).exists() - ] + ) return mounts + @classmethod + def options(cls, *, root: PathString, apivfs: bool = True) -> list[PathString]: + return [ + *(apivfs_options() if apivfs else []), + "--become-root", + "--suppress-chown", + # Make sure /etc/machine-id is not overwritten by any package manager post install scripts. + "--ro-bind-try", Path(root) / "etc/machine-id", "/buildroot/etc/machine-id", + # If we're already in the sandbox, we want to pick up the passwd files from /buildroot since the + # original root won't be available anymore. 
If we're not in the sandbox yet, we want to pick up the passwd + # files from the original root. + *finalize_passwd_mounts(root), + ] + + @classmethod + def apivfs_script_cmd(cls, context: Context) -> list[PathString]: + return apivfs_script_cmd(tools=bool(context.config.tools_tree), options=cls.options(root="/buildroot")) + + @classmethod + def sandbox(cls, context: Context, *, apivfs: bool) -> AbstractContextManager[list[PathString]]: + return context.sandbox( + binary=cls.executable(context.config), + network=True, + vartmp=True, + options=[ + "--bind", context.root, "/buildroot", + *cls.mounts(context), + *cls.options(root=context.root, apivfs=apivfs), + ], + ) + @classmethod def sync(cls, context: Context, force: bool) -> None: pass diff --git a/mkosi/installer/apt.py b/mkosi/installer/apt.py index 7bfdae7fc..9559a283a 100644 --- a/mkosi/installer/apt.py +++ b/mkosi/installer/apt.py @@ -11,9 +11,8 @@ from mkosi.installer import PackageManager from mkosi.log import die from mkosi.run import run -from mkosi.sandbox import Mount, apivfs_cmd +from mkosi.sandbox import umask from mkosi.types import _FILE, CompletedProcess, PathString -from mkosi.util import umask @dataclasses.dataclass(frozen=True) @@ -48,7 +47,7 @@ class Apt(PackageManager): @classmethod def executable(cls, config: Config) -> str: - return "apt" + return "apt-get" @classmethod def subdir(cls, config: Config) -> Path: @@ -68,9 +67,11 @@ def dpkg_cmd(cls, command: str) -> list[PathString]: @classmethod def scripts(cls, context: Context) -> dict[str, list[PathString]]: + cmd = cls.apivfs_script_cmd(context) + return { **{ - command: apivfs_cmd() + cls.env_cmd(context) + cls.cmd(context, command) for command in ( + command: cmd + cls.env_cmd(context) + cls.cmd(context, command) for command in ( "apt", "apt-cache", "apt-cdrom", @@ -83,7 +84,7 @@ def scripts(cls, context: Context) -> dict[str, list[PathString]]: ) }, **{ - command: apivfs_cmd() + cls.dpkg_cmd(command) for command in( + command: cmd + 
cls.dpkg_cmd(command) for command in( "dpkg", "dpkg-query", ) @@ -150,7 +151,7 @@ def finalize_environment(cls, context: Context) -> dict[str, str]: return super().finalize_environment(context) | env @classmethod - def cmd(cls, context: Context, command: str) -> list[PathString]: + def cmd(cls, context: Context, command: str = "apt-get") -> list[PathString]: debarch = context.config.distribution.architecture(context.config.architecture) cmdline: list[PathString] = [ @@ -211,17 +212,9 @@ def invoke( stdout: _FILE = None, ) -> CompletedProcess: return run( - cls.cmd(context, "apt-get") + [operation, *arguments], - sandbox=( - context.sandbox( - binary="apt-get", - network=True, - vartmp=True, - mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context)], - extra=apivfs_cmd() if apivfs else [] - ) - ), - env=context.config.environment | cls.finalize_environment(context), + cls.cmd(context) + [operation, *arguments], + sandbox=cls.sandbox(context, apivfs=apivfs), + env=cls.finalize_environment(context), stdout=stdout, ) @@ -256,8 +249,7 @@ def createrepo(cls, context: Context) -> None: ], sandbox=context.sandbox( binary="reprepro", - mounts=[Mount(context.repository, context.repository)], - options=["--chdir", context.repository], + options=["--bind", context.repository, context.repository, "--chdir", context.repository], ), ) diff --git a/mkosi/installer/dnf.py b/mkosi/installer/dnf.py index 10e230f3a..9a137873a 100644 --- a/mkosi/installer/dnf.py +++ b/mkosi/installer/dnf.py @@ -9,7 +9,6 @@ from mkosi.installer.rpm import RpmRepository, rpm_cmd from mkosi.log import ARG_DEBUG from mkosi.run import run -from mkosi.sandbox import Mount, apivfs_cmd from mkosi.types import _FILE, CompletedProcess, PathString @@ -35,8 +34,8 @@ def cache_subdirs(cls, cache: Path) -> list[Path]: @classmethod def scripts(cls, context: Context) -> dict[str, list[PathString]]: return { - "dnf": apivfs_cmd() + cls.env_cmd(context) + cls.cmd(context), - "rpm": apivfs_cmd() + rpm_cmd(), + 
"dnf": cls.apivfs_script_cmd(context) + cls.env_cmd(context) + cls.cmd(context), + "rpm": cls.apivfs_script_cmd(context) + rpm_cmd(), "mkosi-install" : ["dnf", "install"], "mkosi-upgrade" : ["dnf", "upgrade"], "mkosi-remove" : ["dnf", "remove"], @@ -194,16 +193,8 @@ def invoke( try: return run( cls.cmd(context, cached_metadata=cached_metadata) + [operation, *arguments], - sandbox=( - context.sandbox( - binary=cls.executable(context.config), - network=True, - vartmp=True, - mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context)], - extra=apivfs_cmd() if apivfs else [], - ) - ), - env=context.config.environment | cls.finalize_environment(context), + sandbox=cls.sandbox(context, apivfs=apivfs), + env=cls.finalize_environment(context), stdout=stdout, ) finally: @@ -226,7 +217,7 @@ def sync(cls, context: Context, force: bool, arguments: Sequence[str] = ()) -> N @classmethod def createrepo(cls, context: Context) -> None: run(["createrepo_c", context.repository], - sandbox=context.sandbox(binary="createrepo_c", mounts=[Mount(context.repository, context.repository)])) + sandbox=context.sandbox(binary="createrepo_c", options=["--bind", context.repository, context.repository])) (context.pkgmngr / "etc/yum.repos.d/mkosi-local.repo").write_text( textwrap.dedent( diff --git a/mkosi/installer/pacman.py b/mkosi/installer/pacman.py index ea0c4abf4..308151cae 100644 --- a/mkosi/installer/pacman.py +++ b/mkosi/installer/pacman.py @@ -10,9 +10,8 @@ from mkosi.context import Context from mkosi.installer import PackageManager from mkosi.run import run -from mkosi.sandbox import Mount, apivfs_cmd +from mkosi.sandbox import umask from mkosi.types import _FILE, CompletedProcess, PathString -from mkosi.util import umask from mkosi.versioncomp import GenericVersion @@ -42,7 +41,7 @@ def state_subdirs(cls, state: Path) -> list[Path]: @classmethod def scripts(cls, context: Context) -> dict[str, list[PathString]]: return { - "pacman": apivfs_cmd() + cls.env_cmd(context) + 
cls.cmd(context), + "pacman": cls.apivfs_script_cmd(context) + cls.env_cmd(context) + cls.cmd(context), "mkosi-install" : ["pacman", "--sync", "--needed"], "mkosi-upgrade" : ["pacman", "--sync", "--sysupgrade", "--needed"], "mkosi-remove" : ["pacman", "--remove", "--recursive", "--nosave"], @@ -50,19 +49,19 @@ def scripts(cls, context: Context) -> dict[str, list[PathString]]: } @classmethod - def mounts(cls, context: Context) -> list[Mount]: + def mounts(cls, context: Context) -> list[PathString]: mounts = [ *super().mounts(context), # pacman writes downloaded packages to the first writable cache directory. We don't want it to write to our # local repository directory so we expose it as a read-only directory to pacman. - Mount(context.repository, "/var/cache/pacman/mkosi", ro=True), + "--ro-bind", context.repository, "/var/cache/pacman/mkosi", ] if (context.root / "var/lib/pacman/local").exists(): # pacman reuses the same directory for the sync databases and the local database containing the list of # installed packages. The former should go in the cache directory, the latter should go in the image, so we # bind mount the local directory from the image to make sure that happens. 
- mounts += [Mount(context.root / "var/lib/pacman/local", "/var/lib/pacman/local")] + mounts += ["--bind", context.root / "var/lib/pacman/local", "/var/lib/pacman/local"] return mounts @@ -166,16 +165,8 @@ def invoke( ) -> CompletedProcess: return run( cls.cmd(context) + [operation, *arguments], - sandbox=( - context.sandbox( - binary="pacman", - network=True, - vartmp=True, - mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context)], - extra=apivfs_cmd() if apivfs else [], - ) - ), - env=context.config.environment | cls.finalize_environment(context), + sandbox=cls.sandbox(context, apivfs=apivfs), + env=cls.finalize_environment(context), stdout=stdout, ) @@ -192,7 +183,7 @@ def createrepo(cls, context: Context) -> None: context.repository / "mkosi.db.tar", *sorted(context.repository.glob("*.pkg.tar*"), key=lambda p: GenericVersion(Path(p).name)) ], - sandbox=context.sandbox(binary="repo-add", mounts=[Mount(context.repository, context.repository)]), + sandbox=context.sandbox(binary="repo-add", options=["--bind", context.repository, context.repository]), ) (context.pkgmngr / "etc/mkosi-local.conf").write_text( diff --git a/mkosi/installer/zypper.py b/mkosi/installer/zypper.py index 90ef6ad34..1ef968927 100644 --- a/mkosi/installer/zypper.py +++ b/mkosi/installer/zypper.py @@ -9,7 +9,6 @@ from mkosi.installer import PackageManager from mkosi.installer.rpm import RpmRepository, rpm_cmd from mkosi.run import run -from mkosi.sandbox import Mount, apivfs_cmd from mkosi.types import _FILE, CompletedProcess, PathString @@ -36,8 +35,8 @@ def scripts(cls, context: Context) -> dict[str, list[PathString]]: ] return { - "zypper": apivfs_cmd() + cls.env_cmd(context) + cls.cmd(context), - "rpm" : apivfs_cmd() + rpm_cmd(), + "zypper": cls.apivfs_script_cmd(context) + cls.env_cmd(context) + cls.cmd(context), + "rpm" : cls.apivfs_script_cmd(context) + rpm_cmd(), "mkosi-install" : install, "mkosi-upgrade" : ["zypper", "update"], "mkosi-remove" : ["zypper", "remove", 
"--clean-deps"], @@ -128,16 +127,8 @@ def invoke( ) -> CompletedProcess: return run( cls.cmd(context) + [operation, *arguments], - sandbox=( - context.sandbox( - binary="zypper", - network=True, - vartmp=True, - mounts=[Mount(context.root, "/buildroot"), *cls.mounts(context)], - extra=apivfs_cmd() if apivfs else [], - ) - ), - env=context.config.environment | cls.finalize_environment(context), + sandbox=cls.sandbox(context, apivfs=apivfs), + env=cls.finalize_environment(context), stdout=stdout, ) @@ -148,7 +139,7 @@ def sync(cls, context: Context, force: bool, arguments: Sequence[str] = ()) -> N @classmethod def createrepo(cls, context: Context) -> None: run(["createrepo_c", context.repository], - sandbox=context.sandbox(binary="createrepo_c", mounts=[Mount(context.repository, context.repository)])) + sandbox=context.sandbox(binary="createrepo_c", options=["--bind", context.repository, context.repository])) (context.pkgmngr / "etc/zypp/repos.d/mkosi-local.repo").write_text( textwrap.dedent( diff --git a/mkosi/kmod.py b/mkosi/kmod.py index e998cb3b4..a128c2c2d 100644 --- a/mkosi/kmod.py +++ b/mkosi/kmod.py @@ -9,8 +9,7 @@ from pathlib import Path from mkosi.log import complete_step, log_step -from mkosi.run import run -from mkosi.sandbox import Mount, SandboxProtocol, chroot_cmd, nosandbox +from mkosi.run import chroot_cmd, run from mkosi.util import chdir, parents_below @@ -57,8 +56,6 @@ def resolve_module_dependencies( root: Path, kver: str, modules: Iterable[str], - *, - sandbox: SandboxProtocol = nosandbox, ) -> tuple[set[Path], set[Path]]: """ Returns a tuple of lists containing the paths to the module and firmware dependencies of the given list @@ -76,19 +73,17 @@ def resolve_module_dependencies( log_step("Running modinfo to fetch kernel module dependencies") - # We could run modinfo once for each module but that's slow. Luckily we can pass multiple modules to - # modinfo and it'll process them all in a single go. 
We get the modinfo for all modules to build two maps - # that map the path of the module to its module dependencies and its firmware dependencies respectively. - # Because there's more kernel modules than the max number of accepted CLI arguments for bwrap, we split the modules - # list up into chunks. + # We could run modinfo once for each module but that's slow. Luckily we can pass multiple modules to modinfo and + # it'll process them all in a single go. We get the modinfo for all modules to build two maps that map the path of + # the module to its module dependencies and its firmware dependencies respectively. Because there's more kernel + # modules than the max number of accepted CLI arguments, we split the modules list up into chunks. info = "" for i in range(0, len(nametofile.keys()), 8500): chunk = list(nametofile.keys())[i:i+8500] info += run( ["modinfo", "--set-version", kver, "--null", *chunk], stdout=subprocess.PIPE, - sandbox=sandbox(binary="modinfo", mounts=[Mount(root, "/buildroot", ro=True)], extra=chroot_cmd()), - cwd=root, + sandbox=chroot_cmd(root=root), ).stdout.strip() log_step("Calculating required kernel modules and firmware") @@ -159,7 +154,6 @@ def gen_required_kernel_modules( *, include: Iterable[str], exclude: Iterable[str], - sandbox: SandboxProtocol = nosandbox, ) -> Iterator[Path]: modulesd = Path("usr/lib/modules") / kver @@ -169,7 +163,7 @@ def gen_required_kernel_modules( if exclude or (root / "usr/lib/firmware").glob("*"): modules = filter_kernel_modules(root, kver, include=include, exclude=exclude) names = [module_path_to_name(m) for m in modules] - mods, firmware = resolve_module_dependencies(root, kver, names, sandbox=sandbox) + mods, firmware = resolve_module_dependencies(root, kver, names) else: logging.debug("No modules excluded and no firmware installed, using kernel modules generation fast path") with chdir(root): @@ -199,7 +193,6 @@ def process_kernel_modules( *, include: Iterable[str], exclude: Iterable[str], - sandbox: 
SandboxProtocol = nosandbox, ) -> None: if not exclude: return @@ -208,7 +201,8 @@ def process_kernel_modules( firmwared = Path("usr/lib/firmware") with complete_step("Applying kernel module filters"): - required = set(gen_required_kernel_modules(root, kver, include=include, exclude=exclude, sandbox=sandbox)) + required = set( + gen_required_kernel_modules(root, kver, include=include, exclude=exclude)) with chdir(root): modules = sorted(modulesd.rglob("*.ko*"), reverse=True) diff --git a/mkosi/manifest.py b/mkosi/manifest.py index c08acf6e3..a360b56da 100644 --- a/mkosi/manifest.py +++ b/mkosi/manifest.py @@ -14,7 +14,6 @@ from mkosi.installer.apt import Apt from mkosi.log import complete_step from mkosi.run import run -from mkosi.sandbox import Mount @dataclasses.dataclass @@ -111,7 +110,7 @@ def record_rpm_packages(self) -> None: "--queryformat", r"%{NEVRA}\t%{SOURCERPM}\t%{NAME}\t%{ARCH}\t%{LONGSIZE}\t%{INSTALLTIME}\n", ], stdout=subprocess.PIPE, - sandbox=self.context.sandbox(binary="rpm", mounts=[Mount(self.context.root, "/buildroot")]), + sandbox=self.context.sandbox(binary="rpm", options=["--ro-bind", self.context.root, "/buildroot"]), ) packages = sorted(c.stdout.splitlines()) @@ -159,7 +158,7 @@ def record_rpm_packages(self) -> None: stderr=subprocess.DEVNULL, sandbox=self.context.sandbox( binary="rpm", - mounts=[Mount(self.context.root, "/buildroot", ro=True)] + options=["--ro-bind", self.context.root, "/buildroot"], ), ) changelog = c.stdout.strip() @@ -180,7 +179,7 @@ def record_deb_packages(self) -> None: stdout=subprocess.PIPE, sandbox=self.context.sandbox( binary="dpkg-query", - mounts=[Mount(self.context.root, "/buildroot", ro=True)], + options=["--ro-bind", self.context.root, "/buildroot"], ), ) diff --git a/mkosi/mounts.py b/mkosi/mounts.py index ed7cec49f..b55d17703 100644 --- a/mkosi/mounts.py +++ b/mkosi/mounts.py @@ -2,7 +2,6 @@ import contextlib import os -import platform import stat import tempfile from collections.abc import Iterator, 
Sequence @@ -10,11 +9,9 @@ from typing import Optional from mkosi.config import Config -from mkosi.run import run -from mkosi.sandbox import Mount +from mkosi.sandbox import OverlayOperation from mkosi.types import PathString -from mkosi.util import umask -from mkosi.versioncomp import GenericVersion +from mkosi.util import flatten def stat_is_whiteout(st: os.stat_result) -> bool: @@ -33,109 +30,63 @@ def delete_whiteout_files(path: Path) -> None: entry.unlink() -@contextlib.contextmanager -def mount( - what: PathString, - where: Path, - operation: Optional[str] = None, - options: Sequence[str] = (), - type: Optional[str] = None, - read_only: bool = False, - lazy: bool = False, - umount: bool = True, -) -> Iterator[Path]: - if not where.exists(): - with umask(~0o755): - where.mkdir(parents=True) - - if read_only: - options = ["ro", *options] - - cmd: list[PathString] = ["mount", "--no-mtab"] - - if operation: - cmd += [operation] - - cmd += [what, where] - - if type: - cmd += ["--types", type] - - if options: - cmd += ["--options", ",".join(options)] - - try: - run(cmd) - yield where - finally: - if umount: - run(["umount", "--no-mtab", *(["--lazy"] if lazy else []), where]) - - @contextlib.contextmanager def mount_overlay( lowerdirs: Sequence[Path], + dst: Path, + *, upperdir: Optional[Path] = None, - where: Optional[Path] = None, - lazy: bool = False, ) -> Iterator[Path]: with contextlib.ExitStack() as stack: if upperdir is None: upperdir = Path(stack.enter_context(tempfile.TemporaryDirectory(prefix="volatile-overlay"))) st = lowerdirs[-1].stat() os.chmod(upperdir, st.st_mode) - os.chown(upperdir, st.st_uid, st.st_gid) workdir = Path( stack.enter_context(tempfile.TemporaryDirectory(dir=upperdir.parent, prefix=f"{upperdir.name}-workdir")) ) - if where is None: - where = Path( - stack.enter_context( - tempfile.TemporaryDirectory(dir=upperdir.parent, prefix=f"{upperdir.name}-mountpoint") - ) - ) - - options = [ - f"lowerdir={':'.join(os.fspath(p) for p in 
reversed(lowerdirs))}", - f"upperdir={upperdir}", - f"workdir={workdir}", - # Disable the inodes index and metacopy (only copy metadata upwards if possible) - # options. If these are enabled (e.g., if the kernel enables them by default), - # the mount will fail if the upper directory has been earlier used with a different - # lower directory, such as with a build overlay that was generated on top of a - # different temporary root. - # See https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html#sharing-and-copying-layers - # and https://github.com/systemd/mkosi/issues/1841. - "index=off", - "metacopy=off" - ] - - # userxattr is only supported on overlayfs since kernel 5.11 - if GenericVersion(platform.release()) >= GenericVersion("5.11"): - options.append("userxattr") - try: - with mount("overlay", where, options=options, type="overlay", lazy=lazy): - yield where + with OverlayOperation(tuple(str(p) for p in lowerdirs), str(upperdir), str(workdir), str(dst)): + yield dst finally: delete_whiteout_files(upperdir) @contextlib.contextmanager -def finalize_source_mounts(config: Config, *, ephemeral: bool) -> Iterator[list[Mount]]: +def finalize_source_mounts(config: Config, *, ephemeral: bool) -> Iterator[list[PathString]]: with contextlib.ExitStack() as stack: - sources = ( - (stack.enter_context(mount_overlay([source])) if ephemeral else source, target) - for source, target - in {t.with_prefix(Path("/work/src")) for t in config.build_sources} - ) + options: list[PathString] = [] + + for t in config.build_sources: + src, dst = t.with_prefix("/work/src") + + if ephemeral: + options = [] - yield [Mount(src, target) for src, target in sorted(sources, key=lambda s: s[1])] + upperdir = Path(stack.enter_context(tempfile.TemporaryDirectory(prefix="volatile-overlay"))) + os.chmod(upperdir, src.stat().st_mode) + workdir = Path( + stack.enter_context( + tempfile.TemporaryDirectory(dir=upperdir.parent, prefix=f"{upperdir.name}-workdir") + ) + ) + + options += [ + 
"--overlay-lowerdir", src, + "--overlay-upperdir", upperdir, + "--overlay-workdir", workdir, + "--overlay", dst, + ] + else: + options += ["--bind", src, dst] + + yield options -def finalize_crypto_mounts(config: Config) -> list[Mount]: + +def finalize_crypto_mounts(config: Config) -> list[PathString]: root = config.tools() if config.tools_tree_certificates else Path("/") mounts = [ @@ -152,8 +103,8 @@ def finalize_crypto_mounts(config: Config) -> list[Mount]: if (root / subdir).exists() ] - return [ - Mount(src, target, ro=True) + return flatten( + ("--ro-bind", src, target) for src, target in sorted(set(mounts), key=lambda s: s[1]) - ] + ) diff --git a/mkosi/partition.py b/mkosi/partition.py index 9a598b66e..4faa12c10 100644 --- a/mkosi/partition.py +++ b/mkosi/partition.py @@ -6,8 +6,7 @@ from typing import Any, Optional from mkosi.log import die -from mkosi.run import run -from mkosi.sandbox import Mount, SandboxProtocol, nosandbox +from mkosi.run import SandboxProtocol, nosandbox, run @dataclasses.dataclass(frozen=True) @@ -37,7 +36,7 @@ def find_partitions(image: Path, *, sandbox: SandboxProtocol = nosandbox) -> lis ["systemd-repart", "--json=short", image], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, - sandbox=sandbox(binary="systemd-repart", mounts=[Mount(image, image, ro=True)]), + sandbox=sandbox(binary="systemd-repart", options=["--ro-bind", image, image]), ).stdout ) return [Partition.from_dict(d) for d in output] diff --git a/mkosi/qemu.py b/mkosi/qemu.py index 85c294261..b98bec651 100644 --- a/mkosi/qemu.py +++ b/mkosi/qemu.py @@ -14,7 +14,6 @@ import random import resource import shutil -import signal import socket import struct import subprocess @@ -41,13 +40,11 @@ yes_no, ) from mkosi.log import ARG_DEBUG, die -from mkosi.mounts import finalize_source_mounts from mkosi.partition import finalize_root, find_partitions -from mkosi.run import SD_LISTEN_FDS_START, AsyncioThread, find_binary, fork_and_wait, kill, run, spawn -from mkosi.sandbox 
import Mount +from mkosi.run import SD_LISTEN_FDS_START, AsyncioThread, find_binary, fork_and_wait, run, spawn from mkosi.tree import copy_tree, rmtree from mkosi.types import PathString -from mkosi.user import INVOKING_USER, become_root, become_root_cmd +from mkosi.user import INVOKING_USER, become_root_in_subuid_range, become_root_in_subuid_range_cmd from mkosi.util import StrEnum, flock, flock_or_die, groupby, round_up, try_or from mkosi.versioncomp import GenericVersion @@ -160,7 +157,7 @@ def identify(cls, config: Config, path: Path) -> "KernelType": type = run( ["bootctl", "kernel-identify", path], stdout=subprocess.PIPE, - sandbox=config.sandbox(binary="bootctl", mounts=[Mount(path, path, ro=True)]), + sandbox=config.sandbox(binary="bootctl", options=["--ro-bind", path, path]), ).stdout.strip() try: @@ -253,13 +250,12 @@ def start_swtpm(config: Config) -> Iterator[Path]: ["swtpm_setup", "--tpm-state", state, "--tpm2", "--pcr-banks", "sha256", "--config", "/dev/null"], sandbox=config.sandbox( binary="swtpm_setup", - mounts=[Mount(state, state)], - ), - scope=scope_cmd( - name=f"mkosi-swtpm-{config.machine_or_name()}", - description=f"swtpm for {config.machine_or_name()}", + options=["--bind", state, state], + setup=scope_cmd( + name=f"mkosi-swtpm-{config.machine_or_name()}", + description=f"swtpm for {config.machine_or_name()}", + ), ), - env=scope_env(), stdout=None if ARG_DEBUG.get() else subprocess.DEVNULL, ) @@ -277,10 +273,10 @@ def start_swtpm(config: Config) -> Iterator[Path]: with spawn( cmdline, pass_fds=(sock.fileno(),), - sandbox=config.sandbox(binary="swtpm", mounts=[Mount(state, state)]), - ) as (proc, innerpid): + sandbox=config.sandbox(binary="swtpm", options=["--bind", state, state]), + ) as proc: yield path - kill(proc, innerpid, signal.SIGTERM) + proc.terminate() def find_virtiofsd(*, root: Path = Path("/"), extra: Sequence[Path] = ()) -> Optional[Path]: @@ -313,14 +309,13 @@ def start_virtiofsd( config: Config, directory: PathString, *, + 
uidmap: bool = True, name: Optional[str] = None, selinux: bool = False, ) -> Iterator[Path]: if name is None: name = systemd_escape(config, directory, path=True) - uidmap = Path(directory).stat().st_uid == INVOKING_USER.uid - virtiofsd = find_virtiofsd(root=config.tools(), extra=config.extra_search_paths) if virtiofsd is None: die("virtiofsd must be installed to boot directory images or use RuntimeTrees= with mkosi qemu") @@ -338,14 +333,24 @@ def start_virtiofsd( if selinux: cmdline += ["--security-label"] + st = None + if uidmap: + st = Path(directory).stat() + + # If we're already running as the same user that we'll be running virtiofsd as, don't bother doing any explicit + # user switching or chown()'ing as it's not needed in this case. + if st.st_uid == os.getuid() and st.st_gid == os.getgid(): + st = None + # We create the socket ourselves and pass the fd to virtiofsd to avoid race conditions where we start qemu # before virtiofsd has had the chance to create the socket (or where we try to chown it first). with ( tempfile.TemporaryDirectory(prefix="mkosi-virtiofsd-") as context, socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock, ): - # Make sure virtiofsd can access the socket in this directory. - os.chown(context, INVOKING_USER.uid, INVOKING_USER.gid) + if st: + # Make sure virtiofsd can access the socket in this directory. + os.chown(context, st.st_uid, st.st_gid) # Make sure we can use the socket name as a unique identifier for the fs as well but make sure it's not too # long as virtiofs tag names are limited to 36 bytes. @@ -353,46 +358,44 @@ def start_virtiofsd( sock.bind(os.fspath(path)) sock.listen() - # Make sure virtiofsd can connect to the socket. - os.chown(path, INVOKING_USER.uid, INVOKING_USER.gid) + if st: + # Make sure virtiofsd can connect to the socket. 
+ os.chown(path, st.st_uid, st.st_gid) cmdline += ["--fd", str(SD_LISTEN_FDS_START)] + # We want RuntimeBuildSources= and RuntimeTrees= to do the right thing even when running mkosi qemu as root + # without the source directories necessarily being owned by root. We achieve this by running virtiofsd as the + # owner of the source directory and then mapping that uid to root. + name = f"mkosi-virtiofsd-{name}" description = f"virtiofsd for {directory}" - uid = gid = None - runas = [] scope = [] - if uidmap: - uid = INVOKING_USER.uid if os.getuid() != INVOKING_USER.uid else None - gid = INVOKING_USER.gid if os.getgid() != INVOKING_USER.gid else None - scope = scope_cmd(name=name, description=description, user=uid, group=gid) + if st: + scope = scope_cmd(name=name, description=description, user=st.st_uid, group=st.st_gid) elif not uidmap and (os.getuid() == 0 or unshare_version() >= "2.38"): scope = scope_cmd(name=name, description=description) - if scope: - runas = become_root_cmd() with spawn( cmdline, pass_fds=(sock.fileno(),), - # When not invoked as root, bubblewrap will automatically map the current uid/gid to the requested uid/gid - # in the user namespace it spawns, so by specifying --uid 0 --gid 0 we'll get a userns with the current - # uid/gid mapped to root in the userns. --cap-add=all is required to make virtiofsd work. Since it drops - # capabilities itself, we don't bother figuring out the exact set of capabilities it needs. - user=uid if not scope else None, - group=gid if not scope else None, - preexec_fn=become_root if not scope and not uidmap else None, - env=scope_env() if scope else {}, + user=st.st_uid if st and not scope else None, + group=st.st_gid if st and not scope else None, + # If we're booting from virtiofs and unshare is too old, we don't set up a scope so we can use our own + # function to become root in the subuid range. + # TODO: Drop this as soon as we drop CentOS Stream 9 support and can rely on newer unshare features. 
+ preexec_fn=become_root_in_subuid_range if not scope and not uidmap else None, sandbox=config.sandbox( binary=virtiofsd, - mounts=[Mount(directory, directory)], - options=["--uid", "0", "--gid", "0", "--cap-add", "all"], - setup=runas, + options=[ + "--bind", directory, directory, + *(["--become-root"] if uidmap else []), + ], + setup=scope + become_root_in_subuid_range_cmd() if scope and not uidmap else [], ), - scope=scope, - ) as (proc, innerpid): + ) as proc: yield path - kill(proc, innerpid, signal.SIGTERM) + proc.terminate() @contextlib.contextmanager @@ -481,8 +484,8 @@ def start_journal_remote(config: Config, sockfd: int) -> Iterator[None]: f.flush() - user = config.forward_journal.parent.stat().st_uid if INVOKING_USER.invoked_as_root else None - group = config.forward_journal.parent.stat().st_gid if INVOKING_USER.invoked_as_root else None + user = d.stat().st_uid if os.getuid() == 0 else None + group = d.stat().st_gid if os.getuid() == 0 else None scope = scope_cmd( name=f"mkosi-journal-remote-{config.machine_or_name()}", description=f"mkosi systemd-journal-remote for {config.machine_or_name()}", @@ -499,19 +502,18 @@ def start_journal_remote(config: Config, sockfd: int) -> Iterator[None]: pass_fds=(sockfd,), sandbox=config.sandbox( binary=bin, - mounts=[ - Mount(config.forward_journal.parent, config.forward_journal.parent), - Mount(f.name, "/etc/systemd/journal-remote.conf"), + options=[ + "--bind", config.forward_journal.parent, config.forward_journal.parent, + "--ro-bind", f.name, "/etc/systemd/journal-remote.conf", ], + setup=scope, ), user=user if not scope else None, group=group if not scope else None, - scope=scope, - env=scope_env(), foreground=False, - ) as (proc, innerpid): + ) as proc: yield - kill(proc, innerpid, signal.SIGTERM) + proc.terminate() @@ -527,7 +529,14 @@ def start_journal_remote_vsock(config: Config) -> Iterator[str]: @contextlib.contextmanager def copy_ephemeral(config: Config, src: Path) -> Iterator[Path]: - if not 
config.ephemeral or config.output_format in (OutputFormat.cpio, OutputFormat.uki): + if config.output_format in (OutputFormat.cpio, OutputFormat.uki): + yield src + return + + # If we're booting a directory image that was not built as root, we have to make an ephemeral copy so that we can + # ensure the files in the directory are either owned by the actual root user or a fake one in a subuid user + # namespace which we'll run virtiofsd as. + if not config.ephemeral and (config.output_format != OutputFormat.directory or src.stat().st_uid == 0): with flock_or_die(src): yield src @@ -542,11 +551,11 @@ def copy_ephemeral(config: Config, src: Path) -> Iterator[Path]: try: def copy() -> None: if config.output_format == OutputFormat.directory: - become_root() + become_root_in_subuid_range() elif config.output_format in (OutputFormat.disk, OutputFormat.esp): attr = run( ["lsattr", "-l", src], - sandbox=config.sandbox(binary="lsattr", mounts=[Mount(src, src, ro=True)]), + sandbox=config.sandbox(binary="lsattr", options=["--ro-bind", src, src]), stdout=subprocess.PIPE, ).stdout @@ -554,12 +563,13 @@ def copy() -> None: tmp.touch() run( ["chattr", "+C", tmp], - sandbox=config.sandbox(binary="chattr", mounts=[Mount(tmp, tmp)]), + sandbox=config.sandbox(binary="chattr", options=["--bind", tmp, tmp]), ) copy_tree( src, tmp, - preserve=config.output_format == OutputFormat.directory, + # Make sure the ownership is changed to the (fake) root user if the directory was not built as root. 
+ preserve=config.output_format == OutputFormat.directory and src.stat().st_uid == 0, use_subvolumes=config.use_subvolumes, sandbox=config.sandbox, ) @@ -570,7 +580,7 @@ def copy() -> None: finally: def rm() -> None: if config.output_format == OutputFormat.directory: - become_root() + become_root_in_subuid_range() rmtree(tmp, sandbox=config.sandbox) @@ -603,7 +613,7 @@ def generate_scratch_fs(config: Config) -> Iterator[Path]: run( [f"mkfs.{fs}", "-L", "scratch", *extra.split(), scratch.name], stdout=subprocess.DEVNULL, - sandbox=config.sandbox(binary= f"mkfs.{fs}", mounts=[Mount(scratch.name, scratch.name)]), + sandbox=config.sandbox(binary= f"mkfs.{fs}", options=["--bind", scratch.name, scratch.name]), ) yield Path(scratch.name) @@ -656,9 +666,9 @@ def finalize_firmware_variables( ], sandbox=config.sandbox( binary=qemu, - mounts=[ - Mount(ovmf_vars.name, ovmf_vars.name), - Mount(config.secure_boot_certificate, config.secure_boot_certificate, ro=True), + options=[ + "--bind", ovmf_vars.name, ovmf_vars.name, + "--ro-bind", config.secure_boot_certificate, config.secure_boot_certificate, ], ), ) @@ -689,7 +699,7 @@ def apply_runtime_size(config: Config, image: Path) -> None: "--offline=yes", image, ], - sandbox=config.sandbox(binary="systemd-repart", mounts=[Mount(image, image)]), + sandbox=config.sandbox(binary="systemd-repart", options=["--bind", image, image]), ) @@ -704,10 +714,6 @@ def finalize_drive(drive: QemuDrive) -> Iterator[Path]: def finalize_state(config: Config, cid: int) -> Iterator[None]: (INVOKING_USER.runtime_dir() / "machine").mkdir(parents=True, exist_ok=True) - if INVOKING_USER.is_regular_user(): - os.chown(INVOKING_USER.runtime_dir(), INVOKING_USER.uid, INVOKING_USER.gid) - os.chown(INVOKING_USER.runtime_dir() / "machine", INVOKING_USER.uid, INVOKING_USER.gid) - with flock(INVOKING_USER.runtime_dir() / "machine"): if (p := INVOKING_USER.runtime_dir() / "machine" / f"{config.machine_or_name()}.json").exists(): die(f"Another virtual machine named 
{config.machine_or_name()} is already running", @@ -724,10 +730,6 @@ def finalize_state(config: Config, cid: int) -> Iterator[None]: indent=4, ) ) - - if INVOKING_USER.is_regular_user(): - os.chown(p, INVOKING_USER.uid, INVOKING_USER.gid) - try: yield finally: @@ -735,36 +737,35 @@ def finalize_state(config: Config, cid: int) -> Iterator[None]: p.unlink(missing_ok=True) -def scope_env() -> dict[str, str]: +def scope_cmd( + name: str, + description: str, + user: Optional[int] = None, + group: Optional[int] = None, + properties: Sequence[str] = (), + environment: bool = True, +) -> list[str]: if not find_binary("systemd-run"): - return {} - elif os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" in os.environ and "XDG_RUNTIME_DIR" in os.environ: - return { + return [] + + if os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" in os.environ and "XDG_RUNTIME_DIR" in os.environ: + env = { "DBUS_SESSION_BUS_ADDRESS": os.environ["DBUS_SESSION_BUS_ADDRESS"], "XDG_RUNTIME_DIR": os.environ["XDG_RUNTIME_DIR"] } elif os.getuid() == 0: if "DBUS_SYSTEM_ADDRESS" in os.environ: - return {"DBUS_SYSTEM_ADDRESS": os.environ["DBUS_SYSTEM_ADDRESS"]} + env = {"DBUS_SYSTEM_ADDRESS": os.environ["DBUS_SYSTEM_ADDRESS"]} elif Path("/run/dbus/system_bus_socket").exists(): - return {"DBUS_SYSTEM_ADDRESS": "/run/dbus/system_bus_socket"} + env = {"DBUS_SYSTEM_ADDRESS": "/run/dbus/system_bus_socket"} else: - return {} + return [] else: - return {} - - -def scope_cmd( - name: str, - description: str, - user: Optional[int] = None, - group: Optional[int] = None, - properties: Sequence[str] = (), -) -> list[str]: - if not scope_env(): return [] return [ + "env", + *(f"{k}={v}" for k, v in env.items() if environment), "systemd-run", "--system" if os.getuid() == 0 else "--user", *(["--quiet"] if not ARG_DEBUG.get() else []), @@ -1014,7 +1015,7 @@ def run_qemu(args: Args, config: Config) -> None: sandbox=config.sandbox( binary="systemd-repart", vartmp=True, - mounts=[Mount(fname.parent, fname.parent), 
Mount(src, src, ro=True)], + options=["--bind", fname.parent, fname.parent, "--ro-bind", src, src], ), ) stack.callback(lambda: fname.unlink()) @@ -1055,6 +1056,7 @@ def run_qemu(args: Args, config: Config) -> None: config, fname, name=config.machine_or_name(), + uidmap=False, selinux=bool(want_selinux_relabel(config, fname, fatal=False))), ) cmdline += [ @@ -1086,10 +1088,10 @@ def add_virtiofs_mount( credentials["fstab.extra"] += f"{tag} {dst} virtiofs x-initrd.mount\n" if config.runtime_build_sources: - with finalize_source_mounts(config, ephemeral=False) as mounts: - for mount in mounts: - sock = stack.enter_context(start_virtiofsd(config, mount.src)) - add_virtiofs_mount(sock, mount.dst, cmdline, credentials, tag=Path(mount.src).name) + for t in config.build_sources: + src, dst = t.with_prefix("/work/src") + sock = stack.enter_context(start_virtiofsd(config, src)) + add_virtiofs_mount(sock, dst, cmdline, credentials, tag=src.name) if config.build_dir: sock = stack.enter_context(start_virtiofsd(config, config.build_dir)) @@ -1233,18 +1235,24 @@ def add_virtiofs_mount( env=os.environ | config.environment, log=False, foreground=True, - sandbox=config.sandbox(binary=qemu, network=True, devices=True, relaxed=True), - scope=scope_cmd( - name=name, - description=f"mkosi Virtual Machine {name}", - properties=config.unit_properties, + sandbox=config.sandbox( + binary=qemu, + network=True, + devices=True, + relaxed=True, + setup=scope_cmd( + name=name, + description=f"mkosi Virtual Machine {name}", + properties=config.unit_properties, + environment=False, + ), ), - ) as (proc, innerpid): + ) as proc: # We have to close these before we wait for qemu otherwise we'll deadlock as qemu will never exit. 
for fd in qemu_device_fds.values(): os.close(fd) - register_machine(config, innerpid, fname) + register_machine(config, proc.pid, fname) if proc.wait() == 0 and (status := int(notifications.get("EXIT_STATUS", 0))): raise subprocess.CalledProcessError(status, cmdline) diff --git a/mkosi/resources/mkosi.md b/mkosi/resources/mkosi.md index 500964010..b13b47921 100644 --- a/mkosi/resources/mkosi.md +++ b/mkosi/resources/mkosi.md @@ -1468,10 +1468,6 @@ boolean argument: either `1`, `yes`, or `true` to enable, or `0`, `no`, OEM string. This will only be picked up by systemd-boot/systemd-stub versions newer than or equal to v254. -`Acl=`, `--acl=` -: If specified, ACLs will be set on any generated root filesystem directories that - allow the user running mkosi to remove them without needing privileges. - `ToolsTree=`, `--tools-tree=` : If specified, programs executed by mkosi to build and boot an image are looked up inside the given tree instead of in the host system. Use @@ -2194,10 +2190,7 @@ Scripts executed by mkosi receive the following environment variables: for more information. * `$MKOSI_UID` and `$MKOSI_GID` are the respectively the uid, gid of the - user that invoked mkosi, potentially translated to a uid in the user - namespace that mkosi is running in. These can be used in combination - with `setpriv` to run commands as the user that invoked mkosi (e.g. - `setpriv --reuid=$MKOSI_UID --regid=$MKOSI_GID --clear-groups `) + user that invoked mkosi. * `$MKOSI_CONFIG` is a file containing a json summary of the settings of the current image. This file can be parsed inside scripts to gain access to all @@ -2275,22 +2268,6 @@ available via `$PATH` to simplify common usecases. `mkosi-remove` will invoke the corresponding operation of the package manager being used to built the image. -* `mkosi-as-caller`: This script uses `setpriv` to switch from - the user `root` in the user namespace used for various build steps - back to the original user that called mkosi. 
This is useful when - we want to invoke build steps which will write to `$BUILDDIR` and - we want to have the files owned by the calling user. - - For example, a complete `mkosi.build` script might be the following: - - ```sh - set -ex - - mkosi-as-caller meson setup "$BUILDDIR/build" "$SRCDIR" - mkosi-as-caller meson compile -C "$BUILDDIR/build" - meson install -C "$BUILDDIR/build" --no-rebuild - ``` - * `git` is automatically invoked with `safe.directory=*` to avoid permissions errors when running as the root user in a user namespace. @@ -2533,7 +2510,6 @@ overridden): - `ProxyClientKey=` - `Incremental=` - `ExtraSearchPaths=` -- `Acl=` - `ToolsTree=` - `ToolsTreeCertificates=` @@ -2715,7 +2691,7 @@ When not using distribution packages make sure to install the necessary dependencies. For example, on *Fedora Linux* you need: ```bash -# dnf install bubblewrap btrfs-progs apt dosfstools mtools edk2-ovmf e2fsprogs squashfs-tools gnupg python3 tar xfsprogs xz zypper sbsigntools +# dnf install btrfs-progs apt dosfstools mtools edk2-ovmf e2fsprogs squashfs-tools gnupg python3 tar xfsprogs xz zypper sbsigntools ``` On Debian/Kali/Ubuntu it might be necessary to install the `ubuntu-keyring`, @@ -2754,6 +2730,33 @@ Note that the minimum required Python version is 3.9. `systemd-homed-firstboot.service` will prompt to create a regular user on first boot if there are no regular users. +- Why do I see failures to chown files when building images? + + When not running as root, your user is not able to change ownership of + files to arbitrary owners. Various distributions still ship files in their + packages that are not owned by the root user. When not running as root, mkosi + maps the current user to root when invoking package managers, which means that + changing ownership to root will work but changing ownership to any other user + or group will fail. 
+ + If this behavior causes applications running in your image to misbehave, you + can consider running `mkosi` as root, which avoids this problem. Alternatively, + if running `mkosi` as root is not desired, you can use + `unshare --map-auto --map-current-user --setuid 0 --setgid 0` to become root in + a user namespace with more than one user, assuming the UID/GID mappings in + `/etc/subuid` and `/etc/subgid` are configured correctly. Note that running mkosi + as root or with `unshare` means that all output files produced by mkosi will no + longer be owned by your current user. + + Note that for systemd services that need directories in `/var` owned by the service + user and group, an alternative to shipping these directories in packages or + creating them via systemd-tmpfiles is to use `StateDirectory=`, `CacheDirectory=` or + `LogsDirectory=` in the service file, which instructs systemd to create the directory + when it first starts the service. + + Alternatively, the `z` or `Z` directives for `systemd-tmpfiles` can be used to chown + various directories and files to their owning user when the system first boots up.
+ # REFERENCES * [Primary mkosi git repository on GitHub](https://github.com/systemd/mkosi/) * [mkosi — A Tool for Generating OS Images](https://0pointer.net/blog/mkosi-a-tool-for-generating-os-images.html) introductory blog post by Lennart Poettering diff --git a/mkosi/run.py b/mkosi/run.py index 3cfc963ce..dd4fe0305 100644 --- a/mkosi/run.py +++ b/mkosi/run.py @@ -15,14 +15,17 @@ import subprocess import sys import threading +import uuid from collections.abc import Awaitable, Collection, Iterator, Mapping, Sequence from contextlib import AbstractContextManager from pathlib import Path from types import TracebackType -from typing import Any, Callable, NoReturn, Optional +from typing import Any, Callable, NoReturn, Optional, Protocol +import mkosi.sandbox from mkosi.log import ARG_DEBUG, ARG_DEBUG_SHELL, die from mkosi.types import _FILE, CompletedProcess, PathString, Popen +from mkosi.util import flatten, one_zero SD_LISTEN_FDS_START = 3 @@ -117,6 +120,8 @@ def fork_and_wait(target: Callable[..., None], *args: Any, **kwargs: Any) -> Non def log_process_failure(sandbox: Sequence[str], cmdline: Sequence[str], returncode: int) -> None: if returncode < 0: logging.error(f"Interrupted by {signal.Signals(-returncode).name} signal") + elif returncode == 127: + logging.error(f"{cmdline[0]} not found.") else: logging.error( f"\"{shlex.join([*sandbox, *cmdline] if ARG_DEBUG.get() else cmdline)}\" returned non-zero exit code " @@ -134,40 +139,30 @@ def run( user: Optional[int] = None, group: Optional[int] = None, env: Mapping[str, str] = {}, - cwd: Optional[Path] = None, log: bool = True, foreground: bool = True, - preexec_fn: Optional[Callable[[], None]] = None, success_exit_status: Sequence[int] = (0,), sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]), - scope: Sequence[str] = (), ) -> CompletedProcess: if input is not None: assert stdin is None # stdin and input cannot be specified together stdin = subprocess.PIPE - try: - with spawn( - 
cmdline, - check=check, - stdin=stdin, - stdout=stdout, - stderr=stderr, - user=user, - group=group, - env=env, - cwd=cwd, - log=log, - foreground=foreground, - preexec_fn=preexec_fn, - success_exit_status=success_exit_status, - sandbox=sandbox, - scope=scope, - innerpid=False, - ) as (process, _): - out, err = process.communicate(input) - except FileNotFoundError: - return CompletedProcess(cmdline, 1, "", "") + with spawn( + cmdline, + check=check, + stdin=stdin, + stdout=stdout, + stderr=stderr, + user=user, + group=group, + env=env, + log=log, + foreground=foreground, + success_exit_status=success_exit_status, + sandbox=sandbox, + ) as process: + out, err = process.communicate(input) return CompletedProcess(cmdline, process.returncode, out, err) @@ -183,15 +178,12 @@ def spawn( group: Optional[int] = None, pass_fds: Collection[int] = (), env: Mapping[str, str] = {}, - cwd: Optional[Path] = None, log: bool = True, foreground: bool = False, preexec_fn: Optional[Callable[[], None]] = None, success_exit_status: Sequence[int] = (0,), sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]), - scope: Sequence[str] = (), - innerpid: bool = True, -) -> Iterator[tuple[Popen, int]]: +) -> Iterator[Popen]: assert sorted(set(pass_fds)) == list(pass_fds) cmdline = [os.fspath(x) for x in cmdline] @@ -225,6 +217,10 @@ def spawn( if "HOME" not in env: env["HOME"] = "/" + # sandbox.py takes care of setting $LISTEN_PID + if pass_fds: + env["LISTEN_FDS"] = str(len(pass_fds)) + def preexec() -> None: if foreground: make_foreground_process() @@ -259,51 +255,9 @@ def preexec() -> None: with sandbox as sbx: prefix = [os.fspath(x) for x in sbx] - # First, check if the sandbox works at all before executing the command. 
- if prefix and (rc := subprocess.run(prefix + ["true"]).returncode) != 0: - log_process_failure(prefix, cmdline, rc) - raise subprocess.CalledProcessError(rc, prefix + cmdline) - - if subprocess.run( - prefix + ["sh", "-c", f"command -v {cmdline[0]}"], - stdout=subprocess.DEVNULL, - ).returncode != 0: - if check: - die(f"{cmdline[0]} not found.", hint=f"Is {cmdline[0]} installed on the host system?") - - # We can't really return anything in this case, so we raise a specific exception that we can catch in - # run(). - logging.debug(f"{cmdline[0]} not found, not running {shlex.join(cmdline)}") - raise FileNotFoundError(cmdline[0]) - - if ( - foreground and - prefix and - subprocess.run(prefix + ["sh", "-c", "command -v setpgid"], stdout=subprocess.DEVNULL).returncode == 0 - ): - prefix += ["setpgid", "--foreground", "--"] - - if pass_fds: - # We don't know the PID before we start the process and we can't modify the environment in preexec_fn so we - # have to spawn a temporary shell to set the necessary environment variables before spawning the actual - # command. - prefix += ["sh", "-c", f"LISTEN_FDS={len(pass_fds)} LISTEN_PID=$$ exec $0 \"$@\""] - - if prefix and innerpid: - r, w = os.pipe2(os.O_CLOEXEC) - # Make sure that the write end won't be overridden in preexec() when we're moving fds forward. - q = fcntl.fcntl(w, fcntl.F_DUPFD_CLOEXEC, SD_LISTEN_FDS_START + len(pass_fds) + 1) - os.close(w) - w = q - # dash doesn't support working with file descriptors higher than 9 so make sure we use bash. 
- innerpidcmd = ["bash", "-c", f"echo $$ >&{w} && exec {w}>&- && exec $0 \"$@\""] - else: - innerpidcmd = [] - r, w = (None, None) - try: with subprocess.Popen( - [*scope, *prefix, *innerpidcmd, *cmdline], + [*prefix, *cmdline], stdin=stdin, stdout=stdout, stderr=stderr, @@ -312,24 +266,14 @@ def preexec() -> None: group=group, # pass_fds only comes into effect after python has invoked the preexec function, so we make sure that # pass_fds contains the file descriptors to keep open after we've done our transformation in preexec(). - pass_fds=[SD_LISTEN_FDS_START + i for i in range(len(pass_fds))] + ([w] if w else []), + pass_fds=[SD_LISTEN_FDS_START + i for i in range(len(pass_fds))], env=env, - cwd=cwd, preexec_fn=preexec, ) as proc: - if w: - os.close(w) - pid = proc.pid try: - if r: - with open(r) as f: - s = f.read() - if s: - pid = int(s) - - yield proc, pid + yield proc except BaseException: - kill(proc, pid, signal.SIGTERM) + proc.terminate() raise finally: returncode = proc.wait() @@ -339,14 +283,13 @@ def preexec() -> None: log_process_failure(prefix, cmdline, returncode) if ARG_DEBUG_SHELL.get(): subprocess.run( - [*scope, *prefix, "bash"], + [*prefix, "bash"], check=False, stdin=sys.stdin, text=True, user=user, group=group, env=env, - cwd=cwd, preexec_fn=preexec, ) raise subprocess.CalledProcessError(returncode, cmdline) @@ -385,18 +328,6 @@ def find_binary(*names: PathString, root: Path = Path("/"), extra: Sequence[Path return None -def kill(process: Popen, innerpid: int, signal: int) -> None: - process.poll() - if process.returncode is not None: - return - - try: - os.kill(innerpid, signal) - # Handle the race condition where the process might exit between us calling poll() and us calling os.kill(). 
- except ProcessLookupError: - pass - - class AsyncioThread(threading.Thread): """ The default threading.Thread() is not interruptable, so we make our own version by using the concurrency @@ -448,3 +379,246 @@ def __exit__( raise self.exc.get_nowait() except queue.Empty: pass + + +class SandboxProtocol(Protocol): + def __call__( + self, + *, + binary: Optional[PathString], + vartmp: bool = False, + options: Sequence[PathString] = (), + ) -> AbstractContextManager[list[PathString]]: ... + + +def nosandbox( + *, + binary: Optional[PathString], + vartmp: bool = False, + options: Sequence[PathString] = (), +) -> AbstractContextManager[list[PathString]]: + return contextlib.nullcontext([]) + + +def finalize_passwd_mounts(root: PathString) -> list[PathString]: + """ + If passwd or a related file exists in the apivfs directory, bind mount it over the host files while we + run the command, to make sure that the command we run uses user/group information from the apivfs + directory instead of from the host. + """ + return flatten( + ("--ro-bind-try", Path(root) / "etc" / f, f"/etc/{f}") + for f in ("passwd", "group", "shadow", "gshadow") + ) + + +def network_options(*, network: bool) -> list[PathString]: + return [ + "--setenv", "SYSTEMD_OFFLINE", one_zero(network), + *(["--unshare-net"] if not network else []), + ] + + +@contextlib.contextmanager +def vartmpdir(condition: bool = True) -> Iterator[Optional[Path]]: + if not condition: + yield None + return + + # We want to use an empty subdirectory in the host's temporary directory as the sandbox's /var/tmp. 
+ d = Path(os.getenv("TMPDIR", "/var/tmp")) / f"mkosi-var-tmp-{uuid.uuid4().hex[:16]}" + d.mkdir(mode=0o1777) + + try: + yield d + finally: + shutil.rmtree(d) + + +@contextlib.contextmanager +def sandbox_cmd( + *, + network: bool = False, + devices: bool = False, + vartmp: bool = False, + scripts: Optional[Path] = None, + tools: Path = Path("/"), + relaxed: bool = False, + usroverlaydirs: Sequence[PathString] = (), + options: Sequence[PathString] = (), + setup: Sequence[PathString] = (), +) -> Iterator[list[PathString]]: + cmdline: list[PathString] = [ + *setup, + sys.executable, "-SI", mkosi.sandbox.__file__, + "--proc", "/proc", + # We mounted a subdirectory of TMPDIR to /var/tmp so we unset TMPDIR so that /tmp or /var/tmp are used instead. + "--unsetenv", "TMPDIR", + *network_options(network=network), + # apivfs_script_cmd() and chroot_script_cmd() are executed from within the sandbox, but they still use + # sandbox.py, so we make sure it is available inside the sandbox so it can be executed there as well. 
+ "--ro-bind", Path(mkosi.sandbox.__file__), "/sandbox.py", + ] + + if usroverlaydirs: + cmdline += ["--overlay-lowerdir", tools / "usr"] + + for d in usroverlaydirs: + cmdline += ["--overlay-lowerdir", d] + + cmdline += ["--overlay", "/usr"] + else: + cmdline += ["--ro-bind", tools / "usr", "/usr"] + + if relaxed: + cmdline += ["--bind", "/tmp", "/tmp"] + else: + cmdline += ["--dir", "/tmp", "--dir", "/var/tmp", "--unshare-ipc"] + + if (tools / "nix/store").exists(): + cmdline += ["--bind", tools / "nix/store", "/nix/store"] + + if devices or relaxed: + cmdline += [ + "--bind", "/sys", "/sys", + "--bind", "/run", "/run", + "--bind", "/dev", "/dev", + ] + else: + cmdline += ["--dev", "/dev"] + + if relaxed: + dirs = ("/etc", "/opt", "/srv", "/media", "/mnt", "/var") + + for d in dirs: + if Path(d).exists(): + cmdline += ["--bind", d, d] + + # Either add the home directory we're running from or the current working directory if we're not running from + # inside a home directory. When running from / we add nothing extra (d stays empty and is skipped below). + if Path.cwd() == Path("/"): + d = "" + elif Path.cwd().is_relative_to("/root"): + d = "/root" + elif Path.cwd() == Path("/home"): + d = "/home" + elif Path.cwd().is_relative_to("/home"): + # `Path.parents` only supports slices and negative indexing from Python 3.10 onwards. + # TODO: Remove list() when we depend on Python 3.10 or newer. 
+ d = os.fspath(list(Path.cwd().parents)[-2]) + else: + d = os.fspath(Path.cwd()) + + if d and not any(Path(d).is_relative_to(dir) for dir in (*dirs, "/usr", "/nix", "/tmp")): + cmdline += ["--bind", d, d] + + for d in ("bin", "sbin", "lib", "lib32", "lib64"): + if (p := tools / d).is_symlink(): + cmdline += ["--symlink", p.readlink(), Path("/") / p.relative_to(tools)] + elif p.is_dir(): + cmdline += ["--ro-bind", p, Path("/") / p.relative_to(tools)] + + path = "/usr/bin:/usr/sbin" if tools != Path("/") else os.environ["PATH"] + + cmdline += ["--setenv", "PATH", f"/scripts:{path}", *options] + + # If we're using /usr from a tools tree, we have to use /etc/alternatives from the tools tree as well if it + # exists since that points directly back to /usr. Apply this after the options so the caller can mount + # something else to /etc without overriding this mount. In relaxed mode, we only do this if /etc/alternatives + # already exists on the host as otherwise we'd modify the host's /etc by creating the mountpoint ourselves (or + # fail when trying to create it). + if (tools / "etc/alternatives").exists() and (not relaxed or Path("/etc/alternatives").exists()): + cmdline += ["--ro-bind", tools / "etc/alternatives", "/etc/alternatives"] + + if scripts: + cmdline += ["--ro-bind", scripts, "/scripts"] + + if network and not relaxed and Path("/etc/resolv.conf").exists(): + cmdline += ["--ro-bind", "/etc/resolv.conf", "/etc/resolv.conf"] + + with vartmpdir(condition=vartmp and not relaxed) as dir: + if dir: + cmdline += ["--bind", dir, "/var/tmp"] + + yield [*cmdline, "--"] + + +def apivfs_options(*, root: Path = Path("/buildroot")) -> list[PathString]: + return [ + "--tmpfs", root / "run", + "--tmpfs", root / "tmp", + "--bind", "/var/tmp", root / "var/tmp", + "--proc", root / "proc", + "--dev", root / "dev", + # Nudge gpg to create its sockets in /run by making sure /run/user/0 exists. 
+ "--dir", root / "run/user/0", + # Make sure anything running in the root directory thinks it's in a container. $container can't always + # be accessed so we write /run/host/container-manager as well which is always accessible. + "--write", "mkosi", root / "run/host/container-manager", + ] + + +def apivfs_script_cmd(*, tools: bool, options: Sequence[PathString] = ()) -> list[PathString]: + return [ + "python3" if tools else sys.executable, "-SI", "/sandbox.py", + "--bind", "/", "/", + "--same-dir", + *apivfs_options(), + *options, + "--", + ] + + +def chroot_options(*, network: bool = False) -> list[PathString]: + return [ + # Let's always run as (fake) root when we chroot inside the image as tools executed within the image could + # have builtin assumptions about files being owned by root. + "--become-root", + # Unshare IPC namespace so any tests that exercise IPC related features don't fail with permission errors as + # --become-root implies unsharing a user namespace which won't have access to the parent's IPC namespace + # anymore. + "--unshare-ipc", + "--setenv", "container", "mkosi", + "--setenv", "HOME", "/", + "--setenv", "PATH", "/usr/bin:/usr/sbin", + *(["--ro-bind-try", "/etc/resolv.conf", "/etc/resolv.conf"] if network else []), + "--setenv", "BUILDROOT", "/", + ] + + +@contextlib.contextmanager +def chroot_cmd( + *, + root: Path, + network: bool = False, + options: Sequence[PathString] = (), +) -> Iterator[list[PathString]]: + cmdline: list[PathString] = [ + sys.executable, "-SI", mkosi.sandbox.__file__, + "--bind", root, "/", + # We mounted a subdirectory of TMPDIR to /var/tmp so we unset TMPDIR so that /tmp or /var/tmp are used instead. 
+ "--unsetenv", "TMPDIR", + *network_options(network=network), + *apivfs_options(root=Path("/")), + *chroot_options(network=network), + ] + + if network and Path("/etc/resolv.conf").exists(): + cmdline += ["--ro-bind", "/etc/resolv.conf", "/etc/resolv.conf"] + + with vartmpdir() as dir: + if dir: + cmdline += ["--bind", dir, "/var/tmp"] + + yield [*cmdline, *options, "--"] + + +def chroot_script_cmd(*, tools: bool, network: bool = False, work: bool = False) -> list[PathString]: + return [ + "python3" if tools else sys.executable, "-SI", "/sandbox.py", + "--bind", "/buildroot", "/", + *apivfs_options(root=Path("/")), + *chroot_options(network=network), + *(["--bind", "/work", "/work", "--chdir", "/work/src"] if work else []), + "--", + ] diff --git a/mkosi/sandbox.py b/mkosi/sandbox.py deleted file mode 100644 index 0ba5b9a6c..000000000 --- a/mkosi/sandbox.py +++ /dev/null @@ -1,312 +0,0 @@ -# SPDX-License-Identifier: LGPL-2.1-or-later -import contextlib -import dataclasses -import enum -import logging -import os -import shutil -import uuid -from collections.abc import Iterator, Sequence -from contextlib import AbstractContextManager -from pathlib import Path -from typing import Optional, Protocol - -from mkosi.types import PathString -from mkosi.user import INVOKING_USER -from mkosi.util import flatten, one_zero, startswith - - -@dataclasses.dataclass(frozen=True) -class Mount: - src: PathString - dst: PathString - devices: bool = False - ro: bool = False - required: bool = True - - def __hash__(self) -> int: - return hash((Path(self.src), Path(self.dst), self.devices, self.ro, self.required)) - - def __eq__(self, other: object) -> bool: - if not isinstance(other, Mount): - return False - - return self.__hash__() == other.__hash__() - - def options(self) -> list[str]: - if self.devices: - opt = "--dev-bind" if self.required else "--dev-bind-try" - elif self.ro: - opt = "--ro-bind" if self.required else "--ro-bind-try" - else: - opt = "--bind" if self.required else 
"--bind-try" - - return [opt, os.fspath(self.src), os.fspath(self.dst)] - - -class SandboxProtocol(Protocol): - def __call__( - self, - *, - binary: Optional[PathString], - vartmp: bool = False, - mounts: Sequence[Mount] = (), - extra: Sequence[PathString] = (), - ) -> AbstractContextManager[list[PathString]]: ... - - -def nosandbox( - *, - binary: Optional[PathString], - vartmp: bool = False, - mounts: Sequence[Mount] = (), - extra: Sequence[PathString] = (), -) -> AbstractContextManager[list[PathString]]: - return contextlib.nullcontext([]) - - -# https://github.com/torvalds/linux/blob/master/include/uapi/linux/capability.h -class Capability(enum.Enum): - CAP_NET_ADMIN = 12 - - -def have_effective_cap(capability: Capability) -> bool: - for line in Path("/proc/self/status").read_text().splitlines(): - if rhs := startswith(line, "CapEff:"): - hexcap = rhs.strip() - break - else: - logging.warning(f"\"CapEff:\" not found in /proc/self/status, assuming we don't have {capability}") - return False - - return (int(hexcap, 16) & (1 << capability.value)) != 0 - - -def finalize_passwd_mounts(root: PathString) -> list[Mount]: - """ - If passwd or a related file exists in the apivfs directory, bind mount it over the host files while we - run the command, to make sure that the command we run uses user/group information from the apivfs - directory instead of from the host. 
- """ - return [ - Mount(Path(root) / "etc" / f, f"/etc/{f}", ro=True, required=False) - for f in ("passwd", "group", "shadow", "gshadow") - ] - - -def finalize_mounts(mounts: Sequence[Mount]) -> list[PathString]: - mounts = list(set(mounts)) - - mounts = [ - m for m in mounts - if not any( - m != n and - m.devices == n.devices and - m.ro == n.ro and - m.required == n.required and - Path(m.src).is_relative_to(n.src) and - Path(m.dst).is_relative_to(n.dst) and - Path(m.src).relative_to(n.src) == Path(m.dst).relative_to(n.dst) - for n in mounts - ) - ] - - mounts = sorted(mounts, key=lambda m: (Path(m.dst), m.devices, not m.ro, m.required, Path(m.src))) - - return flatten(m.options() for m in mounts) - - -@contextlib.contextmanager -def sandbox_cmd( - *, - network: bool = False, - devices: bool = False, - vartmp: bool = False, - scripts: Optional[Path] = None, - tools: Path = Path("/"), - relaxed: bool = False, - mounts: Sequence[Mount] = (), - options: Sequence[PathString] = (), - setup: Sequence[PathString] = (), - extra: Sequence[PathString] = (), -) -> Iterator[list[PathString]]: - cmdline: list[PathString] = [] - mounts = list(mounts) - - if vartmp and not relaxed: - # We want to use an empty subdirectory in the host's temporary directory as the sandbox's /var/tmp. - vartmpdir = Path(os.getenv("TMPDIR", "/var/tmp")) / f"mkosi-var-tmp-{uuid.uuid4().hex[:16]}" - else: - vartmpdir = None - - cmdline += [ - *setup, - "bwrap", - *( - ["--unshare-net"] - if not network and (os.getuid() != 0 or have_effective_cap(Capability.CAP_NET_ADMIN)) - else [] - ), - "--die-with-parent", - "--proc", "/proc", - "--setenv", "SYSTEMD_OFFLINE", one_zero(network), - # We mounted a subdirectory of TMPDIR to /var/tmp so we unset TMPDIR so that /tmp or /var/tmp are used instead. 
- "--unsetenv", "TMPDIR", - ] - mounts += [Mount(tools / "usr", "/usr", ro=True)] - - if relaxed: - mounts += [Mount("/tmp", "/tmp")] - else: - cmdline += ["--dir", "/tmp", "--dir", "/var/tmp", "--unshare-ipc"] - - if (tools / "nix/store").exists(): - mounts += [Mount(tools / "nix/store", "/nix/store")] - - if devices or relaxed: - mounts += [ - Mount("/sys", "/sys"), - Mount("/run", "/run"), - Mount("/dev", "/dev", devices=True), - ] - else: - cmdline += ["--dev", "/dev"] - - if relaxed: - dirs = ("/etc", "/opt", "/srv", "/media", "/mnt", "/var", os.fspath(INVOKING_USER.home())) - - for d in dirs: - if Path(d).exists(): - mounts += [Mount(d, d)] - - if len(Path.cwd().parents) >= 2: - # `Path.parents` only supports slices and negative indexing from Python 3.10 onwards. - # TODO: Remove list() when we depend on Python 3.10 or newer. - d = os.fspath(list(Path.cwd().parents)[-2]) - elif len(Path.cwd().parents) == 1: - d = os.fspath(Path.cwd()) - else: - d = "" - - if d and d not in (*dirs, "/home", "/usr", "/nix", "/tmp"): - mounts += [Mount(d, d)] - - if vartmpdir: - mounts += [Mount(vartmpdir, "/var/tmp")] - - for d in ("bin", "sbin", "lib", "lib32", "lib64"): - if (p := tools / d).is_symlink(): - cmdline += ["--symlink", p.readlink(), Path("/") / p.relative_to(tools)] - elif p.is_dir(): - mounts += [Mount(p, Path("/") / p.relative_to(tools), ro=True)] - - path = "/usr/bin:/usr/sbin" if tools != Path("/") else os.environ["PATH"] - - cmdline += ["--setenv", "PATH", f"/scripts:{path}", *options] - - # If we're using /usr from a tools tree, we have to use /etc/alternatives from the tools tree as well if it - # exists since that points directly back to /usr. Apply this after the options so the caller can mount - # something else to /etc without overriding this mount. In relaxed mode, we only do this if /etc/alternatives - # already exists on the host as otherwise we'd modify the host's /etc by creating the mountpoint ourselves (or - # fail when trying to create it). 
- if (tools / "etc/alternatives").exists() and (not relaxed or Path("/etc/alternatives").exists()): - mounts += [Mount(tools / "etc/alternatives", "/etc/alternatives", ro=True)] - - if scripts: - mounts += [Mount(scripts, "/scripts", ro=True)] - - if network and not relaxed and Path("/etc/resolv.conf").exists(): - mounts += [Mount("/etc/resolv.conf", "/etc/resolv.conf")] - - cmdline += finalize_mounts(mounts) - - if not any(Path(m.dst) == Path("/etc") for m in mounts): - cmdline += ["--symlink", "../proc/self/mounts", "/etc/mtab"] - - # bubblewrap creates everything with a restricted mode so relax stuff as needed. - ops = [] - if not relaxed: - if not any(Path(m.dst) == Path("/tmp") for m in mounts): - ops += ["chmod 1777 /tmp"] - if not devices: - ops += ["chmod 1777 /dev/shm"] - if vartmpdir: - ops += ["chmod 1777 /var/tmp"] - if relaxed and INVOKING_USER.home().exists() and len(INVOKING_USER.home().parents) > 1: - # We might mount a subdirectory of /home so /home will be created with the wrong permissions by bubblewrap so - # we need to fix up the permissions. - ops += [f"chmod 755 {list(INVOKING_USER.home().parents)[-1]}"] - else: - ops += ["chmod 755 /etc"] - ops += ["exec $0 \"$@\""] - - cmdline += ["sh", "-c", " && ".join(ops), *extra] - - if vartmpdir: - vartmpdir.mkdir(mode=0o1777) - - try: - yield cmdline - finally: - if vartmpdir: - shutil.rmtree(vartmpdir) - - -def apivfs_cmd() -> list[PathString]: - return [ - "bwrap", - "--dev-bind", "/", "/", - "--tmpfs", "/buildroot/run", - "--tmpfs", "/buildroot/tmp", - "--bind", "/var/tmp", "/buildroot/var/tmp", - "--proc", "/buildroot/proc", - "--dev", "/buildroot/dev", - # Make sure /etc/machine-id is not overwritten by any package manager post install scripts. - "--ro-bind-try", "/buildroot/etc/machine-id", "/buildroot/etc/machine-id", - # Nudge gpg to create its sockets in /run by making sure /run/user/0 exists. 
- "--dir", "/buildroot/run/user/0", - *flatten(mount.options() for mount in finalize_passwd_mounts("/buildroot")), - "sh", "-c", - " && ".join( - [ - "chmod 1777 /buildroot/tmp /buildroot/var/tmp /buildroot/dev/shm", - "chmod 755 /buildroot/run", - # Make sure anything running in the root directory thinks it's in a container. $container can't always - # be accessed so we write /run/host/container-manager as well which is always accessible. - "mkdir -m 755 /buildroot/run/host", - "echo mkosi >/buildroot/run/host/container-manager", - "exec $0 \"$@\"", - ] - ), - ] - - -def chroot_cmd(*, resolve: bool = False, work: bool = False) -> list[PathString]: - workdir = "/buildroot/work" if work else "" - - return apivfs_cmd() + [ - "sh", "-c", - " && ".join( - [ - *([f"trap 'rm -rf {workdir}' EXIT"] if work else []), - # /etc/resolv.conf can be a dangling symlink to /run/systemd/resolve/stub-resolv.conf. Bubblewrap tries - # to call mkdir() on each component of the path which means it will try to call - # mkdir(/run/systemd/resolve/stub-resolv.conf) which will fail unless /run/systemd/resolve exists - # already so we make sure that it already exists. - f"mkdir -p -m 755 {workdir} /buildroot/run/systemd /buildroot/run/systemd/resolve", - # No exec here because we need to clean up the /work directory afterwards. - "$0 \"$@\"", - ] - ), - "bwrap", - "--dev-bind", "/buildroot", "/", - "--setenv", "container", "mkosi", - "--setenv", "HOME", "/", - "--setenv", "PATH", "/work/scripts:/usr/bin:/usr/sbin", - *(["--ro-bind-try", "/etc/resolv.conf", "/etc/resolv.conf"] if resolve else []), - *(["--bind", "/work", "/work", "--chdir", "/work/src"] if work else []), - "--setenv", "BUILDROOT", "/", - # Start an interactive bash shell if we're not given any arguments. 
- "sh", "-c", '[ "$0" = "sh" ] && [ $# -eq 0 ] && exec bash -i || exec $0 "$@"', - ] - diff --git a/mkosi/sandbox/__init__.py b/mkosi/sandbox/__init__.py new file mode 100644 index 000000000..7db340c52 --- /dev/null +++ b/mkosi/sandbox/__init__.py @@ -0,0 +1,794 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +""" +This is a standalone implementation of sandboxing which is used by mkosi. Note that this is invoked many times while +building the image and as a result, the performance of this script has a substantial impact on the performance of mkosi +itself. To keep the runtime of this script to a minimum, please don't import any extra modules if it can be avoided. +""" + +import ctypes +import os +import sys +import warnings # noqa: F401 (loaded lazily by os.execvp() which happens too late) + +__version__ = "25~devel" + +# The following constants are taken from the Linux kernel headers. +AT_EMPTY_PATH = 0x1000 +AT_FDCWD = -100 +AT_NO_AUTOMOUNT = 0x800 +AT_RECURSIVE = 0x8000 +AT_SYMLINK_NOFOLLOW = 0x100 +BTRFS_SUPER_MAGIC = 0x9123683E +CAP_NET_ADMIN = 12 +CAP_SYS_ADMIN = 21 +CLONE_NEWIPC = 0x08000000 +CLONE_NEWNET = 0x40000000 +CLONE_NEWNS = 0x00020000 +CLONE_NEWUSER = 0x10000000 +ENOENT = 2 +LINUX_CAPABILITY_U32S_3 = 2 +LINUX_CAPABILITY_VERSION_3 = 0x20080522 +MNT_DETACH = 2 +MOUNT_ATTR_RDONLY = 0x00000001 +MOUNT_ATTR_NOSUID = 0x00000002 +MOUNT_ATTR_NODEV = 0x00000004 +MOUNT_ATTR_NOEXEC = 0x00000008 +MOUNT_ATTR_SIZE_VER0 = 32 +MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 +MS_BIND = 4096 +MS_MOVE = 8192 +MS_REC = 16384 +MS_SHARED = 1 << 20 +MS_SLAVE = 1 << 19 +NR_mount_setattr = 442 +NR_move_mount = 429 +NR_open_tree = 428 +OPEN_TREE_CLOEXEC = os.O_CLOEXEC +OPEN_TREE_CLONE = 1 +PR_CAP_AMBIENT = 47 +PR_CAP_AMBIENT_RAISE = 2 +# These definitions are taken from the libseccomp headers +SCMP_ACT_ALLOW = 0x7FFF0000 +SCMP_ACT_ERRNO = 0x00050000 + +class mount_attr(ctypes.Structure): + _fields_ = [ + ("attr_set", ctypes.c_uint64), + ("attr_clr", ctypes.c_uint64), + 
("propagation", ctypes.c_uint64), + ("userns_fd", ctypes.c_uint64), + ] + + +class cap_user_header_t(ctypes.Structure): + # __user_cap_header_struct + _fields_ = [ + ("version", ctypes.c_uint32), + ("pid", ctypes.c_int), + ] + + +class cap_user_data_t(ctypes.Structure): + # __user_cap_data_struct + _fields_ = [ + ("effective", ctypes.c_uint32), + ("permitted", ctypes.c_uint32), + ("inheritable", ctypes.c_uint32), + ] + + +libc = ctypes.CDLL(None, use_errno=True) + +libc.syscall.restype = ctypes.c_long +libc.unshare.argtypes = (ctypes.c_int,) +libc.statfs.argtypes = (ctypes.c_char_p, ctypes.c_void_p) +libc.eventfd.argtypes = (ctypes.c_int, ctypes.c_int) +libc.mount.argtypes = (ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_ulong, ctypes.c_char_p) +libc.pivot_root.argtypes = (ctypes.c_char_p, ctypes.c_char_p) +libc.umount2.argtypes = (ctypes.c_char_p, ctypes.c_int) +libc.capget.argtypes = (ctypes.c_void_p, ctypes.c_void_p) +libc.capset.argtypes = (ctypes.c_void_p, ctypes.c_void_p) + + +def oserror(filename: str = "") -> None: + raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno()), filename or None) + + +def unshare(flags: int) -> None: + if libc.unshare(flags) < 0: + oserror() + + +def statfs(path: str) -> int: + # struct statfs is 120 bytes, which equals 15 longs. Since we only care about the first field and the first field + # is of type long, we avoid declaring the full struct by just passing an array of 15 longs as the output argument. 
+ buffer = (ctypes.c_long * 15)() + + if libc.statfs(path.encode(), ctypes.byref(buffer)) < 0: + oserror() + + return int(buffer[0]) + + +def mount(src: str, dst: str, type: str, flags: int, options: str) -> None: + srcb = src.encode() if src else None + typeb = type.encode() if type else None + optionsb = options.encode() if options else None + if libc.mount(srcb, dst.encode(), typeb, flags, optionsb) < 0: + oserror() + + +def umount2(path: str, flags: int = 0) -> None: + if libc.umount2(path.encode(), flags) < 0: + oserror() + + +def cap_permitted_to_ambient() -> None: + """ + When unsharing a user namespace and mapping the current user to itself, the user has a full set of capabilities in + the user namespace. This allows the user to do mounts after unsharing a mount namespace for example. However, these + capabilities are lost again when the user executes a subprocess. As we also want subprocesses invoked by the user + to be able to mount stuff, we make sure the capabilities are inherited by adding all the user's capabilities to the + inheritable and ambient capability sets, which makes sure that they are passed down to subprocesses. 
+ """ + header = cap_user_header_t(LINUX_CAPABILITY_VERSION_3, 0) + payload = (cap_user_data_t * LINUX_CAPABILITY_U32S_3)() + + if libc.capget(ctypes.addressof(header), ctypes.byref(payload)) < 0: + oserror() + + payload[0].inheritable = payload[0].permitted + payload[1].inheritable = payload[1].permitted + + if libc.capset(ctypes.addressof(header), ctypes.byref(payload)) < 0: + oserror() + + effective = payload[1].effective << 32 | payload[0].effective + + with open("/proc/sys/kernel/cap_last_cap", "rb") as f: + last_cap = int(f.read()) + + libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong) + + for cap in range(ctypes.sizeof(ctypes.c_uint64) * 8): + if cap > last_cap: + break + + if effective & (1 << cap) and libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0: + oserror() + + +def have_effective_cap(capability: int) -> bool: + with open("/proc/self/status", "rb") as f: + for line in f.readlines(): + if line.startswith(b"CapEff:"): + return (int(line[7:], 16) & (1 << capability)) != 0 + + return False + + +def seccomp_suppress_chown() -> None: + """ + There's still a few files and directories left in distributions in /usr and /etc that are not owned by root. This + causes package managers to fail to install the corresponding packages when run from a single uid user namespace. + Unfortunately, non-root users can only create files owned by their own uid. To still allow non-root users to build + images, if requested we install a seccomp filter that makes calls to chown() and friends a noop. 
+ """ + libseccomp = ctypes.CDLL("libseccomp.so.2") + if libseccomp is None: + raise FileNotFoundError("libseccomp.so.2") + + libseccomp.seccomp_init.argtypes = (ctypes.c_uint32,) + libseccomp.seccomp_init.restype = ctypes.c_void_p + libseccomp.seccomp_release.argtypes = (ctypes.c_void_p,) + libseccomp.seccomp_release.restype = None + libseccomp.seccomp_syscall_resolve_name.argtypes = (ctypes.c_char_p,) + libseccomp.seccomp_rule_add_exact.argtypes = (ctypes.c_void_p, ctypes.c_uint32, ctypes.c_int, ctypes.c_uint) + libseccomp.seccomp_load.argtypes = (ctypes.c_void_p,) + + seccomp = libseccomp.seccomp_init(SCMP_ACT_ALLOW) + + try: + for syscall in (b"chown", b"chown32", b"fchown", b"fchown32", b"fchownat", b"lchown", b"lchown32"): + id = libseccomp.seccomp_syscall_resolve_name(syscall) + libseccomp.seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO, id, 0) + + libseccomp.seccomp_load(seccomp) + finally: + libseccomp.seccomp_release(seccomp) + + +def mount_rbind(src: str, dst: str, attrs: int = 0) -> None: + """ + When using the old mount syscall to do a recursive bind mount, mount options are not applied recursively. Because + we want to do recursive read-only bind mounts in some cases, we use the new mount API for that which does allow + recursively changing mount options when doing bind mounts. 
+ """ + + flags = AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE + + try: + libc.open_tree.argtypes = (ctypes.c_int, ctypes.c_char_p, ctypes.c_uint) + fd = libc.open_tree(AT_FDCWD, src.encode(), flags) + except AttributeError: + libc.syscall.argtypes = (ctypes.c_long, ctypes.c_int, ctypes.c_char_p, ctypes.c_uint) + fd = libc.syscall(NR_open_tree, AT_FDCWD, src.encode(), flags) + + if fd < 0: + oserror(src) + + try: + attr = mount_attr() + attr.attr_set = attrs + + flags = AT_EMPTY_PATH | AT_RECURSIVE + + try: + libc.mount_setattr.argtypes = ( + ctypes.c_int, ctypes.c_char_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_size_t, + ) + r = libc.mount_setattr(fd, b"", flags, ctypes.addressof(attr), MOUNT_ATTR_SIZE_VER0) + except AttributeError: + libc.syscall.argtypes = ( + ctypes.c_long, ctypes.c_int, ctypes.c_char_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_size_t, + ) + r = libc.syscall(NR_mount_setattr, fd, b"", flags, ctypes.addressof(attr), MOUNT_ATTR_SIZE_VER0) + + if r < 0: + oserror(src) + + try: + libc.move_mount.argtypes = (ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_uint) + r = libc.move_mount(fd, b"", AT_FDCWD, dst.encode(), MOVE_MOUNT_F_EMPTY_PATH) + except AttributeError: + libc.syscall.argtypes = ( + ctypes.c_long, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_uint, + ) + r = libc.syscall(NR_move_mount, fd, b"", AT_FDCWD, dst.encode(), MOVE_MOUNT_F_EMPTY_PATH) + + if r < 0: + oserror(dst) + finally: + os.close(fd) + + +class umask: + def __init__(self, mask: int): + self.mask = mask + + def __enter__(self) -> None: + self.mask = os.umask(self.mask) + + def __exit__(self, *args: object, **kwargs: object) -> None: + os.umask(self.mask) + + +def become_user(uid: int, gid: int) -> None: + """ + This function implements the required dance to unshare a user namespace and map the current user to itself or to + root within it. 
The kernel only allows a process running outside of the unshared user namespace to write the + necessary uid and gid mappings, so we fork off a child process, make it wait until the parent process has unshared + a user namespace, and then have it write the necessary uid and gid mappings. + """ + ppid = os.getpid() + + event = libc.eventfd(0, 0) + if event < 0: + oserror() + + pid = os.fork() + if pid == 0: + try: + os.read(event, ctypes.sizeof(ctypes.c_uint64)) + os.close(event) + with open(f"/proc/{ppid}/setgroups", "wb") as f: + f.write(b"deny\n") + with open(f"/proc/{ppid}/gid_map", "wb") as f: + f.write(f"{gid} {os.getgid()} 1\n".encode()) + with open(f"/proc/{ppid}/uid_map", "wb") as f: + f.write(f"{uid} {os.getuid()} 1\n".encode()) + except OSError as e: + os._exit(e.errno) + except BaseException: + os._exit(1) + else: + os._exit(0) + + try: + unshare(CLONE_NEWUSER) + finally: + os.write(event, ctypes.c_uint64(1)) + os.close(event) + _, status = os.waitpid(pid, 0) + + rc = os.waitstatus_to_exitcode(status) + if rc != 0: + raise OSError(rc, os.strerror(rc)) + + +def acquire_privileges(*, become_root: bool = False) -> bool: + if os.getuid() == 0 or (not become_root and have_effective_cap(CAP_SYS_ADMIN)): + return False + + if become_root: + become_user(0, 0) + else: + become_user(os.getuid(), os.getgid()) + cap_permitted_to_ambient() + + return True + + +def userns_has_single_user() -> bool: + try: + with open("/proc/self/uid_map", "rb") as f: + lines = f.readlines() + except FileNotFoundError: + return False + + return len(lines) == 1 and int(lines[0].split()[-1]) == 1 + + +def chase(root: str, path: str) -> str: + fd = os.open("/", os.O_CLOEXEC | os.O_PATH | os.O_DIRECTORY) + + try: + os.chroot(root) + os.chdir("/") + return joinpath(root, os.path.realpath(path)) + finally: + os.fchdir(fd) + os.close(fd) + os.chroot(".") + + +def splitpath(path: str) -> tuple[str, ...]: + return tuple(p for p in path.split("/") if p) + + +def joinpath(path: str, *paths: str) -> str: 
+ return os.path.join(path, *(p.lstrip("/") for p in paths)) + + +def is_relative_to(one: str, two: str) -> bool: + return os.path.commonpath((one, two)) == two + + +class FSOperation: + def __init__(self, dst: str) -> None: + self.dst = dst + + def execute(self, oldroot: str, newroot: str) -> None: + raise NotImplementedError() + + @classmethod + def optimize(cls, fsops: list["FSOperation"]) -> list["FSOperation"]: + binds = set() + rest = [] + + for fsop in fsops: + if isinstance(fsop, BindOperation): + binds.add(fsop) + else: + rest.append(fsop) + + # Drop all bind mounts that are mounted from beneath another bind mount to the same location within the new + # rootfs. + binds = [ + m for m in binds + if not any( + m != n and + m.readonly == n.readonly and + m.required == n.required and + is_relative_to(m.src, n.src) and + is_relative_to(m.dst, n.dst) and + os.path.relpath(m.src, n.src) == os.path.relpath(m.dst, n.dst) + for n in binds + ) + ] + + # Make sure bind mounts override other operations on the same destination by appending them to the rest and + # depending on python's stable sort behavior. 
+ return sorted([*rest, *binds], key=lambda fsop: splitpath(fsop.dst)) + + +class BindOperation(FSOperation): + def __init__(self, src: str, dst: str, *, readonly: bool, required: bool) -> None: + self.src = src + self.readonly = readonly + self.required = required + super().__init__(dst) + + def __hash__(self) -> int: + return hash((splitpath(self.src), splitpath(self.dst), self.readonly, self.required)) + + def __eq__(self, other: object) -> bool: + return isinstance(other, BindOperation) and self.__hash__() == other.__hash__() + + def execute(self, oldroot: str, newroot: str) -> None: + src = chase(oldroot, self.src) + dst = chase(newroot, self.dst) + + if not os.path.exists(src) and not self.required: + return + + with umask(~0o755): + os.makedirs(os.path.dirname(dst), exist_ok=True) + + if not os.path.exists(dst): + isfile = os.path.isfile(src) + + with umask(~0o644 if isfile else ~0o755): + if isfile: + os.close(os.open(dst, os.O_CREAT | os.O_CLOEXEC)) + else: + os.mkdir(dst) + + mount_rbind(src, dst, attrs=MOUNT_ATTR_RDONLY if self.readonly else 0) + + +class ProcOperation(FSOperation): + def execute(self, oldroot: str, newroot: str) -> None: + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(dst, exist_ok=True) + + mount_rbind(joinpath(oldroot, "proc"), dst) + + +class DevOperation(FSOperation): + ttyname = os.ttyname(2) if os.isatty(2) else None + + def execute(self, oldroot: str, newroot: str) -> None: + # We don't put actual devices in /dev, just the API stuff in there that all manner of things depend on, + # like /dev/null. + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(dst, exist_ok=True) + + # Note that the mode is crucial here. If the default mode (1777) is used, trying to access /dev/null fails + # with EACCES for unknown reasons.
+ mount("tmpfs", dst, "tmpfs", 0, "mode=0755") + + for node in ("null", "zero", "full", "random", "urandom", "tty"): + ndst = joinpath(dst, node) + os.close(os.open(ndst, os.O_CREAT | os.O_CLOEXEC)) + + mount(joinpath(oldroot, "dev", node), ndst, "", MS_BIND, "") + + for i, node in enumerate(("stdin", "stdout", "stderr")): + os.symlink(f"/proc/self/fd/{i}", joinpath(dst, node)) + + os.symlink("/proc/self/fd", joinpath(dst, "fd")) + os.symlink("/proc/kcore", joinpath(dst, "core")) + + with umask(~0o1777): + os.mkdir(joinpath(dst, "shm"), mode=0o1777) + with umask(~0o755): + os.mkdir(joinpath(dst, "pts")) + + mount("devpts", joinpath(dst, "pts"), "devpts", 0, "newinstance,ptmxmode=0666,mode=620") + + os.symlink("pts/ptmx", joinpath(dst, "ptmx")) + + if self.ttyname: + os.close(os.open(joinpath(dst, "console"), os.O_CREAT | os.O_CLOEXEC)) + mount(joinpath(oldroot, self.ttyname), joinpath(dst, "console"), "", MS_BIND, "") + + +class TmpfsOperation(FSOperation): + def execute(self, oldroot: str, newroot: str) -> None: + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(dst, exist_ok=True) + + options = "" if any(dst.endswith(suffix) for suffix in ("/tmp", "/var/tmp")) else "mode=0755" + mount("tmpfs", dst, "tmpfs", 0, options) + + +class DirOperation(FSOperation): + def execute(self, oldroot: str, newroot: str) -> None: + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(os.path.dirname(dst), exist_ok=True) + + mode = 0o1777 if any(dst.endswith(suffix) for suffix in ("/tmp", "/var/tmp")) else 0o755 + if not os.path.exists(dst): + with umask(~mode): + os.mkdir(dst, mode=mode) + + +class SymlinkOperation(FSOperation): + def __init__(self, src: str, dst: str) -> None: + self.src = src + super().__init__(dst) + + def execute(self, oldroot: str, newroot: str) -> None: + dst = joinpath(newroot, self.dst) + try: + os.symlink(self.src, dst) + except FileExistsError: + if os.readlink(dst) == self.src: + return + + raise + + +class 
WriteOperation(FSOperation): + def __init__(self, data: str, dst: str) -> None: + self.data = data + super().__init__(dst) + + def execute(self, oldroot: str, newroot: str) -> None: + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(os.path.dirname(dst), exist_ok=True) + with open(dst, "wb") as f: + f.write(self.data.encode()) + + +class OverlayOperation(FSOperation): + def __init__(self, lowerdirs: tuple[str, ...], upperdir: str, workdir: str, dst: str) -> None: + self.lowerdirs = lowerdirs + self.upperdir = upperdir + self.workdir = workdir + super().__init__(dst) + + # This supports being used as a context manager so we can reuse the logic for mount_overlay() in mounts.py. + def __enter__(self) -> None: + self.execute("/", "/") + + def __exit__(self, *args: object, **kwargs: object) -> None: + umount2(self.dst) + + def execute(self, oldroot: str, newroot: str) -> None: + lowerdirs = tuple(chase(oldroot, p) for p in self.lowerdirs) + upperdir = chase(oldroot, self.upperdir) if self.upperdir else None + workdir = chase(oldroot, self.workdir) if self.workdir else None + dst = chase(newroot, self.dst) + with umask(~0o755): + os.makedirs(dst, exist_ok=True) + + options = [ + f"lowerdir={':'.join(lowerdirs)}", + "userxattr", + # Disable the inodes index and metacopy (only copy metadata upwards if possible) + # options. If these are enabled (e.g., if the kernel enables them by default), + # the mount will fail if the upper directory has been earlier used with a different + # lower directory, such as with a build overlay that was generated on top of a + # different temporary root. + # See https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html#sharing-and-copying-layers + # and https://github.com/systemd/mkosi/issues/1841. 
+ "index=off", + "metacopy=off", + ] + + if upperdir: + options += [f"upperdir={upperdir}"] + if workdir: + options += [f"workdir={workdir}"] + + mount("overlayfs", dst, "overlay", 0, ",".join(options)) + + +ANSI_HIGHLIGHT = "\x1B[0;1;39m" if os.isatty(2) else "" +ANSI_NORMAL = "\x1B[0m" if os.isatty(2) else "" + +HELP = f"""\ +mkosi-sandbox [OPTIONS...] COMMAND [ARGUMENTS...] + +{ANSI_HIGHLIGHT}Run the specified command in a custom sandbox.{ANSI_NORMAL} + + -h --help Show this help + --version Show package version + --tmpfs DST Mount a new tmpfs on DST + --dev DST Mount dev on DST + --proc DST Mount procfs on DST + --dir DST Create a new directory at DST + --bind SRC DST Bind mount the host path SRC to DST + --bind-try SRC DST Bind mount the host path SRC to DST if it exists + --ro-bind SRC DST Bind mount the host path SRC to DST read-only + --ro-bind-try SRC DST Bind mount the host path SRC to DST read-only if it exists + --symlink SRC DST Create a symlink at DST pointing to SRC + --write DATA DST Write DATA to DST + --overlay-lowerdir DIR Add a lower directory for the next overlayfs mount + --overlay-upperdir DIR Set the upper directory for the next overlayfs mount + --overlay-workdir DIR Set the working directory for the next overlayfs mount + --overlay DST Mount an overlay filesystem at DST + --unsetenv NAME Unset the environment variable with name NAME + --setenv NAME VALUE Set the environment variable with name NAME to VALUE + --chdir DIR Change the working directory in the sandbox to DIR + --same-dir Change the working directory in the sandbox to $PWD + --become-root Map the current user/group to root:root in the sandbox + --suppress-chown Make chown() syscalls in the sandbox a noop + --unshare-net Unshare the network namespace if possible + --unshare-ipc Unshare the IPC namespace if possible + +See the mkosi-sandbox(1) man page for details.\ +""" + +def main() -> None: + # We don't use argparse as it takes +- 10ms to import and since this is purely for 
internal use, it's not necessary + # to have good UX for this CLI interface so it's trivial to write ourselves. + argv = list(reversed(sys.argv[1:])) + fsops: list[FSOperation] = [] + setenv = [] + unsetenv = [] + lowerdirs = [] + upperdir = "" + workdir = "" + chdir = None + become_root = suppress_chown = unshare_net = unshare_ipc = False + + while argv: + arg = argv.pop() + + if arg == "--": + break + + if arg in ("-h", "--help"): + print(HELP, file=sys.stderr) + sys.exit(0) + elif arg == "--version": + print(__version__, file=sys.stderr) + sys.exit(0) + if arg == "--tmpfs": + fsops.append(TmpfsOperation(argv.pop())) + elif arg == "--dev": + fsops.append(DevOperation(argv.pop())) + elif arg == "--proc": + fsops.append(ProcOperation(argv.pop())) + elif arg == "--dir": + fsops.append(DirOperation(argv.pop())) + elif arg in ("--bind", "--ro-bind", "--bind-try", "--ro-bind-try"): + readonly = arg.startswith("--ro") + required = not arg.endswith("-try") + fsops.append(BindOperation(argv.pop(), argv.pop(), readonly=readonly, required=required)) + elif arg == "--symlink": + fsops.append(SymlinkOperation(argv.pop(), argv.pop())) + elif arg == "--write": + fsops.append(WriteOperation(argv.pop(), argv.pop())) + elif arg == "--overlay-lowerdir": + lowerdirs.append(argv.pop()) + elif arg == "--overlay-upperdir": + upperdir = argv.pop() + elif arg == "--overlay-workdir": + workdir = argv.pop() + elif arg == "--overlay": + fsops.append(OverlayOperation(tuple(reversed(lowerdirs)), upperdir, workdir, argv.pop())) + upperdir = "" + workdir = "" + lowerdirs = [] + elif arg == "--unsetenv": + unsetenv.append(argv.pop()) + elif arg == "--setenv": + setenv.append((argv.pop(), argv.pop())) + elif arg == "--chdir": + chdir = argv.pop() + elif arg == "--same-dir": + chdir = os.getcwd() + elif arg == "--become-root": + become_root = True + elif arg == "--suppress-chown": + suppress_chown = True + elif arg == "--unshare-net": + unshare_net = True + elif arg == "--unshare-ipc": + 
unshare_ipc = True + elif arg.startswith("-"): + raise RuntimeError(f"Unrecognized option {arg}") + else: + argv.append(arg) + break + + argv.reverse() + + argv = argv or ["bash"] + + # Make sure all destination paths are absolute. + for fsop in fsops: + if fsop.dst[0] != "/": + raise RuntimeError(f"{fsop.dst} is not an absolute path") + + fsops = FSOperation.optimize(fsops) + + for k, v in setenv: + os.environ[k] = v + + for e in unsetenv: + if e in os.environ: + del os.environ[e] + + # If $LISTEN_FDS is in the environment, let's automatically set $LISTEN_PID to the correct pid as well. + if "LISTEN_FDS" in os.environ: + os.environ["LISTEN_PID"] = str(os.getpid()) + + namespaces = CLONE_NEWNS + if unshare_net and have_effective_cap(CAP_NET_ADMIN): + namespaces |= CLONE_NEWNET + if unshare_ipc: + namespaces |= CLONE_NEWIPC + + userns = acquire_privileges(become_root=become_root) + + # If we're root in a user namespace with a single user, we're still not going to be able to chown() stuff, so check + # for that and apply the seccomp filter as well in that case. + if suppress_chown and (userns or userns_has_single_user()): + seccomp_suppress_chown() + + unshare(namespaces) + + # If we unshared the user namespace the mount propagation of root is changed to slave automatically. + if not userns: + mount("", "/", "", MS_SLAVE | MS_REC, "") + + # We need a workspace to set up the sandbox; the easiest way to do this is in a tmpfs, since it's automatically cleaned + # up. We need a mountpoint to put the workspace on and it can't be root, so let's use /tmp which is almost + # guaranteed to exist. + mount("tmpfs", "/tmp", "tmpfs", 0, "") + + os.chdir("/tmp") + + with umask(~0o755): + os.mkdir("newroot") # This is where we set up the sandbox rootfs + os.mkdir("oldroot") # This is the old rootfs which is used as the source for mounts in the new rootfs. + + # Make sure that newroot is a mountpoint.
+ mount("newroot", "newroot", "", MS_BIND | MS_REC, "") + + # Make the workspace in /tmp / and put the old rootfs in oldroot. + if libc.pivot_root(b".", b"oldroot") < 0: + # pivot_root() can fail in the initramfs since / isn't a mountpoint there, so let's fall back to MS_MOVE if + # that's the case. + + # First we move the old rootfs to oldroot. + mount("/", "oldroot", "", MS_BIND | MS_REC, "") + + # Then we move the workspace (/tmp) to /. + mount(".", "/", "", MS_MOVE, "") + + # chroot and chdir to fully make the workspace the new root. + os.chroot(".") + os.chdir(".") + + # When we use MS_MOVE we have to unmount oldroot/tmp manually to reveal the original /tmp again as it might + # contain stuff that we want to mount into the sandbox. + umount2("oldroot/tmp", MNT_DETACH) + + for fsop in fsops: + fsop.execute("oldroot", "newroot") + + # Now that we're done setting up the sandbox let's pivot root into newroot to make it the new root. We use the + # pivot_root(".", ".") process described in the pivot_root() man page. + + os.chdir("newroot") + + # We're guaranteed to have / be a mount when we get here, so pivot_root() won't fail anymore, even if we're in the + # initramfs. + if libc.pivot_root(b".", b".") < 0: + oserror() + + # As documented in the pivot_root() man page, this will unmount the old rootfs. + umount2(".", MNT_DETACH) + + # Avoid surprises by making sure the sandbox's mount propagation is shared. This doesn't actually mean mounts get + # propagated into the host. Instead, a new mount propagation peer group is set up. + mount("", ".", "", MS_SHARED | MS_REC, "") + + if chdir: + os.chdir(chdir) + + try: + os.execvp(argv[0], argv) + except OSError as e: + # Let's return a recognizable error when the binary we're going to execute is not found. We use 127 as that's + # the exit code used by shells when a program to execute is not found. 
+ if e.errno == ENOENT: + sys.exit(127) + + raise + + +if __name__ == "__main__": + main() diff --git a/mkosi/sandbox/__main__.py b/mkosi/sandbox/__main__.py new file mode 100644 index 000000000..6247975c1 --- /dev/null +++ b/mkosi/sandbox/__main__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +from mkosi.sandbox import main + +if __name__ == "__main__": + main() diff --git a/mkosi/sandbox/resources/mkosi-sandbox.md b/mkosi/sandbox/resources/mkosi-sandbox.md new file mode 100644 index 000000000..5fcf3f2c5 --- /dev/null +++ b/mkosi/sandbox/resources/mkosi-sandbox.md @@ -0,0 +1,144 @@ +% mkosi-sandbox(1) +% +% + +# NAME + +mkosi-sandbox — Run commands in a custom sandbox + +# SYNOPSIS + +`mkosi-sandbox [options…] command [arguments]` + +# DESCRIPTION + +`mkosi-sandbox` runs the given command in a custom sandbox. The sandbox is configured +by specifying command line options that configure individual parts of the sandbox. + +If no command is specified, `mkosi-sandbox` will start `bash` in the sandbox. + +Note that this sandbox is not designed to be a security boundary. Its intended purpose +is to allow running commands in an isolated environment so they are not affected by the +host system. + +# OPTIONS + +`--tmpfs DST` +: Mounts a new tmpfs at `DST` in the sandbox. + +`--dev DST` +: Sets up a private `/dev` at `DST` in the sandbox. This private `/dev` will only + contain the basic device nodes required for a functioning sandbox (e.g. `/dev/null`) + and no actual devices. + +`--proc DST` +: Mounts `/proc` from the host at `DST` in the sandbox. + +`--dir DST` +: Creates a directory and all missing parent directories at `DST` in the sandbox. + All directories are created with mode 755 unless the path ends with `/tmp` or + `/var/tmp` in which case it is created with mode 1777. + +`--bind SRC DST` +: The source path `SRC` is recursively bind mounted to `DST` in the sandbox. The + mountpoint is created in the sandbox if it does not yet exist. 
Any missing parent + directories in the sandbox are created as well. + +`--bind-try SRC DST` +: Like `--bind`, but doesn't fail if the source path doesn't exist. + +`--ro-bind SRC DST` +: Like `--bind`, but does a recursive readonly bind mount. + +`--ro-bind-try SRC DST` +: Like `--bind-try`, but does a recursive readonly bind mount. + +`--symlink SRC DST` +: Creates a symlink at `DST` in the sandbox pointing to `SRC`. + +`--write DATA DST` +: Writes the string from `DATA` to `DST` in the sandbox. + +`--overlay-lowerdir DIR` +: Adds `DIR` from the host as a new lower directory for the next overlayfs mount. + +`--overlay-upperdir DIR` +: Sets the upper directory for the next overlayfs mount to `DIR` from the host. + +`--overlay-workdir DIR` +: Sets the working directory for the next overlayfs mount to `DIR` from the host. + +`--overlay DST` +: Mounts a new overlay filesystem at `DST` in the sandbox. The lower directories, upper + directory and working directory are specified using the `--overlay-lowerdir`, + `--overlay-upperdir` and `--overlay-workdir` options respectively. After each + `--overlay` option is parsed, the other overlay options are reset. + +`--unsetenv NAME` +: Unsets the `NAME` environment variable in the sandbox. + +`--setenv NAME VALUE` +: Sets the `NAME` environment variable to `VALUE` in the sandbox. + +`--chdir DIR` +: Changes the working directory to `DIR` in the sandbox. + +`--same-dir` +: Changes the working directory in the sandbox to the current working directory that + `mkosi-sandbox` is invoked in on the host. + +`--become-root` +: Maps the current user to the root user in the sandbox. If this option is not specified, + the current user is mapped to itself in the sandbox. Regardless of whether this option + is specified or not, the current user will have a full set of ambient capabilities in + the sandbox.
This includes `CAP_SYS_ADMIN` which means that the invoked process in the + sandbox will be able to do bind mounts and other operations. + + If `mkosi-sandbox` is invoked as the root user, this option won't do anything. + +`--suppress-chown` +: Specifying this option causes all calls to `chown()` or similar system calls to become a + noop in the sandbox. This is primarily useful when invoking package managers in the + sandbox which might try to `chown()` files to different users or groups which would fail + unless `mkosi-sandbox` is invoked by a privileged user. + +`--unshare-net` +: Specifying this option makes `mkosi-sandbox` unshare a network namespace if possible. + +`--unshare-ipc` +: Specifying this option makes `mkosi-sandbox` unshare an IPC namespace if possible. + +`--version` +: Show package version. + +`--help`, `-h` +: Show brief usage information. + +# EXAMPLES + +Start `bash` in the current working directory in its own network namespace as the current user. + +```sh +mkosi-sandbox --bind / / --same-dir --unshare-net +``` + +Run `id` as the root user in a sandbox with only `/usr` from the host plus the necessary symlinks +to be able to run commands. 
+ +```sh +mkosi-sandbox \ + --ro-bind /usr /usr \ + --symlink usr/bin /bin \ + --symlink usr/bin /bin \ + --symlink usr/lib /lib \ + --symlink usr/lib64 /lib64 \ + --symlink usr/sbin /sbin \ + --dev /dev \ + --proc /proc \ + --tmpfs /tmp \ + --become-root \ + id +``` + +# SEE ALSO +`mkosi(1)` diff --git a/mkosi/tree.py b/mkosi/tree.py index f35bca1a1..471a42c64 100644 --- a/mkosi/tree.py +++ b/mkosi/tree.py @@ -11,22 +11,15 @@ from mkosi.config import ConfigFeature from mkosi.log import ARG_DEBUG, die -from mkosi.run import run -from mkosi.sandbox import Mount, SandboxProtocol, nosandbox +from mkosi.run import SandboxProtocol, nosandbox, run +from mkosi.sandbox import BTRFS_SUPER_MAGIC, statfs from mkosi.types import PathString +from mkosi.util import flatten from mkosi.versioncomp import GenericVersion -def statfs(path: Path, *, sandbox: SandboxProtocol = nosandbox) -> str: - return run( - ["stat", "--file-system", "--format", "%T", path], - stdout=subprocess.PIPE, - sandbox=sandbox(binary="stat", mounts=[Mount(path, path, ro=True)]), - ).stdout.strip() - - -def is_subvolume(path: Path, *, sandbox: SandboxProtocol = nosandbox) -> bool: - return path.is_dir() and path.stat().st_ino == 256 and statfs(path, sandbox=sandbox) == "btrfs" +def is_subvolume(path: Path) -> bool: + return path.is_dir() and path.stat().st_ino == 256 and statfs(str(path)) == BTRFS_SUPER_MAGIC def cp_version(*, sandbox: SandboxProtocol = nosandbox) -> GenericVersion: @@ -45,7 +38,7 @@ def make_tree( use_subvolumes: ConfigFeature = ConfigFeature.disabled, sandbox: SandboxProtocol = nosandbox, ) -> Path: - if statfs(path.parent, sandbox=sandbox) != "btrfs": + if statfs(str(path.parent)) != BTRFS_SUPER_MAGIC: if use_subvolumes == ConfigFeature.enabled: die(f"Subvolumes requested but {path} is not located on a btrfs filesystem") @@ -54,7 +47,7 @@ def make_tree( if use_subvolumes != ConfigFeature.disabled: result = run(["btrfs", "subvolume", "create", path], - sandbox=sandbox(binary="btrfs", 
mounts=[Mount(path.parent, path.parent)]), + sandbox=sandbox(binary="btrfs", options=["--bind", path.parent, path.parent]), check=use_subvolumes == ConfigFeature.enabled).returncode else: result = 1 @@ -101,7 +94,7 @@ def copy_tree( if cp_version(sandbox=sandbox) >= "9.5": copy += ["--keep-directory-symlink"] - mounts = [Mount(src, src, ro=True), Mount(dst.parent, dst.parent)] + options: list[PathString] = ["--ro-bind", src, src, "--bind", dst.parent, dst.parent] # If the source and destination are both directories, we want to merge the source directory with the # destination directory. If the source if a file and the destination is a directory, we want to copy @@ -113,7 +106,7 @@ def copy_tree( if ( use_subvolumes == ConfigFeature.disabled or not preserve or - not is_subvolume(src, sandbox=sandbox) or + not is_subvolume(src) or (dst.exists() and any(dst.iterdir())) ): with ( @@ -121,7 +114,7 @@ def copy_tree( if not preserve else contextlib.nullcontext() ): - run(copy, sandbox=sandbox(binary="cp", mounts=mounts)) + run(copy, sandbox=sandbox(binary="cp", options=options)) return dst # btrfs can't snapshot to an existing directory so make sure the destination does not exist. 
@@ -131,7 +124,7 @@ def copy_tree( result = run( ["btrfs", "subvolume", "snapshot", src, dst], check=use_subvolumes == ConfigFeature.enabled, - sandbox=sandbox(binary="btrfs", mounts=mounts), + sandbox=sandbox(binary="btrfs", options=options), ).returncode if result != 0: @@ -140,7 +133,7 @@ def copy_tree( if not preserve else contextlib.nullcontext() ): - run(copy, sandbox=sandbox(binary="cp", mounts=mounts)) + run(copy, sandbox=sandbox(binary="cp", options=options)) return dst @@ -149,19 +142,19 @@ def rmtree(*paths: Path, sandbox: SandboxProtocol = nosandbox) -> None: if not paths: return - if subvolumes := sorted({p for p in paths if p.exists() and is_subvolume(p, sandbox=sandbox)}): + if subvolumes := sorted({p for p in paths if p.exists() and is_subvolume(p)}): # Silence and ignore failures since when not running as root, this will fail with a permission error unless the # btrfs filesystem is mounted with user_subvol_rm_allowed. run(["btrfs", "subvolume", "delete", *subvolumes], check=False, - sandbox=sandbox(binary="btrfs", mounts=[Mount(p.parent, p.parent) for p in subvolumes]), + sandbox=sandbox(binary="btrfs", options=flatten(("--bind", p.parent, p.parent) for p in subvolumes)), stdout=subprocess.DEVNULL if not ARG_DEBUG.get() else None, stderr=subprocess.DEVNULL if not ARG_DEBUG.get() else None) filtered = sorted({p for p in paths if p.exists() or p.is_symlink()}) if filtered: run(["rm", "-rf", "--", *filtered], - sandbox=sandbox(binary="rm", mounts=[Mount(p.parent, p.parent) for p in filtered])) + sandbox=sandbox(binary="rm", options=flatten(("--bind", p.parent, p.parent) for p in filtered))) def move_tree( diff --git a/mkosi/user.py b/mkosi/user.py index 8a37f675a..b70e01ba3 100644 --- a/mkosi/user.py +++ b/mkosi/user.py @@ -1,88 +1,58 @@ # SPDX-License-Identifier: LGPL-2.1-or-later -import ctypes -import ctypes.util import fcntl import functools -import logging import os import pwd import tempfile -from collections.abc import Sequence from pathlib 
import Path from mkosi.log import die -from mkosi.run import run, spawn +from mkosi.run import spawn +from mkosi.sandbox import CLONE_NEWUSER, unshare from mkosi.util import flock, parents_below SUBRANGE = 65536 class INVOKING_USER: - uid = int(os.getenv("SUDO_UID") or os.getenv("PKEXEC_UID") or os.getuid()) - gid = int(os.getenv("SUDO_GID") or os.getgid()) - invoked_as_root = os.getuid() == 0 - - @classmethod - def init(cls) -> None: - name = cls.name() - home = cls.home() - extra_groups = cls.extra_groups() - logging.debug( - f"Running as user '{name}' ({cls.uid}:{cls.gid}) with home {home} " - f"and extra groups {extra_groups}." - ) - - @classmethod - def is_running_user(cls) -> bool: - return cls.uid == os.getuid() - @classmethod @functools.lru_cache(maxsize=1) def name(cls) -> str: try: - return pwd.getpwuid(cls.uid).pw_name + return pwd.getpwuid(os.getuid()).pw_name except KeyError: - if cls.uid == 0: + if os.getuid() == 0: return "root" if not (user := os.getenv("USER")): - die(f"Could not find user name for UID {cls.uid}") + die(f"Could not find user name for UID {os.getuid()}") return user @classmethod @functools.lru_cache(maxsize=1) def home(cls) -> Path: - if cls.invoked_as_root and Path.cwd().is_relative_to("/home") and len(Path.cwd().parents) > 2: + if os.getuid() == 0 and Path.cwd().is_relative_to("/home") and len(Path.cwd().parents) > 2: return list(Path.cwd().parents)[-3] try: - return Path(pwd.getpwuid(cls.uid).pw_dir or "/") + return Path(pwd.getpwuid(os.getuid()).pw_dir or "/") except KeyError: if not (home := os.getenv("HOME")): - die(f"Could not find home directory for UID {cls.uid}") + die(f"Could not find home directory for UID {os.getuid()}") return Path(home) @classmethod - @functools.lru_cache(maxsize=1) - def extra_groups(cls) -> Sequence[int]: - return os.getgrouplist(cls.name(), cls.gid) - - @classmethod - def is_regular_user(cls) -> bool: - return cls.uid >= 1000 + def is_regular_user(cls, uid: int) -> bool: + return uid >= 1000 
@classmethod def cache_dir(cls) -> Path: if (env := os.getenv("XDG_CACHE_HOME")) or (env := os.getenv("CACHE_DIRECTORY")): cache = Path(env) - elif ( - cls.is_regular_user() and - INVOKING_USER.home() != Path("/") and - (Path.cwd().is_relative_to(INVOKING_USER.home()) or not cls.invoked_as_root) - ): - cache = INVOKING_USER.home() / ".cache" + elif cls.is_regular_user(os.getuid()) and cls.home() != Path("/"): + cache = cls.home() / ".cache" else: cache = Path("/var/cache") @@ -92,31 +62,24 @@ def cache_dir(cls) -> Path: def runtime_dir(cls) -> Path: if (env := os.getenv("XDG_RUNTIME_DIR")) or (env := os.getenv("RUNTIME_DIRECTORY")): d = Path(env) - elif cls.is_regular_user(): - d = Path("/run/user") / str(cls.uid) + elif cls.is_regular_user(os.getuid()): + d = Path(f"/run/user/{os.getuid()}") else: d = Path("/run") return d / "mkosi" - @classmethod - def rchown(cls, path: Path) -> None: - if cls.is_regular_user() and any(p.stat().st_uid == cls.uid for p in path.parents) and path.exists(): - run(["chown", "--recursive", f"{INVOKING_USER.uid}:{INVOKING_USER.gid}", path]) - @classmethod def chown(cls, path: Path) -> None: - # If we created a file/directory in a parent directory owned by the invoking user, make sure the path and any + # If we created a file/directory in a parent directory owned by a regular user, make sure the path and any # parent directories are owned by the invoking user as well. 
- def is_valid_dir(path: Path) -> bool: - return path.stat().st_uid == cls.uid or path in (Path("/tmp"), Path("/var/tmp")) - - if cls.is_regular_user() and (q := next((parent for parent in path.parents if is_valid_dir(parent)), None)): - os.chown(path, INVOKING_USER.uid, INVOKING_USER.gid) + if (q := next((parent for parent in path.parents if cls.is_regular_user(parent.stat().st_uid)), None)): + st = q.stat() + os.chown(path, st.st_uid, st.st_gid) for parent in parents_below(path, q): - os.chown(parent, INVOKING_USER.uid, INVOKING_USER.gid) + os.chown(parent, st.st_uid, st.st_gid) def read_subrange(path: Path) -> int: @@ -143,30 +106,12 @@ def read_subrange(path: Path) -> int: return int(start) -CLONE_NEWNS = 0x00020000 -CLONE_NEWUSER = 0x10000000 - - -def unshare(flags: int) -> None: - libc_name = ctypes.util.find_library("c") - if libc_name is None: - die("Could not find libc") - libc = ctypes.CDLL(libc_name, use_errno=True) - - if libc.unshare(ctypes.c_int(flags)) != 0: - e = ctypes.get_errno() - raise OSError(e, os.strerror(e)) - - -def become_root() -> None: +def become_root_in_subuid_range() -> None: """ Set up a new user namespace mapping using /etc/subuid and /etc/subgid. - The current user will be mapped to root and 65436 will be mapped to the UID/GID of the invoking user. - The other IDs will be mapped through. - - The function modifies the uid, gid of the INVOKING_USER object to the uid, gid of the invoking user in the user - namespace. + The current process becomes the root user in the new user namespace and the current user and group will be mapped + to 65436. The other IDs will be mapped through. 
""" if os.getuid() == 0: return @@ -179,11 +124,11 @@ def become_root() -> None: with tempfile.NamedTemporaryFile(prefix="mkosi-uidmap-lock-") as lockfile: lock = Path(lockfile.name) - # We map the private UID range configured in /etc/subuid and /etc/subgid into the container using + # We map the private UID range configured in /etc/subuid and /etc/subgid into the user namespace using # newuidmap and newgidmap. On top of that, we also make sure to map in the user running mkosi so that - # we can run still chown stuff to that user or run stuff as that user which will make sure any - # generated files are owned by that user. We don't map to the last user in the range as the last user - # is sometimes used in tests as a default value and mapping to that user might break those tests. + # we can access files and directories from the current user from within the user namespace. We don't map to the + # last user in the range as the last user is sometimes used in tests as a default value and mapping to that + # user might break those tests. newuidmap = [ "flock", "--exclusive", "--close", lock, "newuidmap", pid, 0, subuid, SUBRANGE - 100, @@ -207,11 +152,7 @@ def become_root() -> None: # execute using flock so they don't execute before they can get a lock on the same temporary file, then we # unshare the user namespace and finally we unlock the temporary file, which allows the newuidmap and newgidmap # processes to execute. we then wait for the processes to finish before continuing. 
- with ( - flock(lock) as fd, - spawn(newuidmap, innerpid=False) as (uidmap, _), - spawn(newgidmap, innerpid=False) as (gidmap, _) - ): + with flock(lock) as fd, spawn(newuidmap) as uidmap, spawn(newgidmap) as gidmap: unshare(CLONE_NEWUSER) fcntl.flock(fd, fcntl.LOCK_UN) uidmap.wait() @@ -223,11 +164,8 @@ def become_root() -> None: os.setresgid(0, 0, 0) os.setgroups([0]) - INVOKING_USER.uid = SUBRANGE - 100 - INVOKING_USER.gid = SUBRANGE - 100 - -def become_root_cmd() -> list[str]: +def become_root_in_subuid_range_cmd() -> list[str]: if os.getuid() == 0: return [] diff --git a/mkosi/util.py b/mkosi/util.py index da9b1140c..9d16f9d22 100644 --- a/mkosi/util.py +++ b/mkosi/util.py @@ -187,15 +187,6 @@ def choices(cls) -> list[str]: return [*cls.values(), ""] -@contextlib.contextmanager -def umask(mask: int) -> Iterator[None]: - old = os.umask(mask) - try: - yield - finally: - os.umask(old) - - def parents_below(path: Path, below: Path) -> list[Path]: parents = list(path.parents) return parents[:parents.index(below)] diff --git a/mkosi/vmspawn.py b/mkosi/vmspawn.py index 61b9295c9..26de1f6aa 100644 --- a/mkosi/vmspawn.py +++ b/mkosi/vmspawn.py @@ -14,7 +14,6 @@ yes_no, ) from mkosi.log import die -from mkosi.mounts import finalize_source_mounts from mkosi.qemu import ( apply_runtime_size, copy_ephemeral, @@ -74,9 +73,9 @@ def run_vmspawn(args: Args, config: Config) -> None: apply_runtime_size(config, fname) if config.runtime_build_sources: - with finalize_source_mounts(config, ephemeral=False) as mounts: - for mount in mounts: - cmdline += ["--bind", f"{mount.src}:{mount.dst}"] + for t in config.build_sources: + src, dst = t.with_prefix("/work/src") + cmdline += ["--bind", f"{src}:{dst}"] if config.build_dir: cmdline += ["--bind", f"{config.build_dir}:/work/build"] diff --git a/pyproject.toml b/pyproject.toml index 035519ae6..780148be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ bootable = [ [project.scripts] mkosi = "mkosi.__main__:main" 
mkosi-initrd = "mkosi.initrd.__main__:main" +mkosi-sandbox = "mkosi.sandbox.__main__:main" [tool.setuptools] packages = [ @@ -30,11 +31,14 @@ packages = [ "mkosi.initrd.resources", "mkosi.installer", "mkosi.resources", + "mkosi.sandbox", + "mkosi.sandbox.resources", ] [tool.setuptools.package-data] "mkosi.resources" = ["repart/**/*", "mkosi.md", "mkosi.1", "mkosi-initrd/**/*", "mkosi-tools/**/*"] "mkosi.initrd.resources" = ["mkosi-initrd.md", "mkosi-initrd.1"] +"mkosi.sandbox.resources" = ["mkosi-sandbox.md", "mkosi-sandbox.1"] [tool.isort] profile = "black" diff --git a/tests/__init__.py b/tests/__init__.py index 3900198a5..1e9eae4c3 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -16,7 +16,6 @@ from mkosi.distributions import Distribution from mkosi.run import run from mkosi.types import _FILE, CompletedProcess, PathString -from mkosi.user import INVOKING_USER @dataclasses.dataclass(frozen=True) @@ -32,6 +31,9 @@ class Image: def __init__(self, config: ImageConfig, options: Sequence[PathString] = []) -> None: self.options = options self.config = config + st = Path.cwd().stat() + self.uid = st.st_uid + self.gid = st.st_gid def __enter__(self) -> "Image": self.output_dir = Path(os.getenv("TMPDIR", "/var/tmp")) / uuid.uuid4().hex[:16] @@ -44,7 +46,7 @@ def __exit__( value: Optional[BaseException], traceback: Optional[TracebackType], ) -> None: - self.mkosi("clean", user=INVOKING_USER.uid, group=INVOKING_USER.gid) + self.mkosi("clean", user=self.uid, group=self.gid) def mkosi( self, @@ -99,8 +101,8 @@ def build(self, options: Sequence[str] = (), args: Sequence[str] = ()) -> Comple [*options, "--debug", "--force", *(["--debug-shell"] if self.config.debug_shell else [])], args, stdin=sys.stdin if sys.stdin.isatty() else None, - user=INVOKING_USER.uid, - group=INVOKING_USER.gid, + user=self.uid, + group=self.gid, ) def boot(self, options: Sequence[str] = (), args: Sequence[str] = ()) -> CompletedProcess: @@ -122,8 +124,8 @@ def qemu(self, options: 
Sequence[str] = (), args: Sequence[str] = ()) -> Complet [*options, "--debug"], args, stdin=sys.stdin if sys.stdin.isatty() else None, - user=INVOKING_USER.uid, - group=INVOKING_USER.gid, + user=self.uid, + group=self.gid, check=False, ) @@ -151,10 +153,10 @@ def vmspawn(self, options: Sequence[str] = (), args: Sequence[str] = ()) -> Comp return result def summary(self, options: Sequence[str] = ()) -> CompletedProcess: - return self.mkosi("summary", options, user=INVOKING_USER.uid, group=INVOKING_USER.gid) + return self.mkosi("summary", options, user=self.uid, group=self.gid) def genkey(self) -> CompletedProcess: - return self.mkosi("genkey", ["--force"], user=INVOKING_USER.uid, group=INVOKING_USER.gid) + return self.mkosi("genkey", ["--force"], user=self.uid, group=self.gid) @pytest.fixture(scope="session", autouse=True) diff --git a/tests/test_initrd.py b/tests/test_initrd.py index 59833fba2..ae91a6fd7 100644 --- a/tests/test_initrd.py +++ b/tests/test_initrd.py @@ -11,16 +11,31 @@ import pytest from mkosi.distributions import Distribution -from mkosi.mounts import mount from mkosi.run import run +from mkosi.sandbox import umask from mkosi.tree import copy_tree -from mkosi.user import INVOKING_USER +from mkosi.types import PathString from . 
import Image, ImageConfig pytestmark = pytest.mark.integration +@contextlib.contextmanager +def mount(what: PathString, where: PathString) -> Iterator[Path]: + where = Path(where) + + if not where.exists(): + with umask(~0o755): + where.mkdir(parents=True) + + run(["mount", "--no-mtab", what, where]) + try: + yield where + finally: + run(["umount", "--no-mtab", where]) + + @pytest.fixture(scope="module") def passphrase() -> Iterator[Path]: # We can't use tmp_path fixture because pytest creates it in a nested directory we can't access using our @@ -29,7 +44,8 @@ def passphrase() -> Iterator[Path]: with tempfile.NamedTemporaryFile(prefix="mkosi.passphrase", mode="w") as passphrase: passphrase.write("mkosi") passphrase.flush() - os.fchown(passphrase.fileno(), INVOKING_USER.uid, INVOKING_USER.gid) + st = Path.cwd().stat() + os.fchown(passphrase.fileno(), st.st_uid, st.st_gid) os.fchmod(passphrase.fileno(), 0o600) yield Path(passphrase.name) @@ -83,7 +99,8 @@ def test_initrd_lvm(config: ImageConfig) -> None: def test_initrd_luks(config: ImageConfig, passphrase: Path) -> None: with tempfile.TemporaryDirectory() as repartd: - os.chown(repartd, INVOKING_USER.uid, INVOKING_USER.gid) + st = Path.cwd().stat() + os.chown(repartd, st.st_uid, st.st_gid) (Path(repartd) / "00-esp.conf").write_text( textwrap.dedent( diff --git a/tests/test_json.py b/tests/test_json.py index bc8e7a32d..c8561a872 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -86,7 +86,6 @@ def test_config() -> None: dump = textwrap.dedent( """\ { - "Acl": true, "Architecture": "ia64", "Autologin": false, "BaseTrees": [ @@ -364,7 +363,6 @@ def test_config() -> None: ) args = Config( - acl=True, architecture=Architecture.ia64, autologin=False, base_trees=[Path("/hello/world")], diff --git a/tools/do-a-release.sh b/tools/do-a-release.sh index 67a04d2d5..d1f52ec83 100755 --- a/tools/do-a-release.sh +++ b/tools/do-a-release.sh @@ -14,7 +14,7 @@ if ! 
git diff-index --quiet HEAD; then fi sed -r -i "s/^version = \".*\"$/version = \"$VERSION\"/" pyproject.toml -sed -r -i "s/^__version__ = \".*\"$/__version__ = \"$VERSION\"/" mkosi/config.py +sed -r -i "s/^__version__ = \".*\"$/__version__ = \"$VERSION\"/" mkosi/sandbox.py git add -p pyproject.toml mkosi @@ -25,7 +25,7 @@ git tag -s "v$VERSION" -m "mkosi $VERSION" VERSION_MAJOR=${VERSION%%.*} VERSION="$((VERSION_MAJOR + 1))~devel" -sed -r -i "s/^__version__ = \".*\"$/__version__ = \"$VERSION\"/" mkosi/config.py +sed -r -i "s/^__version__ = \".*\"$/__version__ = \"$VERSION\"/" mkosi/sandbox.py git add -p mkosi diff --git a/tools/make-man-page.sh b/tools/make-man-page.sh index f7b85f382..17c5cd706 100755 --- a/tools/make-man-page.sh +++ b/tools/make-man-page.sh @@ -4,3 +4,4 @@ set -ex pandoc -t man -s -o mkosi/resources/mkosi.1 mkosi/resources/mkosi.md pandoc -t man -s -o mkosi/initrd/resources/mkosi-initrd.1 mkosi/initrd/resources/mkosi-initrd.md +pandoc -t man -s -o mkosi/sandbox/resources/mkosi-sandbox.1 mkosi/sandbox/resources/mkosi-sandbox.md