From d8b2686603b6fcb7754b5852ad6824fb078c56a7 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 9 Jan 2024 10:57:29 -0500 Subject: [PATCH 01/22] Linux: Defer loading the object set in zfs_setattr() We need to wait until after having done a zfs_enter() to load some fields from the zfsvfs structure. Otherwise a use-after-free is possible in the face of a concurrent rollback. Other functions in this file are careful to avoid this bug, I believe this is the only instance. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Mark Johnston Closes #15752 --- module/os/linux/zfs/zfs_vnops_os.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 10162f62cda2..2a766a585b70 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1856,7 +1856,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; + objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; @@ -1888,6 +1888,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); + os = zfsvfs->z_os; /* * If this is a xvattr_t, then get a pointer to the structure of From 1a11ad9d20ab115b5d53f038f97a25d59a6bb5cf Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 9 Jan 2024 18:57:09 -0500 Subject: [PATCH 02/22] Fix a potential use-after-free in zfs_setsecattr() In general, VOPs must not load the "z_log" field until having called zfs_enter_verify_zp(). Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Mark Johnston Closes #15752 --- module/zfs/zfs_vnops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 384cdf0dca97..5377da401126 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -795,11 +795,11 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - + zilog = zfsvfs->z_log; error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) From 20dd16d9f7c9e633c08400f9810db9a84aa20a80 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:55:17 -0500 Subject: [PATCH 03/22] Make zdb -R scale less poorly zdb -R with :d tries to use gzip decompression 9 times per size. There's absolutely no reason for that, they're all the same decompressor. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15726 --- cmd/zdb/zdb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index cd934b2ae9ad..6894e33cbc0a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8506,6 +8506,14 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); + /* + * Every gzip level has the same decompressor, no need to + * run it 9 times per bruteforce attempt. + */ + mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3); + mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5); + mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7); + mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9); for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) if (((1ULL << c) & mask) == 0) *cfuncp++ = c; From c4fa674367776a8ced56df64d56a8797ed2cba48 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jan 2024 11:57:13 -0800 Subject: [PATCH 04/22] Enable block_cloning tests on FreeBSD Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #15749 --- tests/runfiles/common.run | 11 +++++++++++ tests/runfiles/linux.run | 14 +++----------- .../block_cloning/block_cloning_copyfilerange.ksh | 2 +- .../block_cloning_copyfilerange_cross_dataset.ksh | 2 +- .../block_cloning_copyfilerange_fallback.ksh | 2 +- ...ock_cloning_copyfilerange_fallback_same_txg.ksh | 2 +- .../block_cloning_copyfilerange_partial.ksh | 2 +- .../block_cloning_cross_enc_dataset.ksh | 2 +- .../block_cloning_disabled_copyfilerange.ksh | 2 +- .../block_cloning_lwb_buffer_overflow.ksh | 7 ++++--- .../block_cloning/block_cloning_replay.ksh | 9 +++++---- .../block_cloning_replay_encrypted.ksh | 9 +++++---- 12 files changed, 35 insertions(+), 29 deletions(-) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 48ec059de6ba..f6e5367f552e 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -71,6 +71,17 @@ tests = ['bclone_crossfs_corner_cases_limited', tags = ['functional', 'bclone'] timeout = 7200 +[tests/functional/block_cloning] +tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', + 'block_cloning_disabled_copyfilerange', + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_cross_enc_dataset', + 'block_cloning_copyfilerange_fallback_same_txg', + 'block_cloning_replay', 'block_cloning_replay_encrypted', + 'block_cloning_lwb_buffer_overflow'] +tags = ['functional', 'block_cloning'] + [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index c7c17f271762..6a4cd3fe691c 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -35,17 +35,9 @@ tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] -tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', - 'block_cloning_copyfilerange_fallback', - 'block_cloning_ficlone', 'block_cloning_ficlonerange', - 'block_cloning_ficlonerange_partial', - 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', - 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset', - 'block_cloning_cross_enc_dataset', - 'block_cloning_copyfilerange_fallback_same_txg', - 'block_cloning_replay', 'block_cloning_replay_encrypted', - 'block_cloning_lwb_buffer_overflow'] +tests = ['block_cloning_ficlone', 'block_cloning_ficlonerange', + 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_ficlone', + 'block_cloning_disabled_ficlonerange'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh index 43ea47b0ef19..0599739abee6 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 74e6b04903a3..43323c207a62 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh index 9a96eacd60af..475910be7478 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -30,7 +30,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index e52b34ec8a51..00982f68db86 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -30,7 +30,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh index a5da0a0bd359..38c46e4741cb 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh index fe8f0867b909..34d3d2692555 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh index d21b6251134e..3d916ab92165 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh index 0ae76b7e54a5..919f320dea3f 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh @@ -45,7 +45,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -77,13 +77,14 @@ log_must zfs create -o recordsize=32K $TESTPOOL/$TESTFS log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 bs=32K count=1022 \ conv=fsync sync_pool $TESTPOOL -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 log_must sync sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2) -log_must [ "$blocks" = "$(seq -s " " 0 1021)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 1021 | sed 's/ $//')" ] log_pass "LWB buffer overflow is not triggered with multiple VDEVs ZIL" diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh index 1fdf379ed2d2..530152004686 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh @@ -42,7 +42,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -90,8 +90,8 @@ log_must zpool freeze $TESTPOOL # # 4. TX_CLONE_RANGE: Clone the file # -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 -log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -f /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 # # 5. Unmount filesystem and export the pool @@ -126,6 +126,7 @@ log_must [ "$blocks" = "0 1 2 3" ] typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ $TESTPOOL/$TESTFS clone2) -log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh index f9f687c83e5b..0967415b7b7b 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh @@ -42,7 +42,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -92,8 +92,8 @@ log_must zpool freeze $TESTPOOL # # 4. TX_CLONE_RANGE: Clone the file # -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 -log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -f /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 # # 5. Unmount filesystem and export the pool @@ -128,6 +128,7 @@ log_must [ "$blocks" = "0 1 2 3" ] typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ $TESTPOOL/$TESTFS clone2 $PASSPHRASE) -log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] log_pass $claim From 66670ba9f0f2ee04794f20628be28c46858badc7 Mon Sep 17 00:00:00 2001 From: Stefan Lendl <1321542+stfl@users.noreply.github.com> Date: Fri, 12 Jan 2024 21:05:11 +0100 Subject: [PATCH 05/22] fix(mount): do not truncate shares not zfs mount When running zfs share -a resetting the exports.d/zfs.exports makes sense the get a clean state. Truncating was also called with zfs mount which would not populate the file again. Add test to verify shares persist after mount -a. Reviewed-by: Brian Behlendorf Signed-off-by: Stefan Lendl Closes #15607 Closes #15660 --- cmd/zfs/zfs_main.c | 3 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zfs_share/zfs_share_after_mount.ksh | 62 +++++++++++++++++++ 4 files changed, 67 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9939f206a7f2..f67f6114d0f1 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7234,7 +7234,8 @@ share_mount(int op, int argc, char **argv) pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* For a 'zfs share -a' operation start with a clean slate. */ - zfs_truncate_shares(NULL); + if (op == OP_SHARE) + zfs_truncate_shares(NULL); /* * libshare isn't mt-safe, so only do the operation in parallel diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f6e5367f552e..a600140ea25f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -316,7 +316,8 @@ tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', - 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] + 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares', + 'zfs_share_after_mount'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index c20b428db26d..3798194f096c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -912,6 +912,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ + functional/cli_root/zfs_share/zfs_share_after_mount.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh new file mode 100755 index 000000000000..0d4b66ea854c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Proxmox. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# DESCRIPTION: +# Verify that nfs shares persist after zfs mount -a +# +# STRATEGY: +# 1. Verify that the filesystem is not shared. +# 2. Enable the 'sharenfs' property +# 3. Verify filesystem is shared +# 4. Invoke 'zfs mount -a' +# 5. Verify filesystem is still shared + +verify_runnable "global" + +function cleanup +{ + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS + is_shared $TESTPOOL/$TESTFS && \ + log_must unshare_fs $TESTPOOL/$TESTFS + log_must zfs share -a +} + + +log_onexit cleanup + +cleanup + +log_must zfs set sharenfs="on" $TESTPOOL/$TESTFS +log_must is_shared $TESTPOOL/$TESTFS +log_must is_exported $TESTPOOL/$TESTFS + +log_must zfs mount -a +log_must is_shared $TESTPOOL/$TESTFS +log_must is_exported $TESTPOOL/$TESTFS + +log_pass "Verify that nfs shares persist after zfs mount -a" From 6138af86b317d24f16e1aae71f68ebb5fcb92e46 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:17:26 -0500 Subject: [PATCH 06/22] Stop wasting time on malloc in snprintf_zstd_header Profiling zdb -vvvvv on datasets with a lot of zstd blocks, we find ourselves spending quite a lot of time on malloc/free, because we allocate a 16M abd each call, and never free it, so we're leaking 16M per call as well. This seems sub-optimal. So let's just keep the buffer around and reuse it. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Rich Ercolani Closes #15721 --- cmd/zdb/zdb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 6894e33cbc0a..0b2e7e56e6ae 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2360,7 +2360,7 @@ static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { - abd_t *pabd; + static abd_t *pabd = NULL; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; @@ -2391,7 +2391,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, return; } - pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + if (!pabd) + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ From 3bddc4daec2a3ebc12cbc9785121765fe187f01e Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 10:22:58 -0500 Subject: [PATCH 07/22] spa: Fix FreeBSD sysctl handlers sbuf_cpy() resets the sbuf state, which is wrong for sbufs allocated by sbuf_new_for_sysctl(). In particular, this code triggers an assertion failure in sbuf_clear(). Simplify by just using sysctl_handle_string() for both reading and setting the tunable. Fixes: 6930ecbb7 ("spa: make read/write queues configurable") Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a21b0decf6a3..dc8a99e7bd25 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1445,8 +1445,6 @@ spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); } #else -#include - /* * On FreeBSD load-time parameters can be set up before malloc() is available, * so we have to do all the parsing work on the stack. @@ -1457,19 +1455,11 @@ static int spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); } @@ -1478,19 +1468,11 @@ static int spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } From 5a703d1368bf85a6e665f62011e257b3b526c97c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 12:56:35 -0500 Subject: [PATCH 08/22] spa: Let spa_taskq_param_get()'s addition of a newline be optional For FreeBSD sysctls, we don't want the extra newline, since the sysctl(8) utility will format strings appropriately. Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index dc8a99e7bd25..b144d0652930 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1392,7 +1392,7 @@ spa_taskq_param_set(zio_type_t t, char *cfg) } static int -spa_taskq_param_get(zio_type_t t, char *buf) +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) { int pos = 0; @@ -1410,7 +1410,8 @@ spa_taskq_param_get(zio_type_t t, char *buf) sep = " "; } - buf[pos++] = '\n'; + if (add_newline) + buf[pos++] = '\n'; buf[pos] = '\0'; return (pos); @@ -1428,7 +1429,7 @@ spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } static int @@ -1442,7 +1443,7 @@ spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } #else /* @@ -1457,7 +1458,7 @@ spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); @@ -1470,7 +1471,7 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); From 363368c67045657b012a2756f54b4c04de01e4ef Mon Sep 17 00:00:00 2001 From: Benjamin Sherman Date: Fri, 12 Jan 2024 14:33:41 -0600 Subject: [PATCH 09/22] fix: preserve linux kmod signature in zfs-kmod rpm spec This change provides rpm spec macros to sign the zfs and spl kmods as the final step after the %install scriptlet. This is needed since the find-debuginfo.sh script strips out debug symbols plus signatures. Kernel module signing only occurs when the required files are present as typically required in the Linux source tree: - certs/signing_key.pem - certs/signing_key.x509 The method for overriding the default __spec_install_post macro is inspired by (and largely copied from) the Fedora kernel.spec. Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Benjamin Sherman Closes #15744 --- rpm/generic/zfs-kmod.spec.in | 24 ++++++++++++++++++++++++ rpm/redhat/zfs-kmod.spec.in | 24 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 3c73e2ff2d6d..4cc075585d4b 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -150,6 +150,30 @@ for kernel_version in %{?kernel_versions}; do done +# Module signing (modsign) +# +# This must be run _after_ find-debuginfo.sh runs, otherwise that will strip +# the signature off of the modules. +# (Based on Fedora's kernel.spec workaround) +%define __modsign_install_post \ + sign_pem="%{ksrc}/certs/signing_key.pem"; \ + sign_x509="%{ksrc}/certs/signing_key.x509"; \ + if [ -f "${sign_x509}" ]\ + then \ + echo "Signing kernel modules ..."; \ + for kmod in $(find ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/ -name \*.ko); do \ + %{ksrc}/scripts/sign-file sha256 ${sign_pem} ${sign_x509} ${kmod}; \ + done \ + fi \ +%{nil} + +# hack to ensure signing happens after find-debuginfo.sh runs +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}}\ + %{__arch_install_post}\ + %{__os_install_post}\ + %{__modsign_install_post} + %install rm -rf ${RPM_BUILD_ROOT} diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index f59551c0b43a..9c836786baea 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -72,6 +72,30 @@ fi %{?kernel_llvm} make %{?_smp_mflags} +# Module signing (modsign) +# +# This must be run _after_ find-debuginfo.sh runs, otherwise that will strip +# the signature off of the modules. +# (Based on Fedora's kernel.spec workaround) +%define __modsign_install_post \ + sign_pem="%{ksrc}/certs/signing_key.pem"; \ + sign_x509="%{ksrc}/certs/signing_key.x509"; \ + if [ -f "${sign_x509}" ]\ + then \ + echo "Signing kernel modules ..."; \ + for kmod in $(find %{buildroot}/lib/modules/%{kverrel}/extra/ -name \*.ko); do \ + %{ksrc}/scripts/sign-file sha256 ${sign_pem} ${sign_x509} ${kmod}; \ + done \ + fi \ +%{nil} + +# hack to ensure signing happens after find-debuginfo.sh runs +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}}\ + %{__arch_install_post}\ + %{__os_install_post}\ + %{__modsign_install_post} + %install make install \ DESTDIR=${RPM_BUILD_ROOT} \ From a1771d243a57524c6599d7beedcb2085e5c05395 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jan 2024 12:35:29 -0800 Subject: [PATCH 10/22] Fix "out of memory" error Drop the no_memory() call from zpool_in_use() when reading the label fails and instead return the error to the caller. This prevents a misleading "internal error: out of memory" error when the label can't be read. This will result in is_spare() returning B_FALSE instead of aborting, which is already safely handled. Furthermore, on Linux it's possible for EREMOTEIO to returned by an NVMe device if the device has been low-level formatted and not rescanned. In this case we want to fallback to the legacy scanning method and read any of the labels we can. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Issue #13538 Closes #15747 --- lib/libzfs/libzfs_import.c | 4 +--- lib/libzutil/zutil_import.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 2a7c5a76a0a6..e2d40a7b3bfb 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -291,10 +291,8 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, *inuse = B_FALSE; - if (zpool_read_label(fd, &config, NULL) != 0) { - (void) no_memory(hdl); + if (zpool_read_label(fd, &config, NULL) != 0) return (-1); - } if (config == NULL) return (0); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index d023dd105355..0b5c47b08c61 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1056,10 +1056,21 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) case EINVAL: break; case EINPROGRESS: - // This shouldn't be possible to - // encounter, die if we do. + /* + * This shouldn't be possible to + * encounter, die if we do. + */ ASSERT(B_FALSE); zfs_fallthrough; + case EREMOTEIO: + /* + * May be returned by an NVMe device + * which is visible in /dev/ but due + * to a low-level format change, or + * other error, needs to be rescanned. + * Try the slow method. + */ + zfs_fallthrough; case EOPNOTSUPP: case ENOSYS: do_slow = B_TRUE; From 995734ed12de767f98e1c02ba39efe9595e710b8 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 17 Jan 2024 02:15:10 +0500 Subject: [PATCH 11/22] ZTS: Test for clone, mmap and write for block cloning For block cloning, if we mmap the cloned file and write from the map into the file, it triggers a panic in dbuf_redirty() on Linux. The same scenario causes data corruption on FreeBSD. Both these issues are fixed under PR#15656 and PR#15665. It would be good to add a test for this scenario in ZTS. The test program and issue was produced by @robn. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15717 --- tests/runfiles/common.run | 2 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 1 + tests/zfs-tests/cmd/clone_mmap_write.c | 123 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_clone_mmap_write.ksh | 79 +++++++++++ 8 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/cmd/clone_mmap_write.c create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a600140ea25f..47f8de0dd4c1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -79,7 +79,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg', 'block_cloning_replay', 'block_cloning_replay_encrypted', - 'block_cloning_lwb_buffer_overflow'] + 'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write'] tags = ['functional', 'block_cloning'] [tests/functional/bootfs] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 7bf4d05d542b..c84f75cd806b 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -287,6 +287,8 @@ elif sys.platform.startswith('linux'): 'bclone/bclone_samefs_data': ['SKIP', cfr_reason], 'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason], 'bclone/bclone_samefs_hole': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_clone_mmap_write': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 5f53b687191a..a696fd387111 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -2,6 +2,7 @@ /btree_test /chg_usr_exec /clonefile +/clone_mmap_write /devname2devid /dir_rd_update /draid diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 1b915ae98cab..379dc5e236c5 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -3,6 +3,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile +scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_write scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update diff --git a/tests/zfs-tests/cmd/clone_mmap_write.c b/tests/zfs-tests/cmd/clone_mmap_write.c new file mode 100644 index 000000000000..6a5cd8721c57 --- /dev/null +++ b/tests/zfs-tests/cmd/clone_mmap_write.c @@ -0,0 +1,123 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This program clones the file, mmap it, and writes from the map into + * file. This scenario triggers a panic on Linux in dbuf_redirty(), + * which is fixed under PR#15656. On FreeBSD, the same test causes data + * corruption, which is fixed by PR#15665. + * + * It would be good to test for this scenario in ZTS. This program and + * issue was initially produced by @robn. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static int +open_file(const char *source) +{ + int fd; + if ((fd = open(source, O_RDWR | O_APPEND)) < 0) { + (void) fprintf(stderr, "Error opening %s\n", source); + exit(1); + } + sync(); + return (fd); +} + +static int +clone_file(int sfd, long long size, const char *dest) +{ + int dfd; + + if ((dfd = open(dest, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) < 0) { + (void) fprintf(stderr, "Error opening %s\n", dest); + exit(1); + } + + if (copy_file_range(sfd, 0, dfd, 0, size, 0) < 0) { + (void) fprintf(stderr, "copy_file_range failed\n"); + exit(1); + } + + return (dfd); +} + +static void * +map_file(int fd, long long size) +{ + void *p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + (void) fprintf(stderr, "mmap failed\n"); + exit(1); + } + + return (p); +} + +static void +map_write(void *p, int fd) +{ + if (pwrite(fd, p, 1024*128, 0) < 0) { + (void) fprintf(stderr, "write failed\n"); + exit(1); + } +} + +int +main(int argc, char **argv) +{ + int sfd, dfd; + void *p; + struct stat sb; + if (argc != 3) { + (void) printf("usage: %s " + "\n", argv[0]); + exit(1); + } + sfd = open_file(argv[1]); + if (fstat(sfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed\n"); + exit(1); + } + dfd = clone_file(sfd, sb.st_size, argv[2]); + p = map_file(dfd, sb.st_size); + map_write(p, dfd); + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index c6f74cd81a1c..797078ed3ab6 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -185,6 +185,7 @@ export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile + clone_mmap_write devname2devid dir_rd_update draid diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3798194f096c..2fc54301bbeb 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -461,6 +461,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh new file mode 100755 index 000000000000..6215b3178e7e --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# A PANIC is triggered in dbuf_redirty() if we clone a file, mmap it +# and write from the map into the file. PR#15656 fixes this scenario. +# This scenario also causes data corruption on FreeBSD, which is fixed +# by PR#15665. +# +# STRATEGY: +# 1. Create a pool +# 2. Create a test file +# 3. Clone, mmap and write to the file using clone_mmap_write +# 5. Synchronize cached writes +# 6. Verfiy data is correctly written to the disk +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for clone, mmap and write scenario" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s 1G $VDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV +log_must zfs create $TESTPOOL/$TESTFS + +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file bs=1M count=512 +log_must clone_mmap_write /$TESTPOOL/$TESTFS/file /$TESTPOOL/$TESTFS/clone + +sync_pool $TESTPOOL +log_must sync + +log_must have_same_content /$TESTPOOL/$TESTFS/file /$TESTPOOL/$TESTFS/clone +blocks=$(get_same_blocks $TESTPOOL/$TESTFS file $TESTPOOL/$TESTFS clone) +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 1 4095 | sed 's/ $//')" ] + +log_pass "Clone, mmap and write does not cause data corruption or " \ + "trigger panic" From 1f5bf96001469a61e6da576456bde2a7dee9c40f Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Tue, 16 Jan 2024 16:16:08 -0500 Subject: [PATCH 12/22] Make zdb -R a little more sane. zdb -R has a minor flaw in which it will not always print the full output of a decompressed block. Oops. While I was in there, I also reworked the logic so it won't try ZLE unless everything else fails, which will hopefully avoid the problem ZDB_NO_ZLE was intended to mitigate of reporting a lot of false positives of ZLE compressed blocks... Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15723 --- cmd/zdb/zdb.c | 91 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 0b2e7e56e6ae..2062f4fa1026 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8488,11 +8488,45 @@ zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) static boolean_t +try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, + int flags, int cfunc, void *lbuf, void *lbuf2) +{ + if (flags & ZDB_FLAG_VERBOSE) { + (void) fprintf(stderr, + "Trying %05llx -> %05llx (%s)\n", + (u_longlong_t)psize, + (u_longlong_t)lsize, + zio_compress_table[cfunc].ci_name); + } + + /* + * We set lbuf to all zeros and lbuf2 to all + * ones, then decompress to both buffers and + * compare their contents. This way we can + * know if decompression filled exactly to + * lsize or if it left some bytes unwritten. + */ + + memset(lbuf, 0x00, lsize); + memset(lbuf2, 0xff, lsize); + + if (zio_decompress_data(cfunc, pabd, + lbuf, psize, lsize, NULL) == 0 && + zio_decompress_data(cfunc, pabd, + lbuf2, psize, lsize, NULL) == 0 && + memcmp(lbuf, lbuf2, lsize) == 0) + return (B_TRUE); + return (B_FALSE); +} + +static uint64_t zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, uint64_t psize, int flags) { (void) buf; - boolean_t exceeded = B_FALSE; + uint64_t orig_lsize = lsize; + boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); + boolean_t found = B_FALSE; /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. @@ -8503,7 +8537,7 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, uint64_t maxlsize = SPA_MAXBLOCKSIZE; uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | - (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0); + ZIO_COMPRESS_MASK(ZLE); *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); @@ -8530,49 +8564,38 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, lsize += SPA_MINBLOCKSIZE; else maxlsize = lsize; + for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { for (cfuncp = cfuncs; *cfuncp; cfuncp++) { - if (flags & ZDB_FLAG_VERBOSE) { - (void) fprintf(stderr, - "Trying %05llx -> %05llx (%s)\n", - (u_longlong_t)psize, - (u_longlong_t)lsize, - zio_compress_table[*cfuncp].\ - ci_name); - } - - /* - * We set lbuf to all zeros and lbuf2 to all - * ones, then decompress to both buffers and - * compare their contents. This way we can - * know if decompression filled exactly to - * lsize or if it left some bytes unwritten. - */ - memset(lbuf, 0x00, lsize); - memset(lbuf2, 0xff, lsize); - - if (zio_decompress_data(*cfuncp, pabd, - lbuf, psize, lsize, NULL) == 0 && - zio_decompress_data(*cfuncp, pabd, - lbuf2, psize, lsize, NULL) == 0 && - memcmp(lbuf, lbuf2, lsize) == 0) + if (try_decompress_block(pabd, lsize, psize, flags, + *cfuncp, lbuf, lbuf2)) { + found = B_TRUE; break; + } } if (*cfuncp != 0) break; } + if (!found && tryzle) { + for (lsize = orig_lsize; lsize <= maxlsize; + lsize += SPA_MINBLOCKSIZE) { + if (try_decompress_block(pabd, lsize, psize, flags, + ZIO_COMPRESS_ZLE, lbuf, lbuf2)) { + *cfuncp = ZIO_COMPRESS_ZLE; + found = B_TRUE; + break; + } + } + } umem_free(lbuf2, SPA_MAXBLOCKSIZE); - if (lsize > maxlsize) { - exceeded = B_TRUE; - } if (*cfuncp == ZIO_COMPRESS_ZLE) { printf("\nZLE decompression was selected. If you " "suspect the results are wrong,\ntry avoiding ZLE " "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); } - return (exceeded); + return (lsize > maxlsize ? -1 : lsize); } /* @@ -8751,9 +8774,9 @@ zdb_read_block(char *thing, spa_t *spa) uint64_t orig_lsize = lsize; buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { - boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, + lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); - if (failed) { + if (lsize == -1) { (void) printf("Decompress of %s failed\n", thing); goto out; } @@ -8774,11 +8797,11 @@ zdb_read_block(char *thing, spa_t *spa) abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; - boolean_t failed = zdb_decompress_block(pabd, buf, + lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (failed || zfs_blkptr_verify(spa, b, + if (lsize == -1 || zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; From d9885b3776f1ec1032e6edd1f60424d66d419113 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:02:50 +0500 Subject: [PATCH 13/22] fix: Uber block label not always found for aux vdevs When spare or l2cache (aux) vdev is added during pool creation, spa->spa_uberblock is not dumped until that point. Subsequently, the aux label is never synchronized after its initial creation, resulting in the uberblock label remaining undumped. The uberblock is crucial for lib_blkid in identifying the ZFS partition type. To address this issue, we now ensure sync of the uberblock label once if it's not dumped initially. Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- include/sys/spa_impl.h | 1 + module/zfs/vdev_label.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index ee91816ac48e..0cd0c4720fbe 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -278,6 +278,7 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + boolean_t spa_aux_sync_uber; /* need to sync aux uber */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 4c6501cc9a09..d063747bc559 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1156,6 +1156,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift) == 0); + + /* + * When spare or l2cache (aux) vdev is added during pool + * creation, spa->spa_uberblock is not written until this + * point. Write it on next config sync. + */ + if (uberblock_verify(&spa->spa_uberblock)) + spa->spa_aux_sync_uber = B_TRUE; } else { uint64_t txg = 0ULL; @@ -1794,6 +1802,16 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + if (spa->spa_aux_sync_uber) { + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_spares.sav_vdevs[v], flags); + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_l2cache.sav_vdevs[v], flags); + } + } (void) zio_wait(zio); /* @@ -1808,6 +1826,19 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_flush(zio, svd[v]); } } + if (spa->spa_aux_sync_uber) { + spa->spa_aux_sync_uber = B_FALSE; + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { + zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + } + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + } + } + } (void) zio_wait(zio); From 2df2a58dc1a47ac1f358c78b029d59c4ab629a9c Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:32:53 +0500 Subject: [PATCH 14/22] Extend aux label to add path information Pool import logic uses vdev paths, so it makes sense to add path information on AUX vdev as well. Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- module/zfs/vdev_label.c | 54 +++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d063747bc559..c31f48028bbc 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1031,6 +1031,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isspare)); + boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isl2cache)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -1116,34 +1120,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + if (reason_spare || reason_l2cache) { /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). + * For inactive hot spares and level 2 ARC devices, we generate + * a special label that identifies as a mutually shared hot + * spare or l2cache device. We write the label in case of + * addition or removal of hot spare or l2cache vdev (in which + * case we want to revert the labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { - /* - * For level 2 ARC devices, add a special label. - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); + reason_spare ? POOL_STATE_SPARE : POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -1154,8 +1144,26 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * spa->spa_l2cache->sav_config (populated in * spa_ld_open_aux_vdevs()). */ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, - vd->vdev_ashift) == 0); + if (reason_l2cache) { + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + } + + /* + * Add path information to help find it during pool import + */ + if (vd->vdev_path != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + } + if (vd->vdev_devid != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + } + if (vd->vdev_physpath != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + } /* * When spare or l2cache (aux) vdev is added during pool From b64be1624c3f833ada9b8f8f947d04496a1ab5b8 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:35:04 +0500 Subject: [PATCH 15/22] Add path handling for aux vdevs in `label_path` If the AUX vdev is added using UUID, importing the pool falls back AUX vdev to open it with disk name instead of UUID due to the absence of path information for AUX vdevs. Since AUX label now have path information, this PR adds path handling for it in `label_path`. Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- lib/libzutil/zutil_import.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 0b5c47b08c61..eb9131190458 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1221,13 +1221,26 @@ label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path, nvlist_t *nvroot; uint64_t pool_guid; uint64_t vdev_guid; + uint64_t state; *path = NULL; *devid = NULL; + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (ENOENT); + + /* + * In case of spare or l2cache, we directly return path/devid from the + * label. + */ + if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) { + (void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path); + (void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid); + return (0); + } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid)) + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (ENOENT); return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path, From 29ea6faf8ff5173da33c5506a7ef8ae21d3926e3 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Tue, 16 Jan 2024 16:30:58 -0500 Subject: [PATCH 16/22] Make spl_kmem_cache size check consistent On Linux x86_64, kmem cache can have size up to 4M, however increasing spl_kmem_cache_slab_limit can lead to crash due to the size check inconsistency. Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #15757 --- module/os/linux/spl/spl-kmem-cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index a2920c746672..4b15081715ac 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -91,7 +91,8 @@ MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ -static unsigned int spl_kmem_cache_slab_limit = 16384; +static unsigned int spl_kmem_cache_slab_limit = + SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); @@ -783,7 +784,7 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, } else { unsigned long slabflags = 0; - if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) + if (size > spl_kmem_cache_slab_limit) goto out; #if defined(SLAB_USERCOPY) From ef00da803de24f93fb02757f1a4295fe7f0016b9 Mon Sep 17 00:00:00 2001 From: Lalufu Date: Tue, 16 Jan 2024 22:32:59 +0100 Subject: [PATCH 17/22] Make sure all necessary RPM path macros are defined When building (s)rpm files through the Makefile, a directory structure is created in /tmp to hold the various files. In case the user running the command has overridden some of the RPM path settings through their user profile (for example in `~/.rpmmacros`), these paths do not line up with the configuration, and the build fails. Make sure all paths used are properly defined. Reviewed-by: Brian Behlendorf Signed-off-by: Ralf Ertzinger Closes #15756 --- config/rpm.am | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/rpm.am b/config/rpm.am index 13bd54a625b0..85c56c0b2e3a 100644 --- a/config/rpm.am +++ b/config/rpm.am @@ -83,6 +83,11 @@ srpm-common: rpm-local || exit 1; \ LANG=C $(RPMBUILD) \ --define "_tmppath $$rpmbuild/TMP" \ + --define "_builddir $$rpmbuild/BUILD" \ + --define "_rpmdir $$rpmbuild/RPMS" \ + --define "_srcrpmdir $$rpmbuild/SRPMS" \ + --define "_specdir $$rpmbuild/SPECS" \ + --define "_sourcedir $$rpmbuild/SOURCES" \ --define "_topdir $$rpmbuild" \ $(def) -bs $$rpmbuild/SPECS/$$rpmspec || exit 1; \ cp $$rpmbuild/SRPMS/$$rpmpkg . || exit 1; \ @@ -99,6 +104,11 @@ rpm-common: rpm-local || exit 1; \ LANG=C ${RPMBUILD} \ --define "_tmppath $$rpmbuild/TMP" \ + --define "_builddir $$rpmbuild/BUILD" \ + --define "_rpmdir $$rpmbuild/RPMS" \ + --define "_srcrpmdir $$rpmbuild/SRPMS" \ + --define "_specdir $$rpmbuild/SPECS" \ + --define "_sourcedir $$rpmbuild/SOURCES" \ --define "_topdir $$rpmbuild" \ $(def) --rebuild $$rpmpkg || exit 1; \ cp $$rpmbuild/RPMS/*/* . || exit 1; \ From f0bf7a247dbb030d68c7fd2b5526dd111cc775d0 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 17 Jan 2024 09:01:17 +1100 Subject: [PATCH 18/22] Linux 6.7 compat: zfs_setattr fix atime update In db4fc559c I messed up and changed this bit of code to set the inode atime to an uninitialised value, when actually it was just supposed to loading the atime from the inode to be stored in the SA. This changes it to what it should have been. Ensure times change by the right amount Previously, we only checked if the times changed at all, which missed a bug where the atime was being set to an undefined value. Now ensure the times change by two seconds (or thereabouts), ensuring we catch cases where we set the time to something bonkers Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15762 Closes #15773 --- module/os/linux/zfs/zfs_vnops_os.c | 3 +-- tests/zfs-tests/cmd/ctime.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 2a766a585b70..b7b89b8afc56 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2439,9 +2439,8 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - inode_timespec_t tmp_atime; + inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); - zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } diff --git a/tests/zfs-tests/cmd/ctime.c b/tests/zfs-tests/cmd/ctime.c index 0f5d81aea613..5ff1cea8a869 100644 --- a/tests/zfs-tests/cmd/ctime.c +++ b/tests/zfs-tests/cmd/ctime.c @@ -362,12 +362,20 @@ main(void) return (1); } - if (t1 == t2) { - (void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n", + + /* + * Ideally, time change would be exactly two seconds, but allow + * a little slack in case of scheduling delays or similar. + */ + long delta = (long)t2 - (long)t1; + if (delta < 2 || delta > 4) { + (void) fprintf(stderr, + "%s: BAD time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); return (1); } else { - (void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n", + (void) fprintf(stderr, + "%s: good time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); } } From f45dd90f34cf8dadecc0236f9016046a6bf54ee5 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 17 Jan 2024 08:51:07 -0800 Subject: [PATCH 19/22] Fix cloning into mmaped and cached file. If the destination file is mmaped and the mmaped region was already read, so it is cached, we need to update mmaped pages after successful clone using update_pages(). Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Pointed out by: Ka Ho Ng Signed-off-by: Pawel Jakub Dawidek Closes #15772 --- module/zfs/zfs_vnops.c | 4 + tests/runfiles/common.run | 4 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 1 + tests/zfs-tests/cmd/clone_mmap_cached.c | 146 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_clone_mmap_cached.ksh | 86 +++++++++++ .../tests/functional/block_cloning/setup.ksh | 3 + 10 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/cmd/clone_mmap_cached.c create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 5377da401126..c8ff7b6432fd 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1349,6 +1349,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } + if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { + update_pages(outzp, outoff, size, outos); + } + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 47f8de0dd4c1..7f0cce252372 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -72,7 +72,9 @@ tags = ['functional', 'bclone'] timeout = 7200 [tests/functional/block_cloning] -tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', +tests = ['block_cloning_clone_mmap_cached', + 'block_cloning_copyfilerange', + 'block_cloning_copyfilerange_partial', 'block_cloning_copyfilerange_fallback', 'block_cloning_disabled_copyfilerange', 'block_cloning_copyfilerange_cross_dataset', diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c84f75cd806b..ae4aa6275465 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -287,6 +287,7 @@ elif sys.platform.startswith('linux'): 'bclone/bclone_samefs_data': ['SKIP', cfr_reason], 'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason], 'bclone/bclone_samefs_hole': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason], 'block_cloning/block_cloning_clone_mmap_write': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange': diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index a696fd387111..0ed0a69eb013 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -2,6 +2,7 @@ /btree_test /chg_usr_exec /clonefile +/clone_mmap_cached /clone_mmap_write /devname2devid /dir_rd_update diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 379dc5e236c5..23848a82ffbd 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -3,6 +3,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile +scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_cached scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_write scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime diff --git a/tests/zfs-tests/cmd/clone_mmap_cached.c b/tests/zfs-tests/cmd/clone_mmap_cached.c new file mode 100644 index 000000000000..c1cdf796cfb4 --- /dev/null +++ b/tests/zfs-tests/cmd/clone_mmap_cached.c @@ -0,0 +1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024 by Pawel Jakub Dawidek + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static void * +mmap_file(int fd, size_t size) +{ + void *p; + + p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + (void) fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + exit(2); + } + + return (p); +} + +static void +usage(const char *progname) +{ + + /* + * -i cache input before copy_file_range(2). + * -o cache input before copy_file_range(2). + */ + (void) fprintf(stderr, "usage: %s [-io] \n", progname); + exit(3); +} + +int +main(int argc, char *argv[]) +{ + int dfd, sfd; + size_t dsize, ssize; + void *dmem, *smem, *ptr; + off_t doff, soff; + struct stat sb; + bool cache_input, cache_output; + const char *progname; + int c; + + progname = argv[0]; + cache_input = cache_output = false; + + while ((c = getopt(argc, argv, "io")) != -1) { + switch (c) { + case 'i': + cache_input = true; + break; + case 'o': + cache_output = true; + break; + default: + usage(progname); + } + } + argc -= optind; + argv += optind; + + if (argc != 2) { + usage(progname); + } + + sfd = open(argv[0], O_RDONLY); + if (fstat(sfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + ssize = sb.st_size; + smem = mmap_file(sfd, ssize); + + dfd = open(argv[1], O_RDWR); + if (fstat(dfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + dsize = sb.st_size; + dmem = mmap_file(dfd, dsize); + + /* + * Hopefully it won't be compiled out. + */ + if (cache_input) { + ptr = malloc(ssize); + assert(ptr != NULL); + memcpy(ptr, smem, ssize); + free(ptr); + } + if (cache_output) { + ptr = malloc(ssize); + assert(ptr != NULL); + memcpy(ptr, dmem, dsize); + free(ptr); + } + + soff = doff = 0; + if (copy_file_range(sfd, &soff, dfd, &doff, ssize, 0) < 0) { + (void) fprintf(stderr, "copy_file_range failed: %s\n", + strerror(errno)); + exit(2); + } + + exit(memcmp(smem, dmem, ssize) == 0 ? 0 : 1); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 797078ed3ab6..daa794551682 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -185,6 +185,7 @@ export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile + clone_mmap_cached clone_mmap_write devname2devid dir_rd_update diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 2fc54301bbeb..4040e60434a7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -461,6 +461,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh new file mode 100755 index 000000000000..b0ef8ec99533 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# When the destination file is mmaped and is already cached we need to +# update mmaped pages after successful clone. +# +# STRATEGY: +# 1. Create a pool. +# 2. Create a two test files with random content. +# 3. mmap the files, read them and clone from one to the other using +# clone_mmap_cached. +# 4. clone_mmap_cached also verifies if the content of the destination +# file was updated while reading it from mmaped memory. +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for clone into mmaped and cached file" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s 1G $VDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV +log_must zfs create $TESTPOOL/$TESTFS + +for opts in "--" "-i" "-o" "-io" +do + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/src bs=1M count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/dst bs=1M count=1 + + # Clear cache. + log_must zpool export $TESTPOOL + log_must zpool import -d $VDIR $TESTPOOL + + log_must clone_mmap_cached $opts /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + + sync_pool $TESTPOOL + log_must sync + + log_must have_same_content /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + blocks=$(get_same_blocks $TESTPOOL/$TESTFS src $TESTPOOL/$TESTFS dst) + # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). + log_must [ "$blocks" = "$(seq -s " " 0 7 | sed 's/ $//')" ] +done + +log_pass "Clone properly updates mmapped and cached pages" diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 58441bf8f3ad..a9b13f062a4e 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -30,6 +30,9 @@ if ! command -v clonefile > /dev/null ; then log_unsupported "clonefile program required to test block cloning" fi +if ! command -v clone_mmap_cached > /dev/null ; then + log_unsupported "clone_mmap_cached program required to test block cloning" +fi verify_runnable "global" From 1494e8fbaac6d1c360f8e2fdb76975f63d147cc0 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:03:58 -0500 Subject: [PATCH 20/22] Autotrim High Load Average Fix Switch from cv_wait() to cv_wait_idle() in vdev_autotrim_wait_kick(), which should mitigate the high load average while waiting. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: jxdking Closes #15781 --- module/zfs/vdev_trim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 1c54eae40355..7e3c5f684703 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -196,7 +196,8 @@ vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) for (int i = 0; i < num_of_kick; i++) { if (vd->vdev_autotrim_exit_wanted) break; - cv_wait(&vd->vdev_autotrim_kick_cv, &vd->vdev_autotrim_lock); + cv_wait_idle(&vd->vdev_autotrim_kick_cv, + &vd->vdev_autotrim_lock); } boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; mutex_exit(&vd->vdev_autotrim_lock); From e3d3d772de1cfc02d1df26bc6aadbbc48a28dcff Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 17 Jan 2024 18:05:12 +0100 Subject: [PATCH 21/22] linux spl: fix typo in top comment of spl-condvar.c Credential Implementation -> Condition Variables Implementation Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #15782 --- module/os/linux/spl/spl-condvar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c index e87954714e3a..5898789ad53d 100644 --- a/module/os/linux/spl/spl-condvar.c +++ b/module/os/linux/spl/spl-condvar.c @@ -20,7 +20,7 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * - * Solaris Porting Layer (SPL) Credential Implementation. + * Solaris Porting Layer (SPL) Condition Variables Implementation. */ #include From a0b2a93c41b0b0d7723d1b20eb1eca7a1a63e45b Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 17 Jan 2024 18:06:14 +0100 Subject: [PATCH 22/22] fix: variable type with zfs-tests/cmd/clonefile.c Compiling on arm64 freebsd-13.2 and arm64 almalinux-8 brings currently this error: ``` CC tests/zfs-tests/cmd/clonefile.o tests/zfs-tests/cmd/clonefile.c:166:43: error: result of comparison of \ constant -1 with expression of type 'char' is always true \ [-Werror,-Wtautological-constant-out-of-range-compare] while ((c = getopt(argc, argv, "crfdq")) != -1) { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~ 1 error generated. gmake[2]: *** [Makefile:8675: tests/zfs-tests/cmd/clonefile.o] Error 1 ``` Fix: use correct variable type `int`. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Tino Reichardt Closes #15783 --- tests/zfs-tests/cmd/clonefile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index d002cd9b587e..bc30bb7798e9 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -162,7 +162,7 @@ main(int argc, char **argv) { cf_mode_t mode = CF_MODE_NONE; - char c; + int c; while ((c = getopt(argc, argv, "crfdq")) != -1) { switch (c) { case 'c':