From fe2f7cf6d75aee51727f52d2c74caec06fc5e2f6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 11 Sep 2025 01:59:57 +1000 Subject: [PATCH 01/23] linux/rw_destroy: assert no holders before destroying While rw_destroy() may do nothing on Linux, we still want to make sure that we don't have any holders outstanding like we do for mutexes. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17718 --- include/os/linux/spl/sys/rwlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/rwlock.h b/include/os/linux/spl/sys/rwlock.h index 563e0a19663..c883836c2f8 100644 --- a/include/os/linux/spl/sys/rwlock.h +++ b/include/os/linux/spl/sys/rwlock.h @@ -130,7 +130,7 @@ RW_READ_HELD(krwlock_t *rwp) /* * The Linux rwsem implementation does not require a matching destroy. */ -#define rw_destroy(rwp) ((void) 0) +#define rw_destroy(rwp) ASSERT(!(RW_LOCK_HELD(rwp))) /* * Upgrading a rwsem from a reader to a writer is not supported by the From c6fe41cac503fe62e6ffbbd53052fbdf4e1aa00f Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 10 Sep 2025 10:25:58 -0700 Subject: [PATCH 02/23] CI: Increase setup timeout to 20min, add timestamps - Increase qemu-1-setup.sh timeout to 20min since it sometimes fails to complete after 15min. - Timestamp all qemu-1-setup.sh lines to look for hangs. - Add a 'watchdog' process to print out the top running process every 30sec to help with debugging. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #17714 --- .github/workflows/scripts/qemu-1-setup.sh | 10 ++++++++++ .github/workflows/zfs-qemu.yml | 8 ++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh index de29ad1f57b..0278264d927 100755 --- a/.github/workflows/scripts/qemu-1-setup.sh +++ b/.github/workflows/scripts/qemu-1-setup.sh @@ -6,6 +6,13 @@ set -eu +# We've been seeing this script take over 15min to run. This may or +# may not be normal. Just to get a little more insight, print out +# a message to stdout with the top running process, and do this every +# 30 seconds. We can delete this watchdog later once we get a better +# handle on what the timeout value should be. +(while [ 1 ] ; do sleep 30 && echo "[watchdog: $(ps -eo cmd --sort=-pcpu | head -n 2 | tail -n 1)}')]"; done) & + # install needed packages export DEBIAN_FRONTEND="noninteractive" sudo apt-get -y update @@ -65,3 +72,6 @@ sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \ for i in /sys/block/s*/queue/scheduler; do echo "none" | sudo tee $i done + +# Kill off our watchdog +kill $(jobs -p) diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index a5dbfc099c9..69349678d84 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -77,8 +77,12 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Setup QEMU - timeout-minutes: 15 - run: .github/workflows/scripts/qemu-1-setup.sh + timeout-minutes: 20 + run: | + # Add a timestamp to each line to debug timeouts + while IFS=$'\n' read -r line; do + echo "$(date +'%H:%M:%S') $line" + done < <(.github/workflows/scripts/qemu-1-setup.sh) - name: Start build machine timeout-minutes: 10 From d3429a75b01378a23b081988a2430569af13c743 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Sep 2025 17:46:40 +0800 Subject: [PATCH 03/23] Remove renaming notice and symlinks for arcstat and arc_summary Reviewed-by: Brian Behlendorf Reviewed-by: Colm Buckley Signed-off-by: Shengqi Chen Closes #16357 Closes #17712 --- cmd/Makefile.am | 5 ----- cmd/arc_summary | 7 ------- cmd/arcstat.in | 7 ------- contrib/debian/openzfs-zfsutils.install | 2 -- contrib/debian/rules.in | 2 -- man/man1/arcstat.1 | 2 -- rpm/generic/zfs.spec.in | 2 -- 7 files changed, 27 deletions(-) diff --git a/cmd/Makefile.am b/cmd/Makefile.am index e79bfae2b10..c1c4e2fe1c4 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -107,11 +107,6 @@ $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) arc_summary: %D%/arc_summary $(AM_V_at)cp $< $@ - -cmd-rename-install-exec-hook: - $(LN_S) -f arcstat $(DESTDIR)$(bindir)/zarcstat - $(LN_S) -f arc_summary $(DESTDIR)$(bindir)/zarcsummary -INSTALL_EXEC_HOOKS += cmd-rename-install-exec-hook endif PHONY += cmd diff --git a/cmd/arc_summary b/cmd/arc_summary index 9538dd599cb..e60c6b64e8a 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -1021,13 +1021,6 @@ def main(): treated separately because they come with their own call. """ - # notify user for upcoming renaming in 2.4.0 - abs_path = os.path.abspath(sys.argv[0].strip()) - script_name = os.path.basename(abs_path) - if script_name != "zarcsummary": - sys.stderr.write("Note: this script will be renamed to zarcsummary in ") - sys.stderr.write("zfs 2.4.0. Please migrate ASAP.\n") - kstats = get_kstats() if ARGS.graph: diff --git a/cmd/arcstat.in b/cmd/arcstat.in index e153eddb36c..10115aeb105 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -768,13 +768,6 @@ def calculate(): def main(): - # notify user for upcoming renaming in 2.4.0 - abs_path = os.path.abspath(sys.argv[0].strip()) - script_name = os.path.basename(abs_path) - if script_name != "zarcstat": - sys.stderr.write("Note: this script will be renamed to zarcstat in ") - sys.stderr.write("zfs 2.4.0. Please migrate ASAP.\n") - global sint global count global hdr_intr diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 2362c83dfa3..37284a78ad1 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -37,9 +37,7 @@ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary -usr/sbin/zarcsummary usr/sbin/arcstat -usr/sbin/zarcstat usr/sbin/dbufstat usr/sbin/zilstat usr/share/zfs/compatibility.d/ diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 966e34bf9dc..2b0568938b2 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -82,9 +82,7 @@ override_dh_auto_install: # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' mv '$(CURDIR)/debian/tmp/usr/bin/arc_summary' '$(CURDIR)/debian/tmp/usr/sbin/arc_summary' - mv '$(CURDIR)/debian/tmp/usr/bin/zarcsummary' '$(CURDIR)/debian/tmp/usr/sbin/zarcsummary' mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' - mv '$(CURDIR)/debian/tmp/usr/bin/zarcstat' '$(CURDIR)/debian/tmp/usr/sbin/zarcstat' mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1 index 288b98d57a1..fe2e0407e07 100644 --- a/man/man1/arcstat.1 +++ b/man/man1/arcstat.1 @@ -20,8 +20,6 @@ .Sh NAME .Nm arcstat .Nd report ZFS ARC and L2ARC statistics -.Sh NOTICE -It will be renamed to zarcstat in zfs 2.4.0. Please migrate ASAP. .Sh SYNOPSIS .Nm .Op Fl havxp diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index edcfdd2d713..1ce668e7b86 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -509,9 +509,7 @@ systemctl --system daemon-reload >/dev/null || true %{_bindir}/zvol_wait # Optional Python 3 scripts %{_bindir}/arc_summary -%{_bindir}/zarcsummary %{_bindir}/arcstat -%{_bindir}/zarcstat %{_bindir}/dbufstat %{_bindir}/zilstat # Man pages From a5571a0dd1a182a2c84cc52250116f1e545c5a31 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Sep 2025 17:48:04 +0800 Subject: [PATCH 04/23] cmd: rename arc_summary to zarcsummary Reviewed-by: Brian Behlendorf Reviewed-by: Colm Buckley Signed-off-by: Shengqi Chen Closes #16357 Closes #17712 --- cmd/Makefile.am | 8 ++++---- cmd/arcstat.in | 8 ++++---- cmd/{arc_summary => zarcsummary} | 6 +++--- contrib/debian/not-installed | 2 +- contrib/debian/openzfs-zfsutils.install | 2 +- contrib/debian/rules.in | 2 +- rpm/generic/zfs.spec.in | 4 ++-- scripts/spdxcheck.pl | 2 +- tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 2 +- tests/test-runner/bin/zts-report.py.in | 2 +- tests/zfs-tests/include/commands.cfg | 2 +- tests/zfs-tests/tests/Makefile.am | 4 ++-- ...arc_summary_001_pos.ksh => zarcsummary_001_pos.ksh} | 10 +++++----- ...arc_summary_002_neg.ksh => zarcsummary_002_neg.ksh} | 6 +++--- 15 files changed, 31 insertions(+), 31 deletions(-) rename cmd/{arc_summary => zarcsummary} (99%) rename tests/zfs-tests/tests/functional/cli_user/misc/{arc_summary_001_pos.ksh => zarcsummary_001_pos.ksh} (82%) rename tests/zfs-tests/tests/functional/cli_user/misc/{arc_summary_002_neg.ksh => zarcsummary_002_neg.ksh} (86%) diff --git a/cmd/Makefile.am b/cmd/Makefile.am index c1c4e2fe1c4..cfc5f7357a3 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -98,14 +98,14 @@ endif if USING_PYTHON -bin_SCRIPTS += arc_summary arcstat dbufstat zilstat -CLEANFILES += arc_summary arcstat dbufstat zilstat -dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in +bin_SCRIPTS += zarcsummary arcstat dbufstat zilstat +CLEANFILES += zarcsummary arcstat dbufstat zilstat +dist_noinst_DATA += %D%/zarcsummary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in $(call SUBST,arcstat,%D%/) $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) -arc_summary: %D%/arc_summary +zarcsummary: %D%/zarcsummary $(AM_V_at)cp $< $@ endif diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 10115aeb105..ad06135b9c2 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -172,7 +172,7 @@ cols = { "zactive": [7, 1000, "zfetch prefetches active per second"], } -# ARC structural breakdown from arc_summary +# ARC structural breakdown from zarcsummary structfields = { "cmp": ["compressed", "Compressed"], "ovh": ["overhead", "Overhead"], @@ -188,7 +188,7 @@ structstats = { # size stats "sz": ["_size", "size"], } -# ARC types breakdown from arc_summary +# ARC types breakdown from zarcsummary typefields = { "data": ["data", "ARC data"], "meta": ["metadata", "ARC metadata"], @@ -199,7 +199,7 @@ typestats = { # size stats "sz": ["_size", "size"], } -# ARC states breakdown from arc_summary +# ARC states breakdown from zarcsummary statefields = { "ano": ["anon", "Anonymous"], "mfu": ["mfu", "MFU"], @@ -367,7 +367,7 @@ def snap_stats(): cur = kstat - # fill in additional values from arc_summary + # fill in additional values from zarcsummary cur["caches_size"] = caches_size = cur["anon_data"]+cur["anon_metadata"]+\ cur["mfu_data"]+cur["mfu_metadata"]+cur["mru_data"]+cur["mru_metadata"]+\ cur["uncached_data"]+cur["uncached_metadata"] diff --git a/cmd/arc_summary b/cmd/zarcsummary similarity index 99% rename from cmd/arc_summary rename to cmd/zarcsummary index e60c6b64e8a..24a129d9ca7 100755 --- a/cmd/arc_summary +++ b/cmd/zarcsummary @@ -34,7 +34,7 @@ Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. -The original introduction to arc_summary can be found at +The original introduction to zarcsummary can be found at http://cuddletech.com/?p=454 """ @@ -161,7 +161,7 @@ elif sys.platform.startswith('linux'): return get_params(TUNABLES_PATH) def get_version_impl(request): - # The original arc_summary called /sbin/modinfo/{spl,zfs} to get + # The original zarcsummary called /sbin/modinfo/{spl,zfs} to get # the version information. We switch to /sys/module/{spl,zfs}/version # to make sure we get what is really loaded in the kernel try: @@ -439,7 +439,7 @@ def print_header(): """ # datetime is now recommended over time but we keep the exact formatting - # from the older version of arc_summary in case there are scripts + # from the older version of zarcsummary in case there are scripts # that expect it in this way daydate = time.strftime(DATE_FORMAT) spc_date = LINE_LENGTH-len(daydate) diff --git a/contrib/debian/not-installed b/contrib/debian/not-installed index 88557f76fca..b473b5e50b1 100644 --- a/contrib/debian/not-installed +++ b/contrib/debian/not-installed @@ -1,4 +1,4 @@ -usr/bin/arc_summary.py +usr/bin/zarcsummary.py usr/share/zfs/zfs-helpers.sh etc/default/zfs etc/init.d diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 37284a78ad1..514153adfa5 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -36,7 +36,7 @@ usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk -usr/sbin/arc_summary +usr/sbin/zarcsummary usr/sbin/arcstat usr/sbin/dbufstat usr/sbin/zilstat diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 2b0568938b2..9bf50c93b05 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -81,7 +81,7 @@ override_dh_auto_install: # Remove suffix (.py) as per policy 10.4 - Scripts # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' - mv '$(CURDIR)/debian/tmp/usr/bin/arc_summary' '$(CURDIR)/debian/tmp/usr/sbin/arc_summary' + mv '$(CURDIR)/debian/tmp/usr/bin/zarcsummary' '$(CURDIR)/debian/tmp/usr/sbin/zarcsummary' mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 1ce668e7b86..79173156127 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -433,7 +433,7 @@ make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ - \( -name arc_summary -or -name arcstat -or -name dbufstat \ + \( -name zarcsummary -or -name arcstat -or -name dbufstat \ -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ @@ -508,7 +508,7 @@ systemctl --system daemon-reload >/dev/null || true %{_bindir}/raidz_test %{_bindir}/zvol_wait # Optional Python 3 scripts -%{_bindir}/arc_summary +%{_bindir}/zarcsummary %{_bindir}/arcstat %{_bindir}/dbufstat %{_bindir}/zilstat diff --git a/scripts/spdxcheck.pl b/scripts/spdxcheck.pl index cdab5368f19..248d5ce6eb5 100755 --- a/scripts/spdxcheck.pl +++ b/scripts/spdxcheck.pl @@ -84,7 +84,7 @@ my $tagged_patterns = q( # Unsuffixed programs (or generated of same) cmd/arcstat.in - cmd/arc_summary + cmd/zarcsummary cmd/dbufstat.in cmd/zilstat.in cmd/zpool/zpool.d/* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 2da46458289..5f95a6d58ec 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -625,7 +625,7 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 7767c0c2d53..881dfb5e74e 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -401,7 +401,7 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 00197012014..5bc65f99373 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -232,7 +232,7 @@ maybe = { 'cli_root/zpool_trim/zpool_trim_fault_export_import_online': ['FAIL', known_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', 6141], - 'cli_user/misc/arc_summary_001_pos': ['FAIL', known_reason], + 'cli_user/misc/zarcsummary_001_pos': ['FAIL', known_reason], 'delegate/setup': ['SKIP', exec_reason], 'events/zed_cksum_config': ['FAIL', known_reason], 'fault/auto_replace_002_pos': ['FAIL', known_reason], diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 580281b30d7..cf4ae284118 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -171,7 +171,7 @@ export ZFS_FILES='zdb zpool ztest raidz_test - arc_summary + zarcsummary arcstat zilstat dbufstat diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 94db292c951..eeba7ebb2d9 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1352,8 +1352,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ - functional/cli_user/misc/arc_summary_001_pos.ksh \ - functional/cli_user/misc/arc_summary_002_neg.ksh \ + functional/cli_user/misc/zarcsummary_001_pos.ksh \ + functional/cli_user/misc/zarcsummary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_001_pos.ksh similarity index 82% rename from tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh rename to tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_001_pos.ksh index 0840878fdb0..b7faac5243c 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_001_pos.ksh @@ -30,16 +30,16 @@ is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" -log_assert "arc_summary generates output and doesn't return an error code" +log_assert "zarcsummary generates output and doesn't return an error code" # Without this, the below checks aren't going to work the way we hope... set -o pipefail for arg in "" "-a" "-d" "-p 1" "-g" "-s arc" "-r"; do - log_must eval "arc_summary $arg > /dev/null" + log_must eval "zarcsummary $arg > /dev/null" done -log_must eval "arc_summary | head > /dev/null" -log_must eval "arc_summary | head -1 > /dev/null" +log_must eval "zarcsummary | head > /dev/null" +log_must eval "zarcsummary | head -1 > /dev/null" -log_pass "arc_summary generates output and doesn't return an error code" +log_pass "zarcsummary generates output and doesn't return an error code" diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_002_neg.ksh similarity index 86% rename from tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh rename to tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_002_neg.ksh index ec4abe35409..227646777ba 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_002_neg.ksh @@ -30,10 +30,10 @@ is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" -log_assert "arc_summary generates an error code with invalid options" +log_assert "zarcsummary generates an error code with invalid options" for arg in "-x" "-5" "-p 7" "--err" "-@"; do - log_mustnot eval "arc_summary $arg > /dev/null" + log_mustnot eval "zarcsummary $arg > /dev/null" done -log_pass "arc_summary generates an error code with invalid options" +log_pass "zarcsummary generates an error code with invalid options" From 9ae20cf03de17f53c87106de39c808bda2af9efe Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Sep 2025 17:52:56 +0800 Subject: [PATCH 05/23] cmd: rename arcstat to zarcstat Reviewed-by: Brian Behlendorf Reviewed-by: Colm Buckley Signed-off-by: Shengqi Chen Closes #16357 Closes #17712 --- cmd/Makefile.am | 8 ++++---- cmd/{arcstat.in => zarcstat.in} | 12 ++++++------ contrib/debian/openzfs-zfsutils.install | 4 ++-- contrib/debian/rules.in | 2 +- man/Makefile.am | 2 +- man/man1/{arcstat.1 => zarcstat.1} | 4 ++-- rpm/generic/zfs.spec.in | 4 ++-- scripts/spdxcheck.pl | 2 +- tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 2 +- tests/zfs-tests/include/commands.cfg | 2 +- tests/zfs-tests/tests/Makefile.am | 2 +- .../{arcstat_001_pos.ksh => zarcstat_001_pos.ksh} | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) rename cmd/{arcstat.in => zarcstat.in} (98%) rename man/man1/{arcstat.1 => zarcstat.1} (99%) rename tests/zfs-tests/tests/functional/cli_user/misc/{arcstat_001_pos.ksh => zarcstat_001_pos.ksh} (96%) diff --git a/cmd/Makefile.am b/cmd/Makefile.am index cfc5f7357a3..ca94f6b77e0 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -98,11 +98,11 @@ endif if USING_PYTHON -bin_SCRIPTS += zarcsummary arcstat dbufstat zilstat -CLEANFILES += zarcsummary arcstat dbufstat zilstat -dist_noinst_DATA += %D%/zarcsummary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in +bin_SCRIPTS += zarcsummary zarcstat dbufstat zilstat +CLEANFILES += zarcsummary zarcstat dbufstat zilstat +dist_noinst_DATA += %D%/zarcsummary %D%/zarcstat.in %D%/dbufstat.in %D%/zilstat.in -$(call SUBST,arcstat,%D%/) +$(call SUBST,zarcstat,%D%/) $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) zarcsummary: %D%/zarcsummary diff --git a/cmd/arcstat.in b/cmd/zarcstat.in similarity index 98% rename from cmd/arcstat.in rename to cmd/zarcstat.in index ad06135b9c2..8ffd2048116 100755 --- a/cmd/arcstat.in +++ b/cmd/zarcstat.in @@ -2,7 +2,7 @@ # SPDX-License-Identifier: CDDL-1.0 # # Print out ZFS ARC Statistics exported via kstat(1) -# For a definition of fields, or usage, use arcstat -v +# For a definition of fields, or usage, use zarcstat -v # # This script was originally a fork of the original arcstat.pl (0.1) # by Neelakanth Nadgir, originally published on his Sun blog on @@ -262,7 +262,7 @@ hdr_intr = 20 # Print header every 20 lines of output opfile = None sep = " " # Default separator is 2 spaces l2exist = False -cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: zarcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} @@ -349,10 +349,10 @@ def usage(): "character or string\n") sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") - sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -v\n") - sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\tzarcstat -o /tmp/a.log 2 10\n") + sys.stderr.write("\tzarcstat -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tzarcstat -v\n") + sys.stderr.write("\tzarcstat -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 514153adfa5..1c5bfdc4827 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -37,12 +37,12 @@ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/zarcsummary -usr/sbin/arcstat +usr/sbin/zarcstat usr/sbin/dbufstat usr/sbin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions -usr/share/man/man1/arcstat.1 +usr/share/man/man1/zarcstat.1 usr/share/man/man1/zhack.1 usr/share/man/man1/zvol_wait.1 usr/share/man/man5/ diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 9bf50c93b05..5d12621798b 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -82,7 +82,7 @@ override_dh_auto_install: # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' mv '$(CURDIR)/debian/tmp/usr/bin/zarcsummary' '$(CURDIR)/debian/tmp/usr/sbin/zarcsummary' - mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' + mv '$(CURDIR)/debian/tmp/usr/bin/zarcstat' '$(CURDIR)/debian/tmp/usr/sbin/zarcstat' mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' diff --git a/man/Makefile.am b/man/Makefile.am index 6a7b2d3e46b..9fb8fdb175b 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -2,7 +2,7 @@ dist_noinst_man_MANS = \ %D%/man1/cstyle.1 dist_man_MANS = \ - %D%/man1/arcstat.1 \ + %D%/man1/zarcstat.1 \ %D%/man1/raidz_test.1 \ %D%/man1/test-runner.1 \ %D%/man1/zhack.1 \ diff --git a/man/man1/arcstat.1 b/man/man1/zarcstat.1 similarity index 99% rename from man/man1/arcstat.1 rename to man/man1/zarcstat.1 index fe2e0407e07..3633c5d417f 100644 --- a/man/man1/arcstat.1 +++ b/man/man1/zarcstat.1 @@ -14,11 +14,11 @@ .\" Copyright (c) 2020 by AJ Jordan. All rights reserved. .\" .Dd September 19, 2024 -.Dt ARCSTAT 1 +.Dt ZARCSTAT 1 .Os . .Sh NAME -.Nm arcstat +.Nm zarcstat .Nd report ZFS ARC and L2ARC statistics .Sh SYNOPSIS .Nm diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 79173156127..8986e29eb7f 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -433,7 +433,7 @@ make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ - \( -name zarcsummary -or -name arcstat -or -name dbufstat \ + \( -name zarcsummary -or -name zarcstat -or -name dbufstat \ -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ @@ -509,7 +509,7 @@ systemctl --system daemon-reload >/dev/null || true %{_bindir}/zvol_wait # Optional Python 3 scripts %{_bindir}/zarcsummary -%{_bindir}/arcstat +%{_bindir}/zarcstat %{_bindir}/dbufstat %{_bindir}/zilstat # Man pages diff --git a/scripts/spdxcheck.pl b/scripts/spdxcheck.pl index 248d5ce6eb5..4d4e14368be 100755 --- a/scripts/spdxcheck.pl +++ b/scripts/spdxcheck.pl @@ -83,7 +83,7 @@ my $tagged_patterns = q( man/man?/*.?.in # Unsuffixed programs (or generated of same) - cmd/arcstat.in + cmd/zarcstat.in cmd/zarcsummary cmd/dbufstat.in cmd/zilstat.in diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 5f95a6d58ec..da33fc9024c 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -624,7 +624,7 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', - 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'zarcstat_001_pos', 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 881dfb5e74e..b56ffc3a4a2 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -400,7 +400,7 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', - 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'zarcstat_001_pos', 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index cf4ae284118..1c4d25e152a 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -172,7 +172,7 @@ export ZFS_FILES='zdb ztest raidz_test zarcsummary - arcstat + zarcstat zilstat dbufstat mount.zfs diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index eeba7ebb2d9..116873794b7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1351,7 +1351,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ - functional/cli_user/misc/arcstat_001_pos.ksh \ + functional/cli_user/misc/zarcstat_001_pos.ksh \ functional/cli_user/misc/zarcsummary_001_pos.ksh \ functional/cli_user/misc/zarcsummary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zarcstat_001_pos.ksh similarity index 96% rename from tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh rename to tests/zfs-tests/tests/functional/cli_user/misc/zarcstat_001_pos.ksh index 700bd9a6f52..d63b72f8039 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zarcstat_001_pos.ksh @@ -37,7 +37,7 @@ log_assert "arcstat generates output and doesn't return an error code" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do - log_must eval "arcstat ${args[i]} > /dev/null" + log_must eval "zarcstat ${args[i]} > /dev/null" ((i = i + 1)) done log_pass "arcstat generates output and doesn't return an error code" From 92ca3ae56ad564fc8c80592e49077c54e8183987 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Sep 2025 22:22:57 +0800 Subject: [PATCH 06/23] contrib/debian: install files into merged /usr This commit synchronizes the debian packaging files with the distro version (also maintained by me) as much as possible. Reviewed-by: Brian Behlendorf Reviewed-by: Colm Buckley Signed-off-by: Shengqi Chen Closes #17712 --- contrib/debian/copyright | 4 +- contrib/debian/not-installed | 2 +- contrib/debian/openzfs-libnvpair3.install.in | 2 +- contrib/debian/openzfs-libpam-zfs.install | 2 +- contrib/debian/openzfs-libpam-zfs.postinst | 2 +- contrib/debian/openzfs-libuutil3.install.in | 2 +- contrib/debian/openzfs-libzfs-dev.install.in | 6 +- contrib/debian/openzfs-libzfs6.install.in | 4 +- .../debian/openzfs-libzfsbootenv1.install.in | 2 +- contrib/debian/openzfs-libzpool6.install.in | 2 +- contrib/debian/openzfs-zfs-test.install | 2 +- contrib/debian/openzfs-zfs-zed.install | 2 +- contrib/debian/openzfs-zfsutils.install | 70 +++++++++---------- contrib/debian/openzfs-zfsutils.links | 3 + contrib/debian/rules.in | 39 +++-------- .../usr/share/initramfs-tools/hooks/zdev | 2 +- 16 files changed, 68 insertions(+), 78 deletions(-) create mode 100644 contrib/debian/openzfs-zfsutils.links diff --git a/contrib/debian/copyright b/contrib/debian/copyright index 65c7d209d8e..006f32fdf92 100644 --- a/contrib/debian/copyright +++ b/contrib/debian/copyright @@ -4,7 +4,7 @@ The detailed contributor information can be found in [2][3]. Files: contrib/debian/* Copyright: - 2013-2016, Aron Xu + 2013-2025, Aron Xu 2016, Petter Reinholdtsen 2013, Carlos Alberto Lopez Perez 2013, Turbo Fredriksson @@ -12,6 +12,8 @@ Copyright: 2011-2013, Darik Horn 2018-2019, Mo Zhou 2018-2020, Mo Zhou + 2023-2024, Shengqi Chen + 2024-2025, Shengqi Chen License: GPL-2+ [1] https://tracker.debian.org/pkg/zfs-linux diff --git a/contrib/debian/not-installed b/contrib/debian/not-installed index b473b5e50b1..9c08da5a6a7 100644 --- a/contrib/debian/not-installed +++ b/contrib/debian/not-installed @@ -9,4 +9,4 @@ etc/zfs/vdev_id.conf.sas_direct.example etc/zfs/vdev_id.conf.sas_switch.example etc/zfs/vdev_id.conf.scsi.example etc/zfs/zfs-functions -lib/systemd/system/zfs-import.service +usr/lib/systemd/system/zfs-import.service diff --git a/contrib/debian/openzfs-libnvpair3.install.in b/contrib/debian/openzfs-libnvpair3.install.in index ed7b541e360..fce542270dd 100644 --- a/contrib/debian/openzfs-libnvpair3.install.in +++ b/contrib/debian/openzfs-libnvpair3.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libnvpair.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libnvpair.so.* diff --git a/contrib/debian/openzfs-libpam-zfs.install b/contrib/debian/openzfs-libpam-zfs.install index c33123f69a8..bafdebe9bb9 100644 --- a/contrib/debian/openzfs-libpam-zfs.install +++ b/contrib/debian/openzfs-libpam-zfs.install @@ -1,2 +1,2 @@ -lib/*/security/pam_zfs_key.so +usr/lib/*/security/pam_zfs_key.so usr/share/pam-configs/zfs_key diff --git a/contrib/debian/openzfs-libpam-zfs.postinst b/contrib/debian/openzfs-libpam-zfs.postinst index 03893454eee..db4db73d6d5 100644 --- a/contrib/debian/openzfs-libpam-zfs.postinst +++ b/contrib/debian/openzfs-libpam-zfs.postinst @@ -1,7 +1,7 @@ #!/bin/sh set -e -if ! $(ldd "/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)/security/pam_zfs_key.so" | grep -q "libasan") ; then +if ! $(ldd "/usr/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)/security/pam_zfs_key.so" | grep -q "libasan") ; then pam-auth-update --package fi diff --git a/contrib/debian/openzfs-libuutil3.install.in b/contrib/debian/openzfs-libuutil3.install.in index a197d030d74..bb33386791e 100644 --- a/contrib/debian/openzfs-libuutil3.install.in +++ b/contrib/debian/openzfs-libuutil3.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libuutil.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libuutil.so.* diff --git a/contrib/debian/openzfs-libzfs-dev.install.in b/contrib/debian/openzfs-libzfs-dev.install.in index eaa8c3925e2..5673e2661c6 100644 --- a/contrib/debian/openzfs-libzfs-dev.install.in +++ b/contrib/debian/openzfs-libzfs-dev.install.in @@ -1,3 +1,5 @@ -lib/@DEB_HOST_MULTIARCH@/*.a usr/lib/@DEB_HOST_MULTIARCH@ +usr/lib/@DEB_HOST_MULTIARCH@/*.a +usr/lib/@DEB_HOST_MULTIARCH@/*.so +usr/lib/@DEB_HOST_MULTIARCH@/pkgconfig usr/include -usr/lib/@DEB_HOST_MULTIARCH@ + diff --git a/contrib/debian/openzfs-libzfs6.install.in b/contrib/debian/openzfs-libzfs6.install.in index 6765aaee59c..a9054c14cc7 100644 --- a/contrib/debian/openzfs-libzfs6.install.in +++ b/contrib/debian/openzfs-libzfs6.install.in @@ -1,2 +1,2 @@ -lib/@DEB_HOST_MULTIARCH@/libzfs.so.* -lib/@DEB_HOST_MULTIARCH@/libzfs_core.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfs.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfs_core.so.* diff --git a/contrib/debian/openzfs-libzfsbootenv1.install.in b/contrib/debian/openzfs-libzfsbootenv1.install.in index 49216742433..b61b8ab6326 100644 --- a/contrib/debian/openzfs-libzfsbootenv1.install.in +++ b/contrib/debian/openzfs-libzfsbootenv1.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libzfsbootenv.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfsbootenv.so.* diff --git a/contrib/debian/openzfs-libzpool6.install.in b/contrib/debian/openzfs-libzpool6.install.in index b9e872df9ba..0e087a2709b 100644 --- a/contrib/debian/openzfs-libzpool6.install.in +++ b/contrib/debian/openzfs-libzpool6.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libzpool.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzpool.so.* diff --git a/contrib/debian/openzfs-zfs-test.install b/contrib/debian/openzfs-zfs-test.install index b3afef50dbd..496cab2ad5e 100644 --- a/contrib/debian/openzfs-zfs-test.install +++ b/contrib/debian/openzfs-zfs-test.install @@ -1,4 +1,4 @@ -sbin/ztest +usr/sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 diff --git a/contrib/debian/openzfs-zfs-zed.install b/contrib/debian/openzfs-zfs-zed.install index a348ba828ee..30699a8a98d 100644 --- a/contrib/debian/openzfs-zfs-zed.install +++ b/contrib/debian/openzfs-zfs-zed.install @@ -1,5 +1,5 @@ etc/zfs/zed.d/* -lib/systemd/system/zfs-zed.service +usr/lib/systemd/system/zfs-zed.service usr/lib/zfs-linux/zed.d/* usr/sbin/zed usr/share/man/man8/zed.8 diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 1c5bfdc4827..6810108f2c5 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -1,45 +1,45 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ -lib/systemd/system-generators/ -lib/systemd/system-preset/ -lib/systemd/system/zfs-import-cache.service -lib/systemd/system/zfs-import-scan.service -lib/systemd/system/zfs-import.target -lib/systemd/system/zfs-load-key.service -lib/systemd/system/zfs-mount.service -lib/systemd/system/zfs-mount@.service -lib/systemd/system/zfs-scrub-monthly@.timer -lib/systemd/system/zfs-scrub-weekly@.timer -lib/systemd/system/zfs-scrub@.service -lib/systemd/system/zfs-trim-monthly@.timer -lib/systemd/system/zfs-trim-weekly@.timer -lib/systemd/system/zfs-trim@.service -lib/systemd/system/zfs-share.service -lib/systemd/system/zfs-volume-wait.service -lib/systemd/system/zfs-volumes.target -lib/systemd/system/zfs.target -lib/udev/ -sbin/fsck.zfs -sbin/mount.zfs -sbin/zdb -sbin/zfs -sbin/zfs_ids_to_path -sbin/zgenhostid -sbin/zhack -sbin/zinject -sbin/zpool -sbin/zstream -sbin/zstreamdump +usr/lib/systemd/system-generators/ +usr/lib/systemd/system-preset/ +usr/lib/systemd/system/zfs-import-cache.service +usr/lib/systemd/system/zfs-import-scan.service +usr/lib/systemd/system/zfs-import.target +usr/lib/systemd/system/zfs-load-key.service +usr/lib/systemd/system/zfs-mount.service +usr/lib/systemd/system/zfs-mount@.service +usr/lib/systemd/system/zfs-scrub-monthly@.timer +usr/lib/systemd/system/zfs-scrub-weekly@.timer +usr/lib/systemd/system/zfs-scrub@.service +usr/lib/systemd/system/zfs-trim-monthly@.timer +usr/lib/systemd/system/zfs-trim-weekly@.timer +usr/lib/systemd/system/zfs-trim@.service +usr/lib/systemd/system/zfs-share.service +usr/lib/systemd/system/zfs-volume-wait.service +usr/lib/systemd/system/zfs-volumes.target +usr/lib/systemd/system/zfs.target +usr/lib/udev/ +usr/sbin/fsck.zfs +usr/sbin/mount.zfs +usr/sbin/zdb +usr/sbin/zfs +usr/sbin/zfs_ids_to_path +usr/sbin/zgenhostid +usr/sbin/zhack +usr/sbin/zinject +usr/sbin/zpool +usr/sbin/zstream +usr/sbin/zstreamdump usr/bin/zvol_wait -usr/lib/modules-load.d/ lib/ +usr/lib/modules-load.d/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk -usr/sbin/zarcsummary -usr/sbin/zarcstat -usr/sbin/dbufstat -usr/sbin/zilstat +usr/bin/zarcsummary +usr/bin/zarcstat +usr/bin/dbufstat usr/sbin +usr/bin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions usr/share/man/man1/zarcstat.1 diff --git a/contrib/debian/openzfs-zfsutils.links b/contrib/debian/openzfs-zfsutils.links new file mode 100644 index 00000000000..54099e6573b --- /dev/null +++ b/contrib/debian/openzfs-zfsutils.links @@ -0,0 +1,3 @@ +usr/sbin/zfs usr/bin/zfs +usr/sbin/zpool usr/bin/zpool +usr/lib/zfs-linux/zpool_influxdb usr/bin/zpool_influxdb diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 5d12621798b..5087a7e18e1 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -37,18 +37,19 @@ override_dh_auto_configure: @# Build the userland, but don't build the kernel modules. dh_auto_configure -- @CFGOPTS@ \ --bindir=/usr/bin \ - --sbindir=/sbin \ - --libdir=/lib/"$(DEB_HOST_MULTIARCH)" \ - --with-udevdir=/lib/udev \ + --sbindir=/usr/sbin \ + --with-mounthelperdir=/usr/sbin \ + --libdir=/usr/lib/"$(DEB_HOST_MULTIARCH)" \ + --with-udevdir=/usr/lib/udev \ --with-zfsexecdir=/usr/lib/zfs-linux \ --enable-systemd \ --enable-pyzfs \ --with-python=python3 \ - --with-pammoduledir='/lib/$(DEB_HOST_MULTIARCH)/security' \ + --with-pammoduledir='/usr/lib/$(DEB_HOST_MULTIARCH)/security' \ --with-pkgconfigdir='/usr/lib/$(DEB_HOST_MULTIARCH)/pkgconfig' \ - --with-systemdunitdir=/lib/systemd/system \ - --with-systemdpresetdir=/lib/systemd/system-preset \ - --with-systemdgeneratordir=/lib/systemd/system-generators \ + --with-systemdunitdir=/usr/lib/systemd/system \ + --with-systemdpresetdir=/usr/lib/systemd/system-preset \ + --with-systemdgeneratordir=/usr/lib/systemd/system-generators \ --with-config=user for i in $(wildcard $(CURDIR)/debian/*.install.in) ; do \ @@ -77,19 +78,6 @@ override_dh_auto_install: @# Install the utilities. $(MAKE) install DESTDIR='$(CURDIR)/debian/tmp' - # Move from bin_dir to /usr/sbin - # Remove suffix (.py) as per policy 10.4 - Scripts - # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts - mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' - mv '$(CURDIR)/debian/tmp/usr/bin/zarcsummary' '$(CURDIR)/debian/tmp/usr/sbin/zarcsummary' - mv '$(CURDIR)/debian/tmp/usr/bin/zarcstat' '$(CURDIR)/debian/tmp/usr/sbin/zarcstat' - mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' - mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' - - @# Zed has dependencies outside of the system root. - mv '$(CURDIR)/debian/tmp/sbin/zed' '$(CURDIR)/debian/tmp/usr/sbin/zed' - sed -i 's|ExecStart=/sbin/|ExecStart=/usr/sbin/|g' '$(CURDIR)/debian/tmp/lib/systemd/system/zfs-zed.service' - @# Install the DKMS source. @# We only want the files needed to build the modules install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \ @@ -131,11 +119,6 @@ override_dh_auto_install: cd '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)'; ./autogen.sh rm -fr '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/autom4te.cache' - for i in `ls $(CURDIR)/debian/tmp/lib/$(DEB_HOST_MULTIARCH)/*.so`; do \ - ln -s '/lib/$(DEB_HOST_MULTIARCH)/'`readlink $${i}` '$(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/'`basename $${i}`; \ - rm $${i}; \ - done - chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions' chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs' @@ -159,7 +142,7 @@ override_dh_auto_clean: @if test -e META.orig; then mv META.orig META; fi override_dh_install: - find debian/tmp/lib -name '*.la' -delete + find debian/tmp/usr/lib -name '*.la' -delete dh_install override_dh_missing: @@ -173,8 +156,8 @@ override_dh_installinit: dh_installinit -R --name zfs-zed override_dh_installsystemd: - mkdir -p debian/openzfs-zfsutils/lib/systemd/system - ln -sr /dev/null debian/openzfs-zfsutils/lib/systemd/system/zfs-import.service + mkdir -p debian/openzfs-zfsutils/usr/lib/systemd/system + ln -sr /dev/null debian/openzfs-zfsutils/usr/lib/systemd/system/zfs-import.service dh_installsystemd --no-stop-on-upgrade -X zfs-zed.service dh_installsystemd --name zfs-zed diff --git a/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev b/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev index 0cf21a4211a..d4f968aed8f 100755 --- a/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev +++ b/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev @@ -5,7 +5,7 @@ PREREQ="udev" PREREQ_UDEV_RULES="60-zvol.rules 69-vdev.rules" -COPY_EXEC_LIST="/lib/udev/zvol_id /lib/udev/vdev_id" +COPY_EXEC_LIST="/usr/lib/udev/zvol_id /usr/lib/udev/vdev_id" # Generic result code. RC=0 From 8f15d2e4d58525e583277ccfef83f2056be4f72e Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 24 Apr 2025 13:01:00 -0700 Subject: [PATCH 07/23] Add allocation profile export and zhack subcommand for import When attempting to debug performance problems on large systems, one of the major factors that affect performance is free space fragmentation. This heavily affects the allocation process, which is an area of active development in ZFS. Unfortunately, fragmenting a large pool for testing purposes is time consuming; it usually involves filling the pool and then repeatedly overwriting data until the free space becomes fragmented, which can take many hours. And even if the time is available, artificial workloads rarely generate the same fragmentation patterns as the natural workloads they're attempting to mimic. This patch has two parts. First, in zdb, we add the ability to export the full allocation map of the pool. It iterates over each vdev, printing every allocated segment in the ms_allocatable range tree. This can be done while the pool is online, though in that case the allocation map may actually be from several different TXGs as new ones are loaded on demand. The second is a new subcommand for zhack, zhack metaslab leak (and its supporting kernel changes). This is a zhack subcommand that imports a pool and then modified the range trees of the metaslabs, allowing the sync process to write them out normall. It does not currently store those allocations anywhere to make them reversible, and there is no corresponding free subcommand (which would be extremely dangerous); this is an irreversible process, only intended for performance testing. The only way to reclaim the space afterwards is to destroy the pool or roll back to a checkpoint. Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #17576 --- cmd/zdb/zdb.c | 45 ++++- cmd/zdb/zdb.h | 2 +- cmd/zdb/zdb_il.c | 2 - cmd/zhack.c | 188 +++++++++++++++++- contrib/pyzfs/libzfs_core/exceptions.py | 1 - man/man1/zhack.1 | 18 ++ man/man8/zdb.8 | 12 ++ tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zhack/zhack_metaslab_leak.ksh | 70 +++++++ 10 files changed, 329 insertions(+), 12 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 134c258a1e3..70096b80965 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; -uint8_t dump_opt[256]; +uint8_t dump_opt[512]; + +#define ALLOCATED_OPT 256 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); @@ -1666,6 +1668,16 @@ dump_metaslab_stats(metaslab_t *msp) dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } +static void +dump_allocated(void *arg, uint64_t start, uint64_t size) +{ + uint64_t *off = arg; + if (*off != start) + (void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off, + start - *off); + *off = start + size; +} + static void dump_metaslab(metaslab_t *msp) { @@ -1682,13 +1694,24 @@ dump_metaslab(metaslab_t *msp) (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); - if (dump_opt['m'] > 2 && !dump_opt['L']) { + if (dump_opt[ALLOCATED_OPT] || + (dump_opt['m'] > 2 && !dump_opt['L'])) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); + } + + if (dump_opt['m'] > 2 && !dump_opt['L']) { zfs_range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); - metaslab_unload(msp); - mutex_exit(&msp->ms_lock); + } + + if (dump_opt[ALLOCATED_OPT]) { + uint64_t off = msp->ms_start; + zfs_range_tree_walk(msp->ms_allocatable, dump_allocated, + &off); + if (off != msp->ms_start + msp->ms_size) + (void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off, + msp->ms_size - off); } if (dump_opt['m'] > 1 && sm != NULL && @@ -1703,6 +1726,12 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } + if (dump_opt[ALLOCATED_OPT] || + (dump_opt['m'] > 2 && !dump_opt['L'])) { + metaslab_unload(msp); + mutex_exit(&msp->ms_lock); + } + if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else @@ -1739,8 +1768,9 @@ print_vdev_metaslab_header(vdev_t *vd) } } - (void) printf("\tvdev %10llu %s", - (u_longlong_t)vd->vdev_id, bias_str); + (void) printf("\tvdev %10llu\t%s metaslab shift %4llu", + (u_longlong_t)vd->vdev_id, bias_str, + (u_longlong_t)vd->vdev_ms_shift); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", @@ -9375,6 +9405,8 @@ main(int argc, char **argv) {"all-reconstruction", no_argument, NULL, 'Y'}, {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, + {"allocated-map", no_argument, NULL, + ALLOCATED_OPT}, {0, 0, 0, 0} }; @@ -9405,6 +9437,7 @@ main(int argc, char **argv) case 'u': case 'y': case 'Z': + case ALLOCATED_OPT: dump_opt[c]++; dump_all = 0; break; diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h index 6b6c9169816..48b561eb202 100644 --- a/cmd/zdb/zdb.h +++ b/cmd/zdb/zdb.h @@ -29,6 +29,6 @@ #define _ZDB_H void dump_intent_log(zilog_t *); -extern uint8_t dump_opt[256]; +extern uint8_t dump_opt[512]; #endif /* _ZDB_H */ diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 62e290cd122..3d91fb28a4c 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -48,8 +48,6 @@ #include "zdb.h" -extern uint8_t dump_opt[256]; - static char tab_prefix[4] = "\t\t\t"; static void diff --git a/cmd/zhack.c b/cmd/zhack.c index 536532a6762..fe1c697e194 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -54,6 +54,7 @@ #include #include #include +#include static importargs_t g_importargs; static char *g_pool; @@ -93,7 +94,10 @@ usage(void) " -c repair corrupted label checksums\n" " -u restore the label on a detached device\n" "\n" - " : path to vdev\n"); + " : path to vdev\n" + "\n" + " metaslab leak \n" + " apply allocation map from zdb to specified pool\n"); exit(1); } @@ -500,6 +504,186 @@ zhack_do_feature(int argc, char **argv) return (0); } +static boolean_t +strstarts(const char *a, const char *b) +{ + return (strncmp(a, b, strlen(b)) == 0); +} + +static void +metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size, + dmu_tx_t *tx) +{ + ASSERT(msp->ms_disabled); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + uint64_t txg = dmu_tx_get_txg(tx); + + uint64_t off = start; + while (off < start + size) { + uint64_t ostart, osize; + boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable, + off, start + size - off, &ostart, &osize); + if (!found) + break; + zfs_range_tree_remove(msp->ms_allocatable, ostart, osize); + + if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp, + txg); + + zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart, + osize); + msp->ms_allocating_total += osize; + off = ostart + osize; + } +} + +static void +zhack_do_metaslab_leak(int argc, char **argv) +{ + int c; + char *target; + spa_t *spa; + + optind = 1; + boolean_t force = B_FALSE; + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER); + + char *line = NULL; + size_t cap = 0; + + vdev_t *vd = NULL; + metaslab_t *prev = NULL; + dmu_tx_t *tx = NULL; + while (getline(&line, &cap, stdin) > 0) { + if (strstarts(line, "\tvdev ")) { + uint64_t vdev_id, ms_shift; + if (sscanf(line, + "\tvdev %10"PRIu64"\t%*s metaslab shift %4"PRIu64, + &vdev_id, &ms_shift) == 1) { + VERIFY3U(sscanf(line, "\tvdev %"PRIu64 + "\t metaslab shift %4"PRIu64, + &vdev_id, &ms_shift), ==, 2); + } + vd = vdev_lookup_top(spa, vdev_id); + if (vd == NULL) { + fprintf(stderr, "error: no such vdev with " + "id %"PRIu64"\n", vdev_id); + break; + } + if (tx) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + tx = NULL; + prev = NULL; + } + if (vd->vdev_ms_shift != ms_shift) { + fprintf(stderr, "error: ms_shift mismatch: %" + PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift, + ms_shift); + break; + } + } else if (strstarts(line, "\tmetaslabs ")) { + uint64_t ms_count; + VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count), + ==, 1); + ASSERT(vd); + if (!force && vd->vdev_ms_count != ms_count) { + fprintf(stderr, "error: ms_count mismatch: %" + PRIu64" != %"PRIu64"\n", vd->vdev_ms_count, + ms_count); + break; + } + } else if (strstarts(line, "ALLOC:")) { + uint64_t start, size; + VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n", + &start, &size), ==, 2); + + ASSERT(vd); + metaslab_t *cur = + vd->vdev_ms[start >> vd->vdev_ms_shift]; + if (prev != cur) { + if (prev) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + } + ASSERT(cur); + metaslab_disable(cur); + mutex_enter(&cur->ms_lock); + metaslab_load(cur); + prev = cur; + tx = dmu_tx_create_dd( + spa_get_dsl(vd->vdev_spa)->dp_root_dir); + dmu_tx_assign(tx, DMU_TX_WAIT); + } + + metaslab_force_alloc(cur, start, size, tx); + } else { + continue; + } + } + if (tx) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + tx = NULL; + prev = NULL; + } + if (line) + free(line); + + spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG); + spa_close(spa, FTAG); +} + +static int +zhack_do_metaslab(int argc, char **argv) +{ + char *subcommand; + + argc--; + argv++; + if (argc == 0) { + (void) fprintf(stderr, + "error: no metaslab operation specified\n"); + usage(); + } + + subcommand = argv[0]; + if (strcmp(subcommand, "leak") == 0) { + zhack_do_metaslab_leak(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + return (0); +} + #define ASHIFT_UBERBLOCK_SHIFT(ashift) \ MIN(MAX(ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) @@ -1015,6 +1199,8 @@ main(int argc, char **argv) rv = zhack_do_feature(argc, argv); } else if (strcmp(subcommand, "label") == 0) { return (zhack_do_label(argc, argv)); + } else if (strcmp(subcommand, "metaslab") == 0) { + rv = zhack_do_metaslab(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index b26a37f5de1..26d66a45272 100644 --- a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -604,5 +604,4 @@ class RaidzExpansionRunning(ZFSError): errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS message = "A raidz device is currently expanding" - # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 index 743bd53b731..ebb13647739 100644 --- a/man/man1/zhack.1 +++ b/man/man1/zhack.1 @@ -122,6 +122,24 @@ Example: .Nm zhack Cm label repair Fl cu Ar device Fix checksums and undetach a device . +.It Xo +.Nm zhack +.Cm metaslab leak +.Op Fl f +.Ar pool +.Xc +Apply a fragmentation profile generated by +.Sy zdb +to the specified +.Ar pool Ns +\&. +.Pp +The +.Fl f +flag forces the profile to apply even if the vdevs in the +.Ar pool +don't have the same number of metaslabs as the fragmentation profile. +. .El . .Sh GLOBAL OPTIONS diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index e00544e4a5a..c3290ea1476 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -69,6 +69,13 @@ .Op Fl U Ar cache .Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … .Nm +.Fl -allocated-map +.Op Fl mAFLPXY +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … +.Op Fl t Ar txg +.Op Fl U Ar cache +.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … +.Nm .Fl O .Op Fl K Ar key .Ar dataset path @@ -128,6 +135,11 @@ that zdb may interpret inconsistent pool data and behave erratically. .Sh OPTIONS Display options: .Bl -tag -width Ds +.It Fl Sy -allocated-map +Prints out a list of all the allocated regions in the pool. +Primarily intended for use with the +.Nm zhack metaslab leak +subcommand. .It Fl b , -block-stats Display statistics regarding the number, size .Pq logical, physical and allocated diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index da33fc9024c..db3f25dc426 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -379,7 +379,7 @@ tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] tests = ['zhack_label_repair_001', 'zhack_label_repair_002', - 'zhack_label_repair_003', 'zhack_label_repair_004'] + 'zhack_label_repair_003', 'zhack_label_repair_004', 'zhack_metaslab_leak'] pre = post = tags = ['functional', 'cli_root', 'zhack'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 116873794b7..d9ec3686cea 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1009,6 +1009,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ + functional/cli_root/zhack/zhack_metaslab_leak.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh new file mode 100755 index 00000000000..cc72d6313d4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack metaslab leak functions correctly +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Gather pool capacity stats +# 3. Generate fragmentation data with zdb +# 4. Destroy the pool +# 5. Create a new pool with the same configuration +# 6. Export the pool +# 7. Apply the fragmentation information with zhack metaslab leak +# 8. Import the pool +# 9. Verify that pool capacity stats match + +. "$STF_SUITE"/include/libtest.shlib + +verify_runnable "global" + +function cleanup +{ + zpool destroy $TESTPOOL + rm $tmp +} + +log_onexit cleanup +log_assert "zhack metaslab leak leaks the right amount of space" + +typeset tmp=$(mktemp) + +log_must zpool create $TESTPOOL $DISKS +for i in `seq 1 16`; do + log_must dd if=/dev/urandom of=/$TESTPOOL/f$i bs=1M count=16 + log_must zpool sync $TESTPOOL +done +for i in `seq 2 2 16`; do + log_must rm /$TESTPOOL/f$i +done +for i in `seq 1 16`; do + log_must touch /$TESTPOOL/g$i + log_must zpool sync $TESTPOOL +done + +alloc=$(zpool get -Hpo value alloc $TESTPOOL) +log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp" +log_must zpool destroy $TESTPOOL + +log_must zpool create $TESTPOOL $DISKS +log_must zpool export $TESTPOOL +log_must eval "zhack metaslab leak $TESTPOOL < $tmp" +log_must zpool import $TESTPOOL + +alloc2=$(zpool get -Hpo value alloc $TESTPOOL) + +[[ $((alloc * 1.05)) -gt $alloc2 ]] && [[ $alloc -lt $alloc2 ]] || \ + log_fail "space usage changed too much: $alloc to $alloc2" + +log_pass "zhack metaslab leak behaved correctly" From bc4aac039531aac14d93e2643ca72050a3e850d8 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 9 Sep 2025 12:09:35 -0700 Subject: [PATCH 08/23] Enable zhack to work properly with 4k sector size disks Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #17576 --- cmd/zhack.c | 9 +++++++-- include/sys/zfs_file.h | 2 +- lib/libzpool/kernel.c | 6 +++--- man/man1/zhack.1 | 2 ++ module/os/freebsd/zfs/zfs_file_os.c | 3 ++- module/os/linux/zfs/zfs_file_os.c | 3 ++- module/zfs/vdev_file.c | 3 ++- 7 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cmd/zhack.c b/cmd/zhack.c index fe1c697e194..edf9dfa2cec 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -70,7 +70,8 @@ static __attribute__((noreturn)) void usage(void) { (void) fprintf(stderr, - "Usage: zhack [-c cachefile] [-d dir] ...\n" + "Usage: zhack [-o tunable] [-c cachefile] [-d dir] " + " ...\n" "where is one of the following:\n" "\n"); @@ -1169,7 +1170,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); zfs_prop_init(); - while ((c = getopt(argc, argv, "+c:d:")) != -1) { + while ((c = getopt(argc, argv, "+c:d:o:")) != -1) { switch (c) { case 'c': g_importargs.cachefile = optarg; @@ -1178,6 +1179,10 @@ main(int argc, char **argv) assert(g_importargs.paths < MAX_NUM_PATHS); g_importargs.path[g_importargs.paths++] = optarg; break; + case 'o': + if (handle_tunable_option(optarg, B_FALSE) != 0) + exit(1); + break; default: usage(); break; diff --git a/include/sys/zfs_file.h b/include/sys/zfs_file.h index a1f344c2bb7..67abe9988aa 100644 --- a/include/sys/zfs_file.h +++ b/include/sys/zfs_file.h @@ -46,7 +46,7 @@ void zfs_file_close(zfs_file_t *fp); int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, - ssize_t *resid); + uint8_t ashift, ssize_t *resid); int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, ssize_t *resid); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index fea2f81458f..8ed37462726 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1238,7 +1238,7 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, - size_t count, loff_t pos, ssize_t *resid) + size_t count, loff_t pos, uint8_t ashift, ssize_t *resid) { ssize_t rc, split, done; int sectors; @@ -1248,8 +1248,8 @@ zfs_file_pwrite(zfs_file_t *fp, const void *buf, * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ - sectors = count >> SPA_MINBLOCKSHIFT; - split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; + sectors = count >> ashift; + split = (sectors > 0 ? rand() % sectors : 0) << ashift; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 index ebb13647739..63658cf930e 100644 --- a/man/man1/zhack.1 +++ b/man/man1/zhack.1 @@ -161,6 +161,8 @@ Search for members in .Ar dir . Can be specified more than once. +.It Fl o Ar var Ns = Ns Ar value +Set the given tunable to the provided value. .El . .Sh EXAMPLES diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index 21e5f7938f9..ca13569a123 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; return (zfs_file_write_impl(fp, buf, count, &off, resid)); } diff --git a/module/os/linux/zfs/zfs_file_os.c b/module/os/linux/zfs/zfs_file_os.c index c729947369c..3fdcdbac6f6 100644 --- a/module/os/linux/zfs/zfs_file_os.c +++ b/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index f457669bc80..20b4db65ec0 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; From cd6db758f34a66cd6717557c34e847c5ceed6208 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 10 Sep 2025 12:27:39 -0600 Subject: [PATCH 09/23] Fix the build of crypto_test on LP32 architectures test->id is a uint64_t, not a long. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Alan Somers Sponsored by: ConnectWise Closes #17707 --- tests/zfs-tests/cmd/crypto_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/cmd/crypto_test.c b/tests/zfs-tests/cmd/crypto_test.c index cbebd33e0bf..c8d8622c757 100644 --- a/tests/zfs-tests/cmd/crypto_test.c +++ b/tests/zfs-tests/cmd/crypto_test.c @@ -863,7 +863,8 @@ test_result(const crypto_test_t *test, int encrypt_rv, uint8_t *encrypt_buf, return (pass); /* print summary of test result */ - printf("%s[%lu]: encrypt=%s decrypt=%s\n", test->fileloc, test->id, + printf("%s[%ju]: encrypt=%s decrypt=%s\n", test->fileloc, + (uintmax_t)test->id, encrypt_pass ? "PASS" : "FAIL", decrypt_pass ? "PASS" : "FAIL"); From 7f7c58389ec2474d56e50aea2db3b201409e8fe3 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 10 Sep 2025 14:55:58 -0700 Subject: [PATCH 10/23] ZTS: Print warning if running ZTS user_run test locally Print a warning if you're attempting to run a ZTS test that calls 'user_run', and the ephemeral user doesn't have permissions to access the test binaries. This can happen if you're running ZTS from a local git repo. In that case the test user (say, 'testuser1') may need access to the ZTS binaries in: /home//zfs/tests/zfs-tests/bin/ ... but 'testuser1' doesn't have permission to enter your home dir: /home/ The warning will help alert users to what is going on. This will not be an issue when ZTS is actually installed on the system (via 'make install' or from packages). Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #17721 --- tests/zfs-tests/include/libtest.shlib | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 23e89599cae..a9a4a90bc94 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2881,6 +2881,28 @@ function user_run log_note "user: $user" log_note "cmd: $*" + if ! sudo -Eu $user test -x $PATH ; then + log_note "-------------------------------------------------" + log_note "Warning: $user doesn't have permissions on $PATH" + log_note "" + log_note "This usually happens when you're running ZTS locally" + log_note "from inside the ZFS source dir, and are attempting to" + log_note "run a test that calls user_run. The ephemeral user" + log_note "($user) that ZTS is creating does not have permission" + log_note "to traverse to $PATH, or the binaries in $PATH are" + log_note "not the right permissions." + log_note "" + log_note "To get around this, copy your ZFS source directory" + log_note "to a world-accessible location (like /tmp), and " + log_note "change the permissions on your ZFS source dir " + log_note "to allow access." + log_note "" + log_note "Also, verify that /dev/zfs is RW for others:" + log_note "" + log_note " sudo chmod o+rw /dev/zfs" + log_note "-------------------------------------------------" + fi + typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err From 0620c979a5680031a0d1d183208ad3d0f0e48a2e Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 25 Aug 2025 15:23:47 -0700 Subject: [PATCH 11/23] Remove RAIDZ reconstruct flags from debug defaults Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #17227 --- module/zfs/spa_misc.c | 6 +++--- .../tests/functional/cli_root/zdb/zdb_tunables.ksh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index dceafbc2755..6f7c060f97f 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds. + * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh index 46965aa7cc3..b89790d5d52 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh @@ -63,7 +63,7 @@ log_mustnot_expect 'no such tunable: 0' zdb -o show=0 log_mustnot_expect 'no such tunable: 1' zdb -o info=1 # can set multiple in same command -log_must eval 'zdb -o zfs_recover=1 -o zfs_flags=512 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_flags: 4294965758 -> 512$"' +log_must eval 'zdb -o zfs_recover=1 -o zfs_flags=512 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_flags: 4294932990 -> 512$"' # can set and show in same command log_must eval 'zdb -o zfs_recover=1 -o zfs_recover -o zfs_recover=0 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_recover: 1 zfs_recover: 1 -> 0$"' From d64711c20205b3ffa6ce1e0495d55207d6bbd80b Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 27 Aug 2025 16:41:48 -0700 Subject: [PATCH 12/23] Detect a slow raidz child during reads A single slow responding disk can affect the overall read performance of a raidz group. When a raidz child disk is determined to be a persistent slow outlier, then have it sit out during reads for a period of time. The raidz group can use parity to reconstruct the data that was skipped. Each time a slow disk is placed into a sit out period, its `vdev_stat.vs_slow_ios count` is incremented and a zevent class `ereport.fs.zfs.delay` is posted. The length of the sit out period can be changed using the `raid_read_sit_out_secs` module parameter. Setting it to zero disables slow outlier detection. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Contributions-by: Don Brady Contributions-by: Brian Behlendorf Closes #17227 --- include/os/freebsd/spl/sys/time.h | 11 + include/os/linux/spl/sys/time.h | 8 + include/sys/fm/fs/zfs.h | 1 + include/sys/fs/zfs.h | 3 + include/sys/vdev_impl.h | 6 + include/sys/vdev_raidz.h | 3 + include/sys/vdev_raidz_impl.h | 2 + lib/libspl/include/sys/time.h | 9 + lib/libzfs/libzfs.abi | 4 +- lib/libzfs/libzfs_pool.c | 14 +- lib/libzfs/libzfs_util.c | 5 + man/man4/zfs.4 | 37 ++ man/man7/vdevprops.7 | 39 +- man/man8/zpool-events.8 | 10 + module/zcommon/zpool_prop.c | 6 + module/zfs/vdev.c | 117 +++++- module/zfs/vdev_draid.c | 28 ++ module/zfs/vdev_raidz.c | 345 ++++++++++++++++++ tests/runfiles/common.run | 9 +- tests/runfiles/linux.run | 3 +- tests/zfs-tests/include/libtest.shlib | 32 ++ tests/zfs-tests/include/tunables.cfg | 3 + tests/zfs-tests/tests/Makefile.am | 5 + .../events/slow_vdev_degraded_sit_out.ksh | 106 ++++++ .../functional/events/slow_vdev_sit_out.ksh | 102 ++++++ .../events/slow_vdev_sit_out_neg.ksh | 116 ++++++ .../replacement/attach_resilver_sit_out.ksh | 189 ++++++++++ .../replacement/replace_resilver_sit_out.ksh | 199 ++++++++++ 28 files changed, 1399 insertions(+), 13 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh create mode 100755 tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh create mode 100755 tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh diff --git a/include/os/freebsd/spl/sys/time.h b/include/os/freebsd/spl/sys/time.h index 2f5fe4619ef..14b42f2e708 100644 --- a/include/os/freebsd/spl/sys/time.h +++ b/include/os/freebsd/spl/sys/time.h @@ -62,6 +62,17 @@ typedef longlong_t hrtime_t; #define SEC_TO_TICK(sec) ((sec) * hz) #define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) +static __inline hrtime_t +getlrtime(void) +{ + struct timespec ts; + hrtime_t nsec; + + getnanouptime(&ts); + nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; + return (nsec); +} + static __inline hrtime_t gethrtime(void) { diff --git a/include/os/linux/spl/sys/time.h b/include/os/linux/spl/sys/time.h index 33b273b5399..4edc42a8aef 100644 --- a/include/os/linux/spl/sys/time.h +++ b/include/os/linux/spl/sys/time.h @@ -79,6 +79,14 @@ gethrestime_sec(void) return (ts.tv_sec); } +static inline hrtime_t +getlrtime(void) +{ + inode_timespec_t ts; + ktime_get_coarse_ts64(&ts); + return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); +} + static inline hrtime_t gethrtime(void) { diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 659c64bf15a..a771b11420f 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -58,6 +58,7 @@ extern "C" { #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" #define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" +#define FM_EREPORT_ZFS_SITOUT "sitout" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index fc359c10365..49ab9d3db79 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -385,6 +385,8 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, + VDEV_PROP_SIT_OUT, + VDEV_PROP_AUTOSIT, VDEV_NUM_PROPS } vdev_prop_t; @@ -1673,6 +1675,7 @@ typedef enum { ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_STREAM_LARGE_MICROZAP, + ZFS_ERR_TOO_MANY_SITOUTS, } zfs_errno_t; /* diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 4ab472bd674..5a8c2f846be 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -279,10 +279,12 @@ struct vdev { uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_autosit; /* automatic sitout management */ boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ + uint64_t vdev_last_latency_check; /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ @@ -431,6 +433,10 @@ struct vdev { hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ + /* used to calculate average read latency */ + uint64_t *vdev_prev_histo; + int64_t vdev_outlier_count; /* read outlier amongst peers */ + hrtime_t vdev_read_sit_out_expire; /* end of sit out period */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 3b02728cdbf..df8c2aed404 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); struct raidz_row *vdev_raidz_row_alloc(int, zio_t *); void vdev_raidz_reflow_copy_scratch(spa_t *); void raidz_dtl_reassessed(vdev_t *); +boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t); +void vdev_raidz_sit_child(vdev_t *, uint64_t); +void vdev_raidz_unsit_child(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index debce6f09a2..8c8dcfb077f 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -119,6 +119,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -133,6 +134,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_outlier_cnt; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h index da80a5852ae..062c6ec979f 100644 --- a/lib/libspl/include/sys/time.h +++ b/lib/libspl/include/sys/time.h @@ -97,6 +97,15 @@ gethrestime_sec(void) return (tv.tv_sec); } +static inline hrtime_t +getlrtime(void) +{ + struct timeval tv; + (void) gettimeofday(&tv, NULL); + return ((((uint64_t)tv.tv_sec) * NANOSEC) + + ((uint64_t)tv.tv_usec * NSEC_PER_USEC)); +} + static inline hrtime_t gethrtime(void) { diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index ba161d1ef10..184ea4a55b4 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -6117,7 +6117,9 @@ - + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 10b42720e96..ce154ae1a4c 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5549,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); + if (prop == VDEV_PROP_SIT_OUT) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) @@ -5718,8 +5720,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, nvlist_free(nvl); nvlist_free(outnvl); - if (ret) - (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); + if (ret) { + if (errno == ENOTSUP) { + zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "property not supported for this vdev")); + (void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf); + } else { + (void) zpool_standard_error(zhp->zpool_hdl, errno, + errbuf); + } + } return (ret); } diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 4edddc2a759..26f5135dff6 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -776,6 +776,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_ASHIFT_MISMATCH: zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); break; + case ZFS_ERR_TOO_MANY_SITOUTS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "too many disks " + "already sitting out")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e865d6a79c5..7f1adaceb40 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -4,6 +4,7 @@ .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2023, 2024, 2025, Klara, Inc. +.\" .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -601,6 +602,42 @@ new format when enabling the feature. The default is to convert all log entries. . +.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong +When a slow disk outlier is detected it is placed in a sit out state. +While sitting out the disk will not participate in normal reads, instead its +data will be reconstructed as needed from parity. +Scrub operations will always read from a disk, even if it's sitting out. +A number of disks in a RAID-Z or dRAID vdev may sit out at the same time, up +to the number of parity devices. +Writes will still be issued to a disk which is sitting out to maintain full +redundancy. +Defaults to 600 seconds and a value of zero disables disk sit-outs in general, +including slow disk outlier detection. +. +.It Sy vdev_raidz_outlier_check_interval_ms Ns = Ns Sy 1000 Ns ms Po 1 sec Pc Pq ulong +How often each RAID-Z and dRAID vdev will check for slow disk outliers. +Increasing this interval will reduce the sensitivity of detection (since all +I/Os since the last check are included in the statistics), but will slow the +response to a disk developing a problem. +Defaults to once per second; setting extremely small values may cause negative +performance effects. +. +.It Sy vdev_raidz_outlier_insensitivity Ns = Ns Sy 50 Pq uint +When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is +used to determine how far out an outlier must be before it counts as an event +worth consdering. +This is phrased as "insensitivity" because larger values result in fewer +detections. +Smaller values will result in more aggressive sitting out of disks that may have +problems, but may significantly increase the rate of spurious sit-outs. +.Pp +To provide a more technical definition of this parameter, this is the multiple +of the inter-quartile range (IQR) that is being used in a Tukey's Fence +detection algorithm. +This is much higher than a normal Tukey's Fence k-value, because the +distribution under consideration is probably an extreme-value distribution, +rather than a more typical Gaussian distribution. +. .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 61e60d95041..0fb28d7db13 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -19,7 +19,7 @@ .\" .\" CDDL HEADER END .\" -.\" Copyright (c) 2021 Klara, Inc. +.\" Copyright (c) 2021, 2025, Klara, Inc. .\" .Dd July 23, 2024 .Dt VDEVPROPS 7 @@ -106,11 +106,17 @@ The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev .It Sy slow_ios -The number of slow I/Os encountered by this vdev, -These represent I/O operations that didn't complete in +This indicates the number of slow I/O operations encountered by this vdev. +A slow I/O is defined as an operation that did not complete within the .Sy zio_slow_io_ms -milliseconds +threshold in milliseconds .Pq Sy 30000 No by default . +For +.Sy RAIDZ +and +.Sy DRAID +configurations, this value also represents the number of times the vdev was +identified as an outlier and excluded from participating in read I/O operations. .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo @@ -150,6 +156,31 @@ The amount of space to reserve for the EFI system partition .It Sy failfast If this device should propagate BIO errors back to ZFS, used to disable failfast. +.It Sy sit_out +Only valid for +.Sy RAIDZ +and +.Sy DRAID +vdevs. +True when a slow disk outlier was detected and the vdev is currently in a sit +out state. +This property can be manually set to cause vdevs to sit out. +It will also be automatically set by the +.Sy autosit +logic if that is enabled. +While sitting out, the vdev will not participate in normal reads, instead its +data will be reconstructed as needed from parity. +.It Sy autosit +Only valid for +.Sy RAIDZ +and +.Sy DRAID +vdevs. +If set, this enables the kernel-level slow disk detection logic. +This logic automatically causes any vdevs that are significant negative +performance outliers to sit out, as described in the +.Sy sit_out +property. .It Sy path The path to the device for this vdev .It Sy allocating diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 2d32dce2bb6..36a9864dc73 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -190,6 +190,16 @@ Issued when a scrub is resumed on a pool. .It Sy scrub.paused Issued when a scrub is paused on a pool. .It Sy bootfs.vdev.attach +.It Sy sitout +Issued when a +.Sy RAIDZ +or +.Sy DRAID +vdev triggers the +.Sy autosit +logic. +This logic detects when a disk in such a vdev is significantly slower than its +peers, and sits them out temporarily to preserve the performance of the pool. .El . .Sh PAYLOADS diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 04ae9f986d8..07819ba2be8 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -467,9 +467,15 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ed04ce0c86e..fc6d445f978 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ @@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. */ @@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. */ @@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case VDEV_PROP_AUTOSIT: + /* Only raidz vdevs cannot have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index a05289102af..8588cfee3f7 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc. */ #include @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b597d6daefd..80727b0d8f9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include @@ -355,6 +356,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; */ uint_t raidz_expand_pause_point = 0; +/* + * This represents the duration for a slow drive read sit out. + */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth consdering. + * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. + */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + /* * Maximum amount of copy io's outstanding at once. */ @@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * return B_TRUE if a read should be skipped due to being too slow. + * + * In vdev_child_slow_outlier() it looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); +} + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small and + * and the latencies values are very close, make sure the IQR + * is at least 25% larger than Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); +} +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from latest set of child reads. + * + * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude times larger than the + * median. This large value should capture any truly fault disk quickly, + * without causing spurious sit-outs. + * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. */ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persitently has a + * higher than peers outlier count will be considered + * a slow disk. + */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, ULONG, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index db3f25dc426..b6cb2d559af 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -940,10 +940,11 @@ tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', - 'attach_resilver', 'detach', 'rebuild_disabled_feature', - 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', - 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', - 'scrub_cancel'] + 'attach_resilver', 'attach_resilver_sit_out', 'detach', + 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', + 'replace_import', 'replace_rebuild', 'replace_resilver', + 'replace_resilver_sit_out', 'resilver_restart_001', + 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index ba367fad402..86fe4d699f9 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -109,7 +109,8 @@ tags = ['functional', 'direct'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', - 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple'] + 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple', + 'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', 'slow_vdev_degraded_sit_out'] tags = ['functional', 'events'] [tests/functional/fallocate:Linux] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index a9a4a90bc94..6b0f8b18c4b 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1112,6 +1112,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop="$1" + typeset pool="$2" + typeset vdev="$3" + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? otherwise # # $1 - pool name @@ -1970,6 +1980,28 @@ function wait_vdev_state # pool disk state timeout return 1 } +# +# Wait for vdev 'sit_out' property to be cleared. +# +# $1 pool name +# $2 vdev name +# $3 timeout +# +function wait_sit_out #pool vdev timeout +{ + typeset pool=${1:-$TESTPOOL} + typeset vdev="$2" + typeset timeout=${3:-300} + for (( timer = 0; timer < $timeout; timer++ )); do + if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then + return 0 + fi + sleep 1; + done + + return 1 +} + # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index f2d7ceac0cb..b8d72f391bd 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -72,6 +72,9 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes +READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs +SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms +SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS vdev.removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT vdev.remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index d9ec3686cea..dfc57dfedd2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1525,6 +1525,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ + functional/events/slow_vdev_degraded_sit_out.ksh \ + functional/events/slow_vdev_sit_out.ksh \ + functional/events/slow_vdev_sit_out_neg.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ @@ -1937,6 +1940,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ + functional/replacement/attach_resilver_sit_out.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ @@ -1945,6 +1949,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ + functional/replacement/replace_resilver_sit_out.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh new file mode 100755 index 00000000000..d5feb6936b4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025 by Klara, Inc. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Degrade/fault one of the disks. +# 3. Inject delays into one of the disks +# 4. Verify disk is set to 'sit out' for awhile. +# 5. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted. +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 5 + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz2 raidz3 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0" + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + SLOW_VDEV=$TEST_BASE_DIR/vdev.$$.8 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $SLOW_VDEV -D200:1 -T read $TESTPOOL2 + type=$((RANDOM % 2)) + [[ "$type" -eq "0" ]] && action="degrade" || action="fault" + log_must zinject -d $BAD_VDEV -A $action -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10 + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" + destroy_pool $TESTPOOL2 + log_must zpool labelclear -f $BAD_VDEV +done + +log_pass "sit_out works correctly" diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh new file mode 100755 index 00000000000..37f616cf56e --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Inject delays into one of the disks +# 3. Verify disk is set to 'sit out' for awhile. +# 4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted. +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 5 + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s200M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0" + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=600 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 200)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10 + + # Verify sit_out was cleared during wait_sit_out + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" + + destroy_pool $TESTPOOL2 +done + +log_pass "sit_out works correctly" diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh new file mode 100755 index 00000000000..457105a6645 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025 by Klara, Inc. + +# DESCRIPTION: +# Verify that we don't sit out too many vdevs +# +# STRATEGY: +# 1. Create draid2 pool +# 2. Inject delays into three of the disks +# 3. Do reads to trigger sit-outs +# 4. Verify exactly 2 disks sit out +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +log_must zpool create $TESTPOOL2 draid2 $TEST_BASE_DIR/vdev.$$.{0..9} +log_must zpool set autosit=on $TESTPOOL2 draid2-0 +log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400 +log_must zpool export $TESTPOOL2 +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + +BAD_VDEV1=$TEST_BASE_DIR/vdev.$$.7 +BAD_VDEV2=$TEST_BASE_DIR/vdev.$$.8 +BAD_VDEV3=$TEST_BASE_DIR/vdev.$$.9 + +# Initial state should not be sitting out +log_must eval [[ "$(get_vdev_prop autosit $TESTPOOL2 draid2-0)" == "on" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "off" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "off" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" ]] + +# Delay our reads 200ms to trigger sit out +log_must zinject -d $BAD_VDEV1 -D200:1 -T read $TESTPOOL2 + +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "on" + +log_must zinject -d $BAD_VDEV2 -D200:1 -T read $TESTPOOL2 +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "on" + +log_must zinject -d $BAD_VDEV3 -D200:1 -T read $TESTPOOL2 +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" + + +log_pass "sit_out works correctly" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh new file mode 100755 index 00000000000..6820aba184b --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh @@ -0,0 +1,189 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Attaching disks while a disk is sitting out reads should pass +# +# STRATEGY: +# 1. Create raidz pools +# 2. Make one disk slower and trigger a read sit out for that disk +# 3. Start some random I/O +# 4. Attach a disk to the pool. +# 5. Verify the integrity of the file system and the resilvering. + +verify_runnable "global" + +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 120 +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O with a sit out completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function attach_test +{ + typeset vdev=$1 + typeset disk=$2 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! + + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + # attach disk with a slow drive still present + SECONDS=0 + log_must zpool attach -w $TESTPOOL1 $vdev $disk + log_note took $SECONDS seconds to attach disk + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zinject -c all + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +DEVSIZE="150M" +specials_list="" +i=0 +while [[ $i != 10 ]]; do + truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +slow_disk=$TESTDIR/$TESTFILE1.3 +log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE + +# Test file size in MB +count=200 + +for type in "raidz1" "raidz2" "raidz3" ; do + create_pool $TESTPOOL1 $type $specials_list + log_must zpool set autosit=on $TESTPOOL1 "${type}-0" + log_must zfs create -o primarycache=none -o recordsize=512K \ + $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count + + # Make one disk 100ms slower to trigger a sit out + log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1 + + # Do some reads and wait for sit out on slow disk + SECONDS=0 + typeset -i size=0 + for i in $(seq 1 $count) ; do + dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null + size=$i + + sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on" + log_note took $SECONDS seconds to reach sit out reading ${size}M + log_must zpool status -s $TESTPOOL1 + + typeset top=$(zpool status -j | jq -r ".pools.$TESTPOOL1.vdevs[].vdevs[].name") + attach_test $top $TESTDIR/$REPLACEFILE + + log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\"" + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh new file mode 100755 index 00000000000..4109dbaf45a --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh @@ -0,0 +1,199 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Replacing disks while a disk is sitting out reads should pass +# +# STRATEGY: +# 1. Create raidz and draid pools +# 2. Make one disk slower and trigger a read sit out for that disk +# 3. Start some random I/O +# 4. Replace a disk in the pool with another disk. +# 5. Verify the integrity of the file system and the resilvering. +# + +verify_runnable "global" + +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 120 +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids + do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O with a sit out completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function replace_test +{ + typeset -i iters=2 + typeset disk1=$1 + typeset disk2=$2 + typeset repl_type=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! + + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + typeset repl_flag="-w" + if [[ "$repl_type" == "seq" ]]; then + repl_flag="-ws" + fi + # replace disk with a slow drive still present + SECONDS=0 + log_must zpool replace $repl_flag $TESTPOOL1 $disk1 $disk2 + log_note took $SECONDS seconds to replace disk + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zinject -c all + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +DEVSIZE="150M" +specials_list="" +i=0 +while [[ $i != 10 ]]; do + log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +slow_disk=$TESTDIR/$TESTFILE1.3 +log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE + +# Test file size in MB +count=400 + +for type in "raidz2" "raidz3" "draid2"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zpool set autosit=on $TESTPOOL1 "${type}-0" + log_must zfs create -o primarycache=none -o recordsize=512K \ + $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count + + # Make one disk 100ms slower to trigger a sit out + log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1 + + # Do some reads and wait for sit out on slow disk + SECONDS=0 + typeset -i size=0 + for i in $(seq 1 $count) ; do + dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null + size=$i + + sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on" + log_note took $SECONDS seconds to reach sit out reading ${size}M + log_must zpool status -s $TESTPOOL1 + + typeset repl_type="replace" + if [[ "$type" == "draid2" && $((RANDOM % 2)) -eq 0 ]]; then + repl_type="seq" + fi + replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE $repl_type + + log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\"" + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 +done + +log_pass From bc0b5318aa98fed6e6c0db2181a0f85edce3265d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 11 Sep 2025 10:58:46 -0700 Subject: [PATCH 13/23] Prevent scrubbing a read-only pool While it would be nice to be able to scrub a pool imported read-only this will currently trip an ASSERT. Before we can support this there are some designs challenges which need to be thought through first. For starters, a read-only import skips reading certain information from disk which it knows won't be needed, such as the space maps. Furthermore, the scrub process expects to be checkpoint it's progress, update the on disk error log, and issue repair IO. None of which would be possible when the pool is imported read-only. Each of these wrinkles can certainly be handled, but that will take some signifcant work. In the meanwhile we disable the 'zpool scrub' command when the pool is imported read-only. Reviewed-by: Alan Somers Signed-off-by: Brian Behlendorf Issue #17527 Closes #17717 --- module/zfs/zfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 76c9d4ccd51..4227b0594b1 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7619,7 +7619,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, From 654f2dcb42c574f8367a73921ab7719ad4a9721d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 11 Sep 2025 11:34:07 -0700 Subject: [PATCH 14/23] zed: Add synchronous zedlets Historically, ZED has blindly spawned off zedlets in parallel and never worried about their completion order. This means that you can potentially have zedlets for event number 2 starting before zedlets for event number 1 had finished. Most of the time this is fine, and it actually helps a lot when the system is getting spammed with hundreds of events. However, there are times when you want your zedlets to be executed in sequence with the event ID. That is where synchronous zedlets come in. ZED will wait for all previously spawned zedlets to finish before running a synchronous zedlet. Synchronous zedlets are guaranteed to be the only zedlet running. No other zedlets may run in parallel with a synchronous zedlet. Users should be careful to only use synchronous zedlets when needed, since they decrease parallelism. To make a zedlet synchronous, simply add a "-sync-" immediately following the event name in the zedlet's file name: EVENT_NAME-sync-ZEDLETNAME.sh For example, if you wanted a synchronous statechange script: statechange-sync-myzedlet.sh Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #17335 --- cmd/zed/zed.d/Makefile.am | 24 +-- ...n-slot_off.sh => deadman-sync-slot_off.sh} | 0 cmd/zed/zed.d/pool_import-led.sh | 1 - cmd/zed/zed.d/pool_import-sync-led.sh | 1 + ...echange-led.sh => statechange-sync-led.sh} | 0 ...ot_off.sh => statechange-sync-slot_off.sh} | 0 cmd/zed/zed.d/vdev_attach-led.sh | 1 - cmd/zed/zed.d/vdev_attach-sync-led.sh | 1 + cmd/zed/zed.d/vdev_clear-led.sh | 1 - cmd/zed/zed.d/vdev_clear-sync-led.sh | 1 + cmd/zed/zed_exec.c | 111 ++++++++++--- man/man8/zed.8.in | 32 ++++ tests/runfiles/linux.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../events/zed_synchronous_zedlet.ksh | 149 ++++++++++++++++++ 15 files changed, 289 insertions(+), 37 deletions(-) rename cmd/zed/zed.d/{deadman-slot_off.sh => deadman-sync-slot_off.sh} (100%) delete mode 120000 cmd/zed/zed.d/pool_import-led.sh create mode 120000 cmd/zed/zed.d/pool_import-sync-led.sh rename cmd/zed/zed.d/{statechange-led.sh => statechange-sync-led.sh} (100%) rename cmd/zed/zed.d/{statechange-slot_off.sh => statechange-sync-slot_off.sh} (100%) delete mode 120000 cmd/zed/zed.d/vdev_attach-led.sh create mode 120000 cmd/zed/zed.d/vdev_attach-sync-led.sh delete mode 120000 cmd/zed/zed.d/vdev_clear-led.sh create mode 120000 cmd/zed/zed.d/vdev_clear-sync-led.sh create mode 100755 tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index 093a04c4636..c0b161ecf24 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -9,18 +9,18 @@ dist_zedexec_SCRIPTS = \ %D%/all-debug.sh \ %D%/all-syslog.sh \ %D%/data-notify.sh \ - %D%/deadman-slot_off.sh \ + %D%/deadman-sync-slot_off.sh \ %D%/generic-notify.sh \ - %D%/pool_import-led.sh \ + %D%/pool_import-sync-led.sh \ %D%/resilver_finish-notify.sh \ %D%/resilver_finish-start-scrub.sh \ %D%/scrub_finish-notify.sh \ - %D%/statechange-led.sh \ + %D%/statechange-sync-led.sh \ %D%/statechange-notify.sh \ - %D%/statechange-slot_off.sh \ + %D%/statechange-sync-slot_off.sh \ %D%/trim_finish-notify.sh \ - %D%/vdev_attach-led.sh \ - %D%/vdev_clear-led.sh + %D%/vdev_attach-sync-led.sh \ + %D%/vdev_clear-sync-led.sh nodist_zedexec_SCRIPTS = \ %D%/history_event-zfs-list-cacher.sh @@ -30,17 +30,17 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS) zedconfdefaults = \ all-syslog.sh \ data-notify.sh \ - deadman-slot_off.sh \ + deadman-sync-slot_off.sh \ history_event-zfs-list-cacher.sh \ - pool_import-led.sh \ + pool_import-sync-led.sh \ resilver_finish-notify.sh \ resilver_finish-start-scrub.sh \ scrub_finish-notify.sh \ - statechange-led.sh \ + statechange-sync-led.sh \ statechange-notify.sh \ - statechange-slot_off.sh \ - vdev_attach-led.sh \ - vdev_clear-led.sh + statechange-sync-slot_off.sh \ + vdev_attach-sync-led.sh \ + vdev_clear-sync-led.sh dist_noinst_DATA += %D%/README diff --git a/cmd/zed/zed.d/deadman-slot_off.sh b/cmd/zed/zed.d/deadman-sync-slot_off.sh similarity index 100% rename from cmd/zed/zed.d/deadman-slot_off.sh rename to cmd/zed/zed.d/deadman-sync-slot_off.sh diff --git a/cmd/zed/zed.d/pool_import-led.sh b/cmd/zed/zed.d/pool_import-led.sh deleted file mode 120000 index 7d7404398a4..00000000000 --- a/cmd/zed/zed.d/pool_import-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/pool_import-sync-led.sh b/cmd/zed/zed.d/pool_import-sync-led.sh new file mode 120000 index 00000000000..8b9c10c11eb --- /dev/null +++ b/cmd/zed/zed.d/pool_import-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-sync-led.sh similarity index 100% rename from cmd/zed/zed.d/statechange-led.sh rename to cmd/zed/zed.d/statechange-sync-led.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-sync-slot_off.sh similarity index 100% rename from cmd/zed/zed.d/statechange-slot_off.sh rename to cmd/zed/zed.d/statechange-sync-slot_off.sh diff --git a/cmd/zed/zed.d/vdev_attach-led.sh b/cmd/zed/zed.d/vdev_attach-led.sh deleted file mode 120000 index 7d7404398a4..00000000000 --- a/cmd/zed/zed.d/vdev_attach-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/vdev_attach-sync-led.sh b/cmd/zed/zed.d/vdev_attach-sync-led.sh new file mode 120000 index 00000000000..8b9c10c11eb --- /dev/null +++ b/cmd/zed/zed.d/vdev_attach-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-led.sh deleted file mode 120000 index 7d7404398a4..00000000000 --- a/cmd/zed/zed.d/vdev_clear-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/vdev_clear-sync-led.sh b/cmd/zed/zed.d/vdev_clear-sync-led.sh new file mode 120000 index 00000000000..8b9c10c11eb --- /dev/null +++ b/cmd/zed/zed.d/vdev_clear-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh \ No newline at end of file diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c index 036081decd6..a14af4f20a8 100644 --- a/cmd/zed/zed_exec.c +++ b/cmd/zed/zed_exec.c @@ -196,37 +196,29 @@ _nop(int sig) (void) sig; } -static void * -_reap_children(void *arg) +static void +wait_for_children(boolean_t do_pause, boolean_t wait) { - (void) arg; - struct launched_process_node node, *pnode; pid_t pid; - int status; struct rusage usage; - struct sigaction sa = {}; - - (void) sigfillset(&sa.sa_mask); - (void) sigdelset(&sa.sa_mask, SIGCHLD); - (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); - - (void) sigemptyset(&sa.sa_mask); - sa.sa_handler = _nop; - sa.sa_flags = SA_NOCLDSTOP; - (void) sigaction(SIGCHLD, &sa, NULL); + int status; + struct launched_process_node node, *pnode; for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) { (void) pthread_mutex_lock(&_launched_processes_lock); - pid = wait4(0, &status, WNOHANG, &usage); - + pid = wait4(0, &status, wait ? 0 : WNOHANG, &usage); if (pid == 0 || pid == (pid_t)-1) { (void) pthread_mutex_unlock(&_launched_processes_lock); - if (pid == 0 || errno == ECHILD) - pause(); - else if (errno != EINTR) + if ((pid == 0) || (errno == ECHILD)) { + if (do_pause) + pause(); + } else if (errno != EINTR) zed_log_msg(LOG_WARNING, "Failed to wait for children: %s", strerror(errno)); + if (!do_pause) + return; + } else { memset(&node, 0, sizeof (node)); node.pid = pid; @@ -278,6 +270,25 @@ _reap_children(void *arg) } } +} + +static void * +_reap_children(void *arg) +{ + (void) arg; + struct sigaction sa = {}; + + (void) sigfillset(&sa.sa_mask); + (void) sigdelset(&sa.sa_mask, SIGCHLD); + (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); + + (void) sigemptyset(&sa.sa_mask); + sa.sa_handler = _nop; + sa.sa_flags = SA_NOCLDSTOP; + (void) sigaction(SIGCHLD, &sa, NULL); + + wait_for_children(B_TRUE, B_FALSE); + return (NULL); } @@ -306,6 +317,45 @@ zed_exec_fini(void) _reap_children_tid = (pthread_t)-1; } +/* + * Check if the zedlet name indicates if it is a synchronous zedlet + * + * Synchronous zedlets have a "-sync-" immediately following the event name in + * their zedlet filename, like: + * + * EVENT_NAME-sync-ZEDLETNAME.sh + * + * For example, if you wanted a synchronous statechange script: + * + * statechange-sync-myzedlet.sh + * + * Synchronous zedlets are guaranteed to be the only zedlet running. No other + * zedlets may run in parallel with a synchronous zedlet. A synchronous + * zedlet will wait for all previously spawned zedlets to finish before running. + * Users should be careful to only use synchronous zedlets when needed, since + * they decrease parallelism. + */ +static boolean_t +zedlet_is_sync(const char *zedlet, const char *event) +{ + const char *sync_str = "-sync-"; + size_t sync_str_len; + size_t zedlet_len; + size_t event_len; + + sync_str_len = strlen(sync_str); + zedlet_len = strlen(zedlet); + event_len = strlen(event); + + if (event_len + sync_str_len >= zedlet_len) + return (B_FALSE); + + if (strncmp(&zedlet[event_len], sync_str, sync_str_len) == 0) + return (B_TRUE); + + return (B_FALSE); +} + /* * Process the event [eid] by synchronously invoking all zedlets with a * matching class prefix. @@ -368,9 +418,28 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass, z = zed_strings_next(zcp->zedlets)) { for (csp = class_strings; *csp; csp++) { n = strlen(*csp); - if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) { + boolean_t is_sync = zedlet_is_sync(z, *csp); + + if (is_sync) { + /* + * Wait for previous zedlets to + * finish + */ + wait_for_children(B_FALSE, B_TRUE); + } + _zed_exec_fork_child(eid, zcp->zedlet_dir, z, e, zcp->zevent_fd, zcp->do_foreground); + + if (is_sync) { + /* + * Wait for sync zedlet we just launched + * to finish. + */ + wait_for_children(B_FALSE, B_TRUE); + } + } } } free(e); diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in index eda377aafc1..2d19f2d8496 100644 --- a/man/man8/zed.8.in +++ b/man/man8/zed.8.in @@ -158,6 +158,8 @@ Multiple ZEDLETs may be invoked for a given zevent. ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. +The one exception to this are "synchronous zedlets", which are described later +in this page. Common variables used by ZEDLETs can be stored in the default rc file which is sourced by scripts; these variables should be prefixed with .Sy ZED_ . @@ -233,6 +235,36 @@ and .Sy ZPOOL . These variables may be overridden in the rc file. . +.Sh Synchronous ZEDLETS +ZED's normal behavior is to spawn off zedlets in parallel and ignore their +completion order. +This means that ZED can potentially +have zedlets for event ID number 2 starting before zedlets for event ID number +1 have finished. +Most of the time this is fine, and it actually helps when the system is getting +hammered with hundreds of events. +.Pp +However, there are times when you want your zedlets to be executed in sequence +with the event ID. +That is where synchronous zedlets come in. +.Pp +ZED will wait for all previously spawned zedlets to finish before running +a synchronous zedlet. +Synchronous zedlets are guaranteed to be the only +zedlet running. +No other zedlets may run in parallel with a synchronous zedlet. +Users should be careful to only use synchronous zedlets when needed, since +they decrease parallelism. +.Pp +To make a zedlet synchronous, simply add a "-sync-" immediately following the +event name in the zedlet's file name: +.Pp +.Sy EVENT_NAME-sync-ZEDLETNAME.sh +.Pp +For example, if you wanted a synchronous statechange script: +.Pp +.Sy statechange-sync-myzedlet.sh +. .Sh FILES .Bl -tag -width "-c" .It Pa @sysconfdir@/zfs/zed.d diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 86fe4d699f9..339361cc276 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -110,7 +110,8 @@ tags = ['functional', 'direct'] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple', - 'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', 'slow_vdev_degraded_sit_out'] + 'zed_synchronous_zedlet', 'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', + 'slow_vdev_degraded_sit_out'] tags = ['functional', 'events'] [tests/functional/fallocate:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index dfc57dfedd2..bb3920a64f9 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1536,6 +1536,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_rc_filter.ksh \ functional/events/zed_slow_io.ksh \ functional/events/zed_slow_io_many_vdevs.ksh \ + functional/events/zed_synchronous_zedlet.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh b/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh new file mode 100755 index 00000000000..6b732ea96d0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +# DESCRIPTION: +# Verify ZED synchronous zedlets work as expected +# +# STRATEGY: +# 1. Create a scrub_start zedlet that runs quickly +# 2. Create a scrub_start zedlet that runs slowly (takes seconds) +# 3. Create a scrub_finish zedlet that is synchronous and runs slowly +# 4. Create a trim_start zedlet that runs quickly +# 4. Scrub the pool +# 5. Trim the pool +# 6. Verify the synchronous scrub_finish zedlet waited for the scrub_start +# zedlets to finish (including the slow one). If the scrub_finish zedlet +# was not synchronous, it would have completed before the slow scrub_start +# zedlet. +# 7. Verify the trim_start zedlet waited for the slow synchronous scrub_finish +# zedlet to complete. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/events/events_common.kshlib + +verify_runnable "both" + +OUR_ZEDLETS="scrub_start-async.sh scrub_start-slow.sh scrub_finish-sync-slow.sh trim_start-async.sh" + +OUTFILE="$TEST_BASE_DIR/zed_synchronous_zedlet_lines" +TESTPOOL2=testpool2 + +function cleanup +{ + zed_stop + + for i in $OUR_ZEDLETS ; do + log_must rm -f $ZEDLET_DIR/$i + done + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev-file-sync-zedlet + log_must rm -f $OUTFILE +} + +log_assert "Verify ZED synchronous zedlets work as expected" + +log_onexit cleanup + +# Make a pool +log_must truncate -s 100M $TEST_BASE_DIR/vdev-file-sync-zedlet +log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/vdev-file-sync-zedlet + +# Do an initial scrub +log_must zpool scrub -w $TESTPOOL2 + +log_must zpool events -c + +mkdir -p $ZEDLET_DIR + +# Create zedlets +cat << EOF > $ZEDLET_DIR/scrub_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_start-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_finish-sync-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/trim_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +for i in $OUR_ZEDLETS ; do + log_must chmod +x $ZEDLET_DIR/$i +done + +log_must zed_start + +# Do a scrub - it should be instantaneous. +log_must zpool scrub -w $TESTPOOL2 + +# Start off a trim immediately after scrubiung. The trim should be +# instantaneous and generate a trimp_start event. This will happen in parallel +# with the slow 'scrub_finish-sync-slow.sh' zedlet still running. +log_must zpool trim -w $TESTPOOL2 + +# Wait for scrub_finish event to happen for sanity. This is the *event*, not +# the completion of zedlets for the event. +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.trim_finish' 10 + +# At a minimum, scrub_start-slow.sh + scrub_finish-sync-slow.sh will take a +# total of 6 seconds to run, so wait 7 sec to be sure. +sleep 7 + +# If our zedlets were run in the right order, with sync correctly honored, you +# will see this ordering in $OUTFILE: +# +# Fri May 16 12:04:23 PDT 2025 scrub_start-async.sh +# Fri May 16 12:04:26 PDT 2025 scrub_start-slow.sh +# Fri May 16 12:04:31 PDT 2025 scrub_finish-sync-slow.sh +# Fri May 16 12:04:31 PDT 2025 trim_start-async.sh +# +# Check for this ordering + +# Get a list of just the script names in the order they were executed +# from OUTFILE +lines="$(echo $(grep -Eo '(scrub|trim)_.+\.sh$' $OUTFILE))" + +# Compare it to the ordering we expect +expected="\ +scrub_start-async.sh \ +scrub_start-slow.sh \ +scrub_finish-sync-slow.sh \ +trim_start-async.sh" +log_must test "$lines" == "$expected" + +log_pass "Verified synchronous zedlets" From 7b1cc9eb61fc20a5cc3885d9d6f49b9a3e3a964c Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Tue, 15 Jul 2025 19:28:02 +0000 Subject: [PATCH 15/23] ZFS allow send:encrypted A new `zfs allow` permissions that ONLY allows sending replication streams in raw (encrypted) mode, so encrypted data will not be decrypted as part of the replication process. Sponsored-by: Klara, Inc. Sponsored-by: Karakun AG Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Co-authored-by: JT Pennington Signed-off-by: Allan Jude Closes #17543 --- cmd/zfs/zfs_main.c | 6 ++++++ include/sys/dsl_deleg.h | 1 + include/zfs_deleg.h | 1 + man/man8/zfs-allow.8 | 5 +++-- module/zcommon/zfs_deleg.c | 1 + module/zfs/zfs_ioctl.c | 17 +++++++++++++++-- 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index ff69a8a4668..06eef778968 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -5303,6 +5303,7 @@ zfs_do_receive(int argc, char **argv) #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" @@ -5345,6 +5346,7 @@ static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SEND_RAW, ZFS_DELEG_NOTE_SEND_RAW }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, @@ -5929,6 +5931,10 @@ deleg_perm_comment(zfs_deleg_note_t note) case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; + case ZFS_DELEG_NOTE_SEND_RAW: + str = gettext("Allow sending ONLY encrypted (raw) replication" + "\n\t\t\t\tstreams"); + break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h index ae729b9f32f..36dd6211219 100644 --- a/include/sys/dsl_deleg.h +++ b/include/sys/dsl_deleg.h @@ -46,6 +46,7 @@ extern "C" { #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" diff --git a/include/zfs_deleg.h b/include/zfs_deleg.h index f80fe46d35f..a7bbf1620ad 100644 --- a/include/zfs_deleg.h +++ b/include/zfs_deleg.h @@ -55,6 +55,7 @@ typedef enum { ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, ZFS_DELEG_NOTE_SEND, + ZFS_DELEG_NOTE_SEND_RAW, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, diff --git a/man/man8/zfs-allow.8 b/man/man8/zfs-allow.8 index b154aebd92a..e3b0e1ab3e1 100644 --- a/man/man8/zfs-allow.8 +++ b/man/man8/zfs-allow.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 13, 2025 +.Dd September 8, 2025 .Dt ZFS-ALLOW 8 .Os . @@ -212,7 +212,8 @@ receive subcommand Must also have the \fBmount\fR and \fBcreate\fR ability, requ release subcommand Allows releasing a user hold which might destroy the snapshot rename subcommand Must also have the \fBmount\fR and \fBcreate\fR ability in the new parent rollback subcommand Must also have the \fBmount\fR ability -send subcommand +send subcommand Allows sending a replication stream of a dataset. +send:raw subcommand Only allows sending raw replication streams, preventing encrypted datasets being sent in decrypted form. share subcommand Allows sharing file systems over NFS or SMB protocols snapshot subcommand Must also have the \fBmount\fR ability diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c index 49bb534ca26..87596558c9a 100644 --- a/module/zcommon/zfs_deleg.c +++ b/module/zcommon/zfs_deleg.c @@ -59,6 +59,7 @@ const zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SNAPSHOT}, {ZFS_DELEG_PERM_SHARE}, {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_SEND_RAW}, {ZFS_DELEG_PERM_USERPROP}, {ZFS_DELEG_PERM_USERQUOTA}, {ZFS_DELEG_PERM_GROUPQUOTA}, diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 4227b0594b1..5ca7c2320c4 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +719,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int From 955fbc5adeddfc26a3cf14304bafbc3ab63d06b0 Mon Sep 17 00:00:00 2001 From: JT Pennington Date: Tue, 15 Jul 2025 19:32:23 +0000 Subject: [PATCH 16/23] Add send:encrypted test Create tests for the new send:encrypted permission Sponsored-by: Klara, Inc. Sponsored-by: Karakun AG Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: JT Pennington Closes #17543 --- tests/runfiles/common.run | 8 + tests/zfs-tests/tests/Makefile.am | 6 + .../cli_root/zfs_send_delegation/cleanup.ksh | 43 ++++++ .../cli_root/zfs_send_delegation/setup.ksh | 50 ++++++ .../zfs_send_delegation/zfs_send_test.ksh | 111 ++++++++++++++ .../zfs_send_delegation_user/cleanup.ksh | 43 ++++++ .../zfs_send_delegation_user/setup.ksh | 50 ++++++ .../zfs_send_usertest.ksh | 145 ++++++++++++++++++ 8 files changed, 456 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b6cb2d559af..bae9c0f3594 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -323,6 +323,10 @@ tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] tags = ['functional', 'cli_root', 'zfs_send'] +[tests/functional/cli_root/zfs_send_delegation] +tests = ['zfs_send_test'] +tags = ['functional', 'cli_root', 'zfs_send_delegation'] + [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', @@ -637,6 +641,10 @@ tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', user = tags = ['functional', 'cli_user', 'zfs_list'] +[tests/functional/cli_root/zfs_send_delegation_user] +tests = ['zfs_send_usertest'] +tags = ['functional', 'cli_root', 'zfs_send_delegation_user'] + [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bb3920a64f9..32fe351f6dd 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -892,6 +892,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ + functional/cli_root/zfs_send_delegation/cleanup.ksh \ + functional/cli_root/zfs_send_delegation/setup.ksh \ + functional/cli_root/zfs_send_delegation/zfs_send_test.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ @@ -1409,6 +1412,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ + functional/cli_user/zfs_send_delegation_user/cleanup.ksh \ + functional/cli_user/zfs_send_delegation_user/setup.ksh \ + functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh new file mode 100755 index 00000000000..4a59e15cc69 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + + +poolexists $TESTPOOL1 && \ + destroy_pool $TESTPOOL1 + +del_user $STAFF1 +del_user $STAFF2 +del_group $STAFF_GROUP + +del_user $OTHER1 +del_user $OTHER2 +del_group $OTHER_GROUP + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh new file mode 100755 index 00000000000..0978193eddc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + +# Create staff group and add two user to it +log_must add_group $STAFF_GROUP +if ! id $STAFF1 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF1 +fi +if ! id $STAFF2 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF2 +fi + +# Create other group and add two user to it +log_must add_group $OTHER_GROUP +if ! id $OTHER1 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER1 +fi +if ! id $OTHER2 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER2 +fi +DISK=${DISKS%% *} + +default_raidz_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh new file mode 100755 index 00000000000..d018f313fae --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +# STRATEGY: +# 1. Create a pool (this is done by the test framework) +# 2. Create an encrypted dataset +# 3. Write random data to the encrypted dataset +# 4. Snapshot the dataset +# 5. As root: attempt a send and raw send (both should succeed) +# 6. Create a delegation (zfs allow -u user send testpool/encrypted_dataset) +# 7. As root: attempt a send and raw send (both should succeed) +# 8. Create a delegation (zfs allow -u user send:raw testpool/encrypted_dataset) +# 9. As root: attempt a send and raw send (both should succeed) +# 10. Disable delegation (zfs unallow) +# 11. As root: attempt a send and raw send (both should succeed) +# 12. Clean up (handled by framework) +# +# Tested as a user under ../cli_user/zfs_send_delegation_user/ + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/properties.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +. $STF_SUITE/tests/functional/delegate/delegate.cfg + +# create encrypted dataset + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS1" + +# create target dataset for receives +if ! zfs list | grep testfs2 >/dev/null 2>&1; then + dataset_created="TRUE" + log_must zfs create $TESTPOOL/$TESTFS2 +fi + +# create user and group +typeset perms="snapshot,reservation,compression,checksum,userprop,receive" + +log_note "Added permissions to the $OTHER1 user." +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS2 + +# create random data +log_must fill_fs $TESTPOOL/$TESTFS1/child 1 2047 1024 1 R + +# snapshot +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + + +# check baseline send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv0_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv0raw_datastream.$$" + + +# create delegation +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# attempt send with full allow + +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv1_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv1raw_datastream.$$" + +# create raw delegation +log_must zfs allow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +log_must zfs unallow $OTHER1 send $TESTPOOL/$TESTFS1 + +# test new send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv2_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv2raw_datastream.$$" + + +# disable raw delegation +zfs unallow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# verify original send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv3_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv3raw_datastream.$$" + + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r \ + destroy_dataset $TESTPOOL/$TESTFS2 -r + +} diff --git a/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh new file mode 100755 index 00000000000..4a59e15cc69 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + + +poolexists $TESTPOOL1 && \ + destroy_pool $TESTPOOL1 + +del_user $STAFF1 +del_user $STAFF2 +del_group $STAFF_GROUP + +del_user $OTHER1 +del_user $OTHER2 +del_group $OTHER_GROUP + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh new file mode 100755 index 00000000000..0978193eddc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + +# Create staff group and add two user to it +log_must add_group $STAFF_GROUP +if ! id $STAFF1 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF1 +fi +if ! id $STAFF2 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF2 +fi + +# Create other group and add two user to it +log_must add_group $OTHER_GROUP +if ! id $OTHER1 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER1 +fi +if ! id $OTHER2 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER2 +fi +DISK=${DISKS%% *} + +default_raidz_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh new file mode 100755 index 00000000000..f62f2b07929 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh @@ -0,0 +1,145 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +# STRATEGY: +# 1. Create a pool (this is done by the test framework) +# 2. Create a user +# 3. Create an encrypted dataset +# 4. Write random data to the encrypted dataset +# 5. Snapshot the dataset +# 6. As root: attempt a send and raw send (both should succeed) +# 7. As user: attempt a send and raw send (both should fail, no permission) +# 8. Create a delegation (zfs allow -u user send testpool/encrypted_dataset) +# 9. As root: attempt a send and raw send (both should succeed) +# 10. As user: attempt a send and raw send (both should succeed) +# 11. Create a delegation (zfs allow -u user sendraw testpool/encrypted_dataset) +# 12. As root: attempt a send and raw send (both should succeed) +# 13. As user: attempt a send and raw send (send should fail, raw send should succeed) +# 14. Disable delegation (zfs unallow) +# 15. As root: attempt a send and raw send (both should succeed) +# 16. As user: attempt a send and raw send (both should fail, no permission) +# 17. Clean up (handled by framework) +# root tests to verify this doesnt affect root user under ../cli_root/zfs_send_delegation/ +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/properties.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +. $STF_SUITE/tests/functional/delegate/delegate.cfg + +# create encrypted dataset + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS1" + +# create target dataset for receives +log_must zfs create $TESTPOOL/$TESTFS2 + +# set user perms +# need to run chown for fs permissions for $OTHER1 +typeset perms="snapshot,reservation,compression,checksum,userprop,receive,mount,create" + +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS2 +log_must chown ${OTHER1}:${OTHER_GROUP} /$TESTPOOL/$TESTFS2 + +# create random data +log_must fill_fs $TESTPOOL/$TESTFS1/child 1 2047 1024 1 R + +# snapshot +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + +# note +# we need to use `sh -c` here becuase the quoting on <<<"$*" in the user_run wrapper is broken once pipes and redirects get involved + +# check baseline send abilities (should fail) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream.$$'" +# verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream !" +fi +log_mustnot user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream.$$'" +# verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream !" +fi + +# create delegation +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# attempt send with full allow (should pass) +log_must user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv1_user_datastream.$$'" +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv1raw_user_datastream.$$'" + + +# create raw delegation +log_must zfs allow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +# We have to remove 'send' to confirm 'send raw' only allows what we want +log_must zfs unallow -u $OTHER1 send $TESTPOOL/$TESTFS1 + +# test new sendraw abilities (send should fail, sendraw should pass) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream !" +fi +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv2raw_user_datastream.$$'" + +# disable raw delegation +log_must zfs unallow -u $OTHER1 send:raw $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# test with raw taken away (should pass) +log_must user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv3_user_datastream.$$'" +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv3raw_user_datastream.$$'" + +# disable send abilities +log_must zfs unallow -u $OTHER1 send $TESTPOOL/$TESTFS1 + +# verify original send abilities (should fail) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream !" +fi +log_mustnot user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream !" +fi + + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r \ + destroy_dataset $TESTPOOL/$TESTFS2 -r + +} From 37cd30f714a4c0e76b173cdc8d81b62b3efbddec Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Fri, 12 Sep 2025 10:05:06 -0700 Subject: [PATCH 17/23] Fix ddle memleak in ddt_log_load In ddt_log_load(), when removing dup entry from flushing tree, it doesn't free the entry causing memleak. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Chunwei Chen Co-authored-by: Chunwei Chen Closes #17657 Closes #17730 --- module/zfs/ddt_log.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index c9217cef4f7..c7a2426f3a7 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -244,6 +244,13 @@ ddt_log_alloc_entry(ddt_t *ddt) return (ddle); } +static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { @@ -349,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -367,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -529,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -729,7 +733,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } From 35f47cb4f4f5cd5bb519b38b9bbd831c749ce0c4 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 12 Sep 2025 10:07:24 -0700 Subject: [PATCH 18/23] Make new zhack test a little more reliable Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #17728 --- .../tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh index cc72d6313d4..0d2a39be6b5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh @@ -64,7 +64,7 @@ log_must zpool import $TESTPOOL alloc2=$(zpool get -Hpo value alloc $TESTPOOL) -[[ $((alloc * 1.05)) -gt $alloc2 ]] && [[ $alloc -lt $alloc2 ]] || \ +within_percent $alloc $alloc2 98 || log_fail "space usage changed too much: $alloc to $alloc2" log_pass "zhack metaslab leak behaved correctly" From cb5f9aa582c6a867d27c111fd45d17af4c13da0a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 12 Sep 2025 13:29:27 -0400 Subject: [PATCH 19/23] FreeBSD: Satisfy ASSERT_VOP_IN_SEQC() zfs_aclset_common() might be called for newly created or not even created vnodes, that triggers assertions on newer FreeBSD versions with DEBUG_VFS_LOCKS included into INVARIANTS. In the first case make sure to call vn_seqc_write_begin()/_end(), in the second just skip the assertion. The similar has to be done for project management IOCTL and file- bases extended attributes, since those are not going through VFS. Signed-off-by: Alexander Motin Closes #17722 --- module/os/freebsd/zfs/zfs_acl.c | 2 +- module/os/freebsd/zfs/zfs_vnops_os.c | 4 ++++ module/os/freebsd/zfs/zfs_znode_os.c | 6 +++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index b15a3e6e38c..cb5787269db 100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) int count = 0; zfs_acl_phys_t acl_phys; - if (zp->z_zfsvfs->z_replay == B_FALSE) { + if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index e150874c69b..e4b35d13ffd 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -389,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, error = vn_lock(vp, LK_EXCLUSIVE); if (error) return (error); + vn_seqc_write_begin(vp); error = zfs_ioctl_setxattr(vp, fsx, cred); + vn_seqc_write_end(vp); VOP_UNLOCK(vp); return (error); } @@ -2206,6 +2208,7 @@ zfs_setattr_dir(znode_t *dzp) if (err) break; + vn_seqc_write_begin(ZTOV(zp)); mutex_enter(&dzp->z_lock); if (zp->z_uid != dzp->z_uid) { @@ -2255,6 +2258,7 @@ zfs_setattr_dir(znode_t *dzp) dmu_tx_abort(tx); } tx = NULL; + vn_seqc_write_end(ZTOV(zp)); if (err != 0 && err != ENOENT) break; diff --git a/module/os/freebsd/zfs/zfs_znode_os.c b/module/os/freebsd/zfs/zfs_znode_os.c index 7cd0a153577..649022ab5bc 100644 --- a/module/os/freebsd/zfs/zfs_znode_os.c +++ b/module/os/freebsd/zfs/zfs_znode_os.c @@ -817,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; + vnode_t *vp = ZTOV(*zpp); + if (!(flag & IS_ROOT_NODE)) + vn_seqc_write_begin(vp); + if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -825,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp = ZTOV(*zpp); + vn_seqc_write_end(vp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; From bc8bcfc71a77c1e7f591d3208aa24101380f0328 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 12 Sep 2025 14:05:38 -0400 Subject: [PATCH 20/23] Fix type in dbrrd_closest() For ABS() to work, the argument must be signed, but rrdd_time is uint64_t. Clang noticed it. Reviewed-by: Brian Behlendorf Reviewed-by: Mariusz Zaborski Signed-off-by: Alexander Motin Fixes #16853 Closes #17733 --- module/zfs/zfs_crrd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/zfs_crrd.c b/module/zfs/zfs_crrd.c index f9267ed41d7..e5783b7b8a8 100644 --- a/module/zfs/zfs_crrd.c +++ b/module/zfs/zfs_crrd.c @@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) if (r2 == NULL) return (r1); - return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? r1 : r2); } uint64_t From 455c36156c4e26247a6709936881c1c4517ec53b Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Sep 2025 11:08:53 -0700 Subject: [PATCH 21/23] ZTS: refreserv/refreserv_raidz improvements Several small changes intended to make this test reliable. - Leave the default compression enabled for the pool and switch to using /dev/urandom as the data source. Functionally this shouldn't impact the test but it's preferable to test with the pool defaults when possible. - Verify the device is created and removed as required. Switch to a unique volume name for a more clarity in the logs. - Use the ZVOL_DEVDIR to specify the device path. - Speed up the test by creating the pool with an ashift=12 and testing 4K, 8K, 128K volblocksizes. Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #17725 --- .../functional/refreserv/refreserv_raidz.ksh | 46 +++++++------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh index 3249bd93d5c..a1da4a8631e 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh @@ -17,6 +17,7 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/refreserv/refreserv.cfg +. $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: @@ -24,7 +25,7 @@ # # STRATEGY: # 1. Create a pool with a single raidz vdev -# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# 2. For each block size [4k, 8k, 128k] # - create a volume # - fully overwrite it # - verify that referenced is less than or equal to reservation @@ -38,6 +39,7 @@ # 1. This test will use up to 14 disks but can cover the key concepts with # 5 disks. # 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# Therefore, when creating the pool we specify 4Kn sectors. # verify_runnable "global" @@ -60,29 +62,10 @@ log_onexit cleanup poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" -# Testing tiny block sizes on ashift=12 pools causes so much size inflation -# that small test disks may fill before creating small volumes. However, -# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for -# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. -if is_freebsd; then - bps=$(diskinfo -v ${alldisks[0]} | awk '/sectorsize/ { print $1 }') -elif is_linux; then - bps=$(lsblk -nrdo min-io /dev/${alldisks[0]}) -fi -log_must test "$bps" -eq 512 -o "$bps" -eq 4096 -case "$bps" in -512) - allshifts=(9 10 17) - maxpct=151 - ;; -4096) - allshifts=(12 13 17) - maxpct=110 - ;; -*) - log_fail "bytes/sector: $bps != (512|4096)" - ;; -esac +ashift=12 +allshifts=(12 13 17) +maxpct=110 + log_note "Testing in ashift=${allshifts[0]} mode" # This loop handles all iterations of steps 1 through 4 described in strategy @@ -99,18 +82,21 @@ for parity in 1 2 3; do continue fi - log_must zpool create -O compression=off "$TESTPOOL" "$raid" "${disks[@]}" + log_must zpool create -o ashift=$ashift "$TESTPOOL" "$raid" "${disks[@]}" for bits in "${allshifts[@]}"; do vbs=$((1 << bits)) log_note "Testing $raid-$ndisks volblocksize=$vbs" - vol=$TESTPOOL/$TESTVOL + vol=$TESTPOOL/$TESTVOL-$vbs + zdev=$ZVOL_DEVDIR/$vol log_must zfs create -V ${volsize}m \ -o volblocksize=$vbs "$vol" - block_device_wait "/dev/zvol/$vol" - log_must dd if=/dev/zero of=/dev/zvol/$vol \ - bs=1024k count=$volsize + block_device_wait $zdev + blockdev_exists $zdev + + log_must timeout 120 dd if=/dev/urandom of=$zdev \ + bs=1024k count=$volsize status=progress sync_pool $TESTPOOL ref=$(zfs get -Hpo value referenced "$vol") @@ -126,7 +112,7 @@ for parity in 1 2 3; do log_must test "$deltapct" -le $maxpct log_must_busy zfs destroy "$vol" - block_device_wait + blockdev_missing $zdev done log_must_busy zpool destroy "$TESTPOOL" From 9b772f328bd5f1187ce9b0a0ea241b2fe2875b05 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 12 Sep 2025 16:33:36 -0700 Subject: [PATCH 22/23] Fix time database update calculations The time database update math assumed that the timestamps were in nanoseconds, but at some point in the development or review process they changed to seconds. This PR fixes the math to use seconds instead. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #17735 --- module/zfs/zfs_crrd.c | 4 +- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_scrub_date_range_002.ksh | 76 +++++++++++++++++++ 5 files changed, 81 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh diff --git a/module/zfs/zfs_crrd.c b/module/zfs/zfs_crrd.c index e5783b7b8a8..30d4c7c3689 100644 --- a/module/zfs/zfs_crrd.c +++ b/module/zfs/zfs_crrd.c @@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) daydiff = time - rrd_tail(&db->dbr_days); monthdiff = time - rrd_tail(&db->dbr_months); - if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) rrd_add(&db->dbr_months, time, txg); - else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) rrd_add(&db->dbr_days, time, txg); else if (minutedif >= 0) rrd_add(&db->dbr_minutes, time, txg); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index bae9c0f3594..4dd700ef361 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -550,7 +550,7 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_multiple_pools', 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos', - 'zpool_scrub_date_range_001'] + 'zpool_scrub_date_range_001', 'zpool_scrub_date_range_002'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index b8d72f391bd..54b50c9dba7 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -91,6 +91,7 @@ SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata SPA_NOTE_TXG_TIME spa.note_txg_time spa_note_txg_time +SPA_FLUSH_TXG_TIME spa.flush_txg_time spa_flush_txg_time TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 32fe351f6dd..1517f90e99a 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1251,6 +1251,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh new file mode 100755 index 00000000000..9327df81a5c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh @@ -0,0 +1,76 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify that the timestamp database updates all the tables as expected. +# +# STRATEGY: +# 1. Decrease the note and flush frequency of the txg database. +# 2. Force the pool to sync several txgs +# 3. Verify that there are entries in each of the "month", "day", and +# "minute" tables. +# + +verify_runnable "global" + +function cleanup +{ + log_must restore_tunable SPA_NOTE_TXG_TIME + log_must restore_tunable SPA_FLUSH_TXG_TIME + rm /$TESTPOOL/f1 +} + +log_onexit cleanup + +log_assert "Verifiy timestamp databases all update as expected." + +log_must save_tunable SPA_NOTE_TXG_TIME +log_must set_tunable64 SPA_NOTE_TXG_TIME 1 +log_must save_tunable SPA_FLUSH_TXG_TIME +log_must set_tunable64 SPA_FLUSH_TXG_TIME 1 + +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL +sleep 1 +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL +sleep 1 +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL + +mos_zap="$(zdb -dddd $TESTPOOL 1)" +minutes_entries=$(echo "$mos_zap" | grep "txg_log_time:minutes" | awk '{print $5}') +days_entries=$(echo "$mos_zap" | grep "txg_log_time:days" | awk '{print $5}') +months_entries=$(echo "$mos_zap" | grep "txg_log_time:months" | awk '{print $5}') + +[[ "$minutes_entries" -ne "0" ]] || log_fail "0 entries in the minutes table" +[[ "$days_entries" -ne "0" ]] || log_fail "0 entries in the days table" +[[ "$months_entries" -ne "0" ]] || log_fail "0 entries in the months table" + +log_pass "Verified all timestamp databases had entries as expected." From 3f4312a0a428624e596bb35ae4806ea0d4d69472 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 13 Sep 2025 12:58:48 -0400 Subject: [PATCH 23/23] Fix two infinite loops if dmu_prefetch_max set to zero Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #17692 Closes #17729 --- module/zfs/dmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index f7f808d5b8f..a7a5c89bdaf 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);