From dee62e074a76f66ae573436bc233e721aecebd07 Mon Sep 17 00:00:00 2001
From: Olivier Certner <olce@FreeBSD.org>
Date: Mon, 30 Jun 2025 16:24:23 +0200
Subject: [PATCH 01/72] spa: ZIO_TASKQ_ISSUE: Use symbolic priority

This allows changing the meaning of priority differences in FreeBSD
without requiring code changes in ZFS.

This upstreams commit fd141584cf89d7d2 from FreeBSD src.

Sponsored-by: The FreeBSD Foundation
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Olivier Certner <olce@FreeBSD.org>
Closes #17489
---
 include/os/freebsd/spl/sys/proc.h    |  4 +++-
 include/os/linux/spl/sys/sysmacros.h |  4 +++-
 include/sys/zfs_context.h            |  4 +++-
 module/zfs/spa.c                     | 21 +++------------------
 4 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h
index a03b815a22a..c6bc10d6bab 100644
--- a/include/os/freebsd/spl/sys/proc.h
+++ b/include/os/freebsd/spl/sys/proc.h
@@ -45,7 +45,9 @@
 #ifdef _KERNEL
 #define	CPU		curcpu
 #define	minclsyspri	PRIBIO
-#define	defclsyspri minclsyspri
+#define	defclsyspri	minclsyspri
+/* Write issue taskq priority. */
+#define	wtqclsyspri	((PVM + PRIBIO) / 2)
 #define	maxclsyspri	PVM
 #define	max_ncpus	(mp_maxid + 1)
 #define	boot_max_ncpus	(mp_maxid + 1)
diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h
index e932ea72f1b..db48222b712 100644
--- a/include/os/linux/spl/sys/sysmacros.h
+++ b/include/os/linux/spl/sys/sysmacros.h
@@ -92,8 +92,10 @@
  * Treat shim tasks as SCHED_NORMAL tasks
  */
 #define	minclsyspri			(MAX_PRIO-1)
-#define	maxclsyspri			(MAX_RT_PRIO)
 #define	defclsyspri			(DEFAULT_PRIO)
+/* Write issue taskq priority. */
+#define	wtqclsyspri			(MAX_RT_PRIO + 1)
+#define	maxclsyspri			(MAX_RT_PRIO)
 
 #ifndef NICE_TO_PRIO
 #define	NICE_TO_PRIO(nice)		(MAX_RT_PRIO + (nice) + 20)
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index b3d48e25753..256c9c2cc2d 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -623,8 +623,10 @@ extern void delay(clock_t ticks);
  * Process priorities as defined by setpriority(2) and getpriority(2).
  */
 #define	minclsyspri	19
-#define	maxclsyspri	-20
 #define	defclsyspri	0
+/* Write issue taskq priority. */
+#define	wtqclsyspri	-19
+#define	maxclsyspri	-20
 
 #define	CPU_SEQID	((uintptr_t)pthread_self() & (max_ncpus - 1))
 #define	CPU_SEQID_UNSTABLE	CPU_SEQID
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index bca022af6d7..6b52c6cb1f9 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1231,29 +1231,14 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
-			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly less important
 			 * priority than the other taskqs.
-			 *
-			 * Under Linux and FreeBSD this means incrementing
-			 * the priority value as opposed to platforms like
-			 * illumos where it should be decremented.
-			 *
-			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
-			 * are equal then a difference between them is
-			 * insignificant.
 			 */
-			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
-#if defined(__linux__)
-				pri++;
-#elif defined(__FreeBSD__)
-				pri += 4;
-#else
-#error "unknown OS"
-#endif
-			}
+			const pri_t pri = (t == ZIO_TYPE_WRITE &&
+			    q == ZIO_TASKQ_ISSUE) ?
+			    wtqclsyspri : maxclsyspri;
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef HAVE_SYSDC

From eacf618a6553a6c819d3c9680c3df20079ce80f8 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Mon, 30 Jun 2025 16:16:27 -0700
Subject: [PATCH 02/72] Missing tests in make pkg

```
Warning: TestGroup '/var/tmp/tests/functional/ctime' not added to this
run. Auxiliary script '/var/tmp/tests/functional/ctime/setup' failed
verification.
```

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #17491
---
 tests/zfs-tests/tests/Makefile.am | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index d27660a42c5..f854070abaf 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1444,6 +1444,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/crtime/setup.ksh \
 	functional/crypto/icp_aes_ccm.ksh \
 	functional/crypto/icp_aes_gcm.ksh \
+	functional/ctime/cleanup.ksh \
+	functional/ctime/ctime_001_pos.ksh \
+	functional/ctime/setup.ksh \
 	functional/deadman/deadman_ratelimit.ksh \
 	functional/deadman/deadman_sync.ksh \
 	functional/deadman/deadman_zio.ksh \

From bf846dcb7dfd25c1c865a3516beb0f0576053716 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 2 Jul 2025 12:33:47 -0400
Subject: [PATCH 03/72] Release topology restrictions on special/dedup

Special vdevs were originally designed as small-block storage
for dRAID, a role for which RAIDZ/dRAID topologies are not good.
But they are more often used as SSD storage for metadata and hot
data of HDD pools.  In these use cases a narrow RAIDZ of SSDs might
be fine, so we should not introduce unnecessary restrictions,
and ZFS internally does not care.

The same applies to dedup vdevs.  The original DDT used 4KB blocks,
for which anything but a mirror was terrible storage.  But the new
FDT implementation uses 32KB blocks by default, which are much
less demanding even including compression, and which could be
increased even further now, if needed.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #17496
---
 cmd/zpool/zpool_vdev.c                        | 19 +++++++++++++++----
 .../alloc_class/alloc_class_002_neg.ksh       |  5 -----
 .../alloc_class/alloc_class_003_pos.ksh       |  8 +++++++-
 .../alloc_class/alloc_class_004_pos.ksh       |  5 ++++-
 .../alloc_class/alloc_class_009_pos.ksh       |  5 ++++-
 5 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 07868a30d7e..9d7a9b74bb8 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -876,6 +876,18 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
 				    (u_longlong_t)mirror->zprl_children);
 				ret = -1;
 			}
+		} else if (is_raidz_draid(current, new)) {
+			if (current->zprl_parity != new->zprl_parity) {
+				vdev_error(gettext(
+				    "mismatched replication level: pool and "
+				    "new vdev with different redundancy, %s "
+				    "and %s vdevs, %llu vs. %llu\n"),
+				    current->zprl_type,
+				    new->zprl_type,
+				    (u_longlong_t)current->zprl_parity,
+				    (u_longlong_t)new->zprl_parity);
+				ret = -1;
+			}
 		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
 			vdev_error(gettext(
 			    "mismatched replication level: pool uses %s "
@@ -1581,13 +1593,12 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 				is_dedup = is_spare = B_FALSE;
 			}
 
-			if (is_log || is_special || is_dedup) {
+			if (is_log) {
 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
 					(void) fprintf(stderr,
 					    gettext("invalid vdev "
-					    "specification: unsupported '%s' "
-					    "device: %s\n"), is_log ? "log" :
-					    "special", type);
+					    "specification: unsupported 'log' "
+					    "device: %s\n"), type);
 					goto spec_out;
 				}
 				nlogs++;
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh
index 7d6924b2c9b..79a431a1323 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh
@@ -41,9 +41,4 @@ log_mustnot zpool create $TESTPOOL $ZPOOL_DISKS special mirror \
 log_mustnot display_status $TESTPOOL
 log_mustnot zpool destroy -f $TESTPOOL
 
-log_mustnot zpool create $TESTPOOL raidz $ZPOOL_DISKS special raidz \
-    $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
-log_mustnot display_status $TESTPOOL
-log_mustnot zpool destroy -f $TESTPOOL
-
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
index 42d5deda384..961dcd46429 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
@@ -32,7 +32,7 @@ log_onexit cleanup
 
 log_must disk_setup
 
-for type in "" "mirror" "raidz"
+for type in "" "mirror" "raidz" "draid"
 do
 	log_must zpool create $TESTPOOL $type $ZPOOL_DISKS
 
@@ -47,6 +47,12 @@ do
 		    $CLASS_DISK0 $CLASS_DISK1
 		log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
 		log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+	elif [ "$type" = "draid" ]; then
+		log_must zpool add $TESTPOOL special raidz \
+		    $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
+		log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+		log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+		log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
 	else
 		log_must zpool add $TESTPOOL special $CLASS_DISK0
 		log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
index 684b6557e3f..39ddaad1be5 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
@@ -37,7 +37,7 @@ typeset ac_value
 typeset stype=""
 typeset sdisks=""
 
-for type in "" "mirror" "raidz"
+for type in "" "mirror" "raidz" "draid"
 do
 	if [ "$type" = "mirror" ]; then
 		stype="mirror"
@@ -45,6 +45,9 @@ do
 	elif [ "$type" = "raidz" ]; then
 		stype="mirror"
 		sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+	elif [ "$type" = "draid" ]; then
+		stype="raidz"
+		sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
 	else
 		stype=""
 		sdisks="${CLASS_DISK0}"
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
index 2223bb1c491..b7e93fc7350 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
@@ -36,7 +36,7 @@ typeset stype=""
 typeset sdisks=""
 typeset props=""
 
-for type in "" "mirror" "raidz"
+for type in "" "mirror" "raidz" "draid"
 do
 	if [ "$type" = "mirror" ]; then
 		stype="mirror"
@@ -45,6 +45,9 @@ do
 	elif [ "$type" = "raidz" ]; then
 		stype="mirror"
 		sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+	elif [ "$type" = "draid" ]; then
+		stype="raidz"
+		sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
 	else
 		stype=""
 		sdisks="${CLASS_DISK0}"

From 6d838ec0b64a8b00a6d1372f9c66631de7598273 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20R=C3=BCegg?= <martin.rueegg@metaworx.ch>
Date: Sat, 21 Jun 2025 16:20:16 +0300
Subject: [PATCH 04/72] pyzfs: Update ax_python_devel.m4 to serial 37
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes an obvious typo, where a variable was missing the required
leading dollar sign ($).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Martin Rüegg <martin.rueegg@metaworx.ch>
Closes #17480
---
 config/ax_python_devel.m4 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4
index 1f480db6d23..935056cc4c0 100644
--- a/config/ax_python_devel.m4
+++ b/config/ax_python_devel.m4
@@ -72,7 +72,7 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
-#serial 36
+#serial 37
 
 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
 AC_DEFUN([AX_PYTHON_DEVEL],[
@@ -316,7 +316,7 @@ EOD`
 			PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version"
 		fi
 
-		if test -z "PYTHON_LIBS"; then
+		if test -z "$PYTHON_LIBS"; then
 			AC_MSG_WARN([
   Cannot determine location of your Python DSO. Please check it was installed with
   dynamic libraries enabled, or try setting PYTHON_LIBS by hand.

From 17ee0fd4fa29bc6c76df1d5240c58904b81531ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20R=C3=BCegg?= <martin.rueegg@metaworx.ch>
Date: Sat, 21 Jun 2025 18:55:19 +0300
Subject: [PATCH 05/72] pyzfs: Adapt python lib directory evaluation from
 ax_python_devel.m4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

71216b91d281e7e58f5e29ca4d4553945e080fe9 introduced a regression
on debian/ubuntu systems during build.

The reason is that building the RPM for pyzfs was using
a different library path than building the library itself.
This is now harmonized.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Martin Rüegg <martin.rueegg@metaworx.ch>
Closes #16155
Closes #17480
---
 rpm/generic/zfs.spec.in | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 8cf13023f53..47313a6b5fb 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -87,7 +87,19 @@
 %define __python                  %{__use_python}
 %define __python_pkg_version      %{__use_python_pkg_version}
 %endif
-%define __python_sitelib          %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())" 2>/dev/null || %{__python} -Esc "import sysconfig; print(sysconfig.get_path('purelib'))")
+%define __python_sitelib          %(%{__python} -Esc "
+import sysconfig;
+if hasattr(sysconfig, 'get_default_scheme'):
+    scheme = sysconfig.get_default_scheme()
+else:
+    scheme = sysconfig._get_default_scheme()
+if scheme == 'posix_local':
+    scheme = 'posix_prefix'
+prefix = '%{_prefix}'
+if prefix == 'NONE':
+    prefix = '%{ac_default_prefix}'
+sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix})
+print(sitedir);" 2>/dev/null || %{__python} -Esc "from distutils import sysconfig; print(sysconfig.get_python_lib(0,0))")
 
 Name:           @PACKAGE@
 Version:        @VERSION@

From 4e92aee23395a23efc3726b7377af0ce9b95c829 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 2 Jul 2025 14:11:37 -0400
Subject: [PATCH 06/72] Relax special_small_blocks restrictions

special_small_blocks is applied to blocks after compression, so it
makes no sense to demand that its values be powers of 2.  At most
they could be multiples of 512, but that would still buy us nothing,
so let's allow them to be any value up to SPA_MAXBLOCKSIZE.

Also, special_small_blocks does not really need to depend on the
set recordsize, enabled pool features or presence of a special vdev.
At worst, in any of those cases it will just do nothing, so we
should not complicate users' lives with artificial limitations.

While there, polish comments for recordsize and volblocksize.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #17497
---
 include/sys/dmu_objset.h                      |  2 +-
 lib/libzfs/libzfs_dataset.c                   | 37 ++-------------
 lib/libzfs/libzfs_pool.c                      | 34 --------------
 man/man7/zfsprops.7                           | 29 ++++++------
 module/zcommon/zfs_prop.c                     | 11 ++---
 module/zfs/dmu_objset.c                       |  6 ---
 module/zfs/zfs_ioctl.c                        |  9 ----
 tests/runfiles/common.run                     |  3 +-
 tests/zfs-tests/tests/Makefile.am             |  2 -
 .../alloc_class/alloc_class_010_pos.ksh       |  2 +-
 .../alloc_class/alloc_class_011_neg.ksh       |  4 +-
 .../alloc_class/alloc_class_014_neg.ksh       | 39 ----------------
 .../alloc_class/alloc_class_015_pos.ksh       | 46 -------------------
 13 files changed, 31 insertions(+), 193 deletions(-)
 delete mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
 delete mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh

diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index 288ad30166d..492be29200e 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -152,7 +152,7 @@ struct objset {
 	 * The largest zpl file block allowed in special class.
 	 * cached here instead of zfsvfs for easier access.
 	 */
-	int os_zpl_special_smallblock;
+	uint64_t os_zpl_special_smallblock;
 
 	/*
 	 * Pointer is constant; the blkptr it points to is protected by
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 91560b40b02..e1b91fc4729 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1039,7 +1039,6 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
 	nvlist_t *ret;
 	int chosen_normal = -1;
 	int chosen_utf = -1;
-	int set_maxbs = 0;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
@@ -1258,46 +1257,20 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
-			/* save the ZFS_PROP_RECORDSIZE during create op */
-			if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) {
-				set_maxbs = intval;
-			}
 			break;
 		}
 
 		case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
 		{
-			int maxbs =
-			    set_maxbs == 0 ? SPA_OLD_MAXBLOCKSIZE : set_maxbs;
+			int maxbs = SPA_MAXBLOCKSIZE;
 			char buf[64];
 
-			if (zpool_hdl != NULL) {
-				char state[64] = "";
-
-				maxbs = zpool_get_prop_int(zpool_hdl,
-				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
-
-				/*
-				 * Issue a warning but do not fail so that
-				 * tests for settable properties succeed.
-				 */
-				if (zpool_prop_get_feature(zpool_hdl,
-				    "feature@allocation_classes", state,
-				    sizeof (state)) != 0 ||
-				    strcmp(state, ZFS_FEATURE_ACTIVE) != 0) {
-					(void) fprintf(stderr, gettext(
-					    "%s: property requires a special "
-					    "device in the pool\n"), propname);
-				}
-			}
-			if (intval != 0 &&
-			    (intval < SPA_MINBLOCKSIZE ||
-			    intval > maxbs || !ISP2(intval))) {
+			if (intval > SPA_MAXBLOCKSIZE) {
 				zfs_nicebytes(maxbs, buf, sizeof (buf));
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "invalid '%s=%llu' property: must be zero "
-				    "or a power of 2 from 512B to %s"),
-				    propname, (unsigned long long)intval, buf);
+				    "invalid '%s' property: must be between "
+				    "zero and %s"),
+				    propname, buf);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 6f8fb994f81..fb18c430975 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -1421,30 +1421,6 @@ zpool_get_state(zpool_handle_t *zhp)
 	return (zhp->zpool_state);
 }
 
-/*
- * Check if vdev list contains a special vdev
- */
-static boolean_t
-zpool_has_special_vdev(nvlist_t *nvroot)
-{
-	nvlist_t **child;
-	uint_t children;
-
-	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child,
-	    &children) == 0) {
-		for (uint_t c = 0; c < children; c++) {
-			const char *bias;
-
-			if (nvlist_lookup_string(child[c],
-			    ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 &&
-			    strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
-				return (B_TRUE);
-			}
-		}
-	}
-	return (B_FALSE);
-}
-
 /*
  * Check if vdev list contains a dRAID vdev
  */
@@ -1548,16 +1524,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
 			goto create_failed;
 		}
 
-		if (nvlist_exists(zc_fsprops,
-		    zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) &&
-		    !zpool_has_special_vdev(nvroot)) {
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "%s property requires a special vdev"),
-			    zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS));
-			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
-			goto create_failed;
-		}
-
 		if (!zc_props &&
 		    (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
 			goto create_failed;
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index ac58203f00a..829eb420527 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -541,10 +541,16 @@ The
 .Sy blocksize
 cannot be changed once the volume has been written, so it should be set at
 volume creation time.
-The default
-.Sy blocksize
-for volumes is 16 KiB.
-Any power of 2 from 512 bytes to 128 KiB is valid.
+The size specified must be a power of two greater than or equal to
+.Ar 512 B
+and less than or equal to
+.Ar 128 KiB .
+If the
+.Sy large_blocks
+feature is enabled on the pool, the size may be up to
+.Ar 16 MiB .
+The default size is
+.Ar 16 KiB .
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy volblock .
@@ -1282,10 +1288,12 @@ This feature must be enabled to be used
 .It Sy special_small_blocks Ns = Ns Ar size
 This value represents the threshold block size for including small file
 or zvol blocks into the special allocation class.
-Blocks smaller than or equal to this
-value will be assigned to the special allocation class while greater blocks
-will be assigned to the regular class.
-Valid values are zero or a power of two from 512 up to 1048576 (1 MiB).
+Blocks smaller than or equal to this value after compression and encryption
+will be assigned to the special allocation class, while greater blocks will
+be assigned to the regular class.
+Valid values are from 0 to maximum block size (
+.Ar 16 MiB
+).
 The default size is 0 which means no small file or zvol blocks
 will be allocated in the special class.
 .Pp
@@ -1569,11 +1577,6 @@ See
 .Xr zpool-features 7
 for details on ZFS feature flags.
 .Pp
-However, blocks larger than
-.Ar 1 MiB
-can have an impact on i/o latency (e.g. tying up a spinning disk for
-~300ms), and also potentially on the memory allocator.
-.Pp
 Note that maximum size is still limited by default to
 .Ar 1 MiB
 on x86_32, see
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c
index 8b4c4251703..864e3898b36 100644
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -640,7 +640,7 @@ zfs_prop_init(void)
 	    "<1.00x or higher if compressed>", "REFRATIO", B_FALSE, sfeatures);
 	zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
 	    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
-	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK", B_FALSE,
+	    ZFS_TYPE_VOLUME, "512 to 16M, power of 2", "VOLBLOCK", B_FALSE,
 	    sfeatures);
 	zprop_register_index(ZFS_PROP_VOLTHREADING, "volthreading",
 	    1, PROP_DEFAULT, ZFS_TYPE_VOLUME, "on | off", "zvol threading",
@@ -734,13 +734,12 @@ zfs_prop_init(void)
 	/* inherit number properties */
 	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
 	    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE", B_FALSE,
-	    sfeatures);
+	    ZFS_TYPE_FILESYSTEM, "512 to 16M, power of 2",
+	    "RECSIZE", B_FALSE, sfeatures);
 	zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
 	    "special_small_blocks", 0, PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS", B_FALSE,
-	    sfeatures);
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "0 to 16M",
+	    "SPECIAL_SMALL_BLOCKS", B_FALSE, sfeatures);
 
 	/* hidden properties */
 	zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index b3f792e4ae6..c1101088be1 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -345,12 +345,6 @@ smallblk_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
-	/*
-	 * Inheritance and range checking should have been done by now.
-	 */
-	ASSERT(newval <= SPA_MAXBLOCKSIZE);
-	ASSERT(ISP2(newval));
-
 	os->os_zpl_special_smallblock = newval;
 }
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index ebb1cfd0712..3a413f4a7bd 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -5000,15 +5000,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
 		}
 		break;
 
-	case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
-		/*
-		 * This property could require the allocation classes
-		 * feature to be active for setting, however we allow
-		 * it so that tests of settable properties succeed.
-		 * The CLI will issue a warning in this case.
-		 */
-		break;
-
 	case ZFS_PROP_SHARESMB:
 		if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
 			return (SET_ERROR(ENOTSUP));
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 7969945a479..376518e9f37 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -37,8 +37,7 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
-    'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos',
-    'alloc_class_016_pos']
+    'alloc_class_013_pos', 'alloc_class_016_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index f854070abaf..20a17a53110 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -429,8 +429,6 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
-	functional/alloc_class/alloc_class_014_neg.ksh \
-	functional/alloc_class/alloc_class_015_pos.ksh \
 	functional/alloc_class/alloc_class_016_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
index 7f9d108ed18..f7dfd42b0f0 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
@@ -36,7 +36,7 @@ log_must disk_setup
 log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
 	$CLASS_DISK0 $CLASS_DISK1
 
-for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072
+for value in 0 200 512 1300 4096 12345 131072 1572864 16777216
 do
 	log_must zfs set special_small_blocks=$value $TESTPOOL
 	ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
index a04e9ca4327..0f90117544a 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
@@ -22,7 +22,7 @@
 #
 # DESCRIPTION:
 #	Setting the special_small_blocks property to invalid values fails.
-#	Powers of two from 512 to 1M are allowed.
+#	Only values between 0 and 16M inclusive are allowed.
 #
 
 verify_runnable "global"
@@ -36,7 +36,7 @@ log_must disk_setup
 log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
 	$CLASS_DISK0 $CLASS_DISK1
 
-for value in 256 1025 33554432
+for value in 16777217 33554432 4294967296
 do
 	log_mustnot zfs set special_small_blocks=$value $TESTPOOL
 done
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
deleted file mode 100755
index e16b64a964e..00000000000
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/ksh -p
-# SPDX-License-Identifier: CDDL-1.0
-
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source.  A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-
-. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
-
-#
-# DESCRIPTION:
-#	Setting the special_small_blocks property greater than recordsize fails.
-#
-
-verify_runnable "global"
-
-claim="Setting the special_small_blocks property greater than recordsize fails"
-
-log_assert $claim
-log_onexit cleanup
-log_must disk_setup
-
-for size in 512 4096 32768 131072 524288 1048576
-do
-	let bigger=$size*2
-	log_mustnot zpool create -O recordsize=$size \
-		-O special_small_blocks=$bigger \
-		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
-		$CLASS_DISK0 $CLASS_DISK1
-done
-
-log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
deleted file mode 100755
index 9d34375b74c..00000000000
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/ksh -p
-# SPDX-License-Identifier: CDDL-1.0
-
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source.  A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-
-. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
-
-#
-# DESCRIPTION:
-# 	Can set special_small_blocks property less than or equal to recordsize.
-#
-
-verify_runnable "global"
-
-claim="Can set special_small_blocks property less than or equal to recordsize"
-
-log_assert $claim
-log_onexit cleanup
-log_must disk_setup
-
-for size in 8192 32768 131072 524288 1048576
-do
-	let smaller=$size/2
-	log_must zpool create -O recordsize=$size \
-		-O special_small_blocks=$smaller \
-		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
-		$CLASS_DISK0 $CLASS_DISK1
-	log_must zpool destroy -f "$TESTPOOL"
-
-	log_must zpool create -O recordsize=$size \
-		-O special_small_blocks=$size \
-		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
-		$CLASS_DISK0 $CLASS_DISK1
-	log_must zpool destroy -f "$TESTPOOL"
-done
-
-log_pass $claim

From c98a393cb661540b9f5359e24a5b8e18dcb1cdac Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 3 Jul 2025 10:27:05 -0700
Subject: [PATCH 07/72] CI: run ztest on compressed zpool

When running ztest under the CI a common failure mode is for the
underlying filesystem to run out of available free space.  Since
the storage associated with a GitHub-hosted runner is fixed, we
instead create a pool and use a compressed ZFS dataset to store
the ztest vdev files.  This significantly increases the available
capacity since the data written by ztest is highly compressible.
A compression ratio of over 40:1 is conservatively achieved using
the default lz4 compression.  Autotrimming is enabled to ensure
freed blocks are discarded from the backing cipool vdev file.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17501
---
 .github/workflows/zloop.yml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml
index 7b3bf49d90d..4ae3ccdc548 100644
--- a/.github/workflows/zloop.yml
+++ b/.github/workflows/zloop.yml
@@ -12,7 +12,8 @@ jobs:
   zloop:
     runs-on: ubuntu-24.04
     env:
-      TEST_DIR: /var/tmp/zloop
+      WORK_DIR: /mnt/zloop
+      CORE_DIR: /mnt/zloop/cores
     steps:
     - uses: actions/checkout@v4
       with:
@@ -40,38 +41,37 @@ jobs:
         sudo modprobe zfs
     - name: Tests
       run: |
-        sudo mkdir -p $TEST_DIR
-        # run for 10 minutes or at most 6 iterations for a maximum runner
-        # time of 60 minutes.
-        sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60
+        sudo truncate -s 256G /mnt/vdev
+        sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev
+        sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60
     - name: Prepare artifacts
       if: failure()
       run: |
-        sudo chmod +r -R $TEST_DIR/
+        sudo chmod +r -R $WORK_DIR/
     - name: Ztest log
       if: failure()
       run: |
-        grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out
+        grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out
     - name: Gdb log
       if: failure()
       run: |
-        sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb
+        sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb
     - name: Zdb log
       if: failure()
       run: |
-        cat $TEST_DIR/*/ztest.zdb
+        cat $CORE_DIR/*/ztest.zdb
     - uses: actions/upload-artifact@v4
       if: failure()
       with:
         name: Logs
         path: |
-          /var/tmp/zloop/*/
-          !/var/tmp/zloop/*/vdev/
+          /mnt/zloop/*/
+          !/mnt/zloop/cores/*/vdev/
         if-no-files-found: ignore
     - uses: actions/upload-artifact@v4
       if: failure()
       with:
         name: Pool files
         path: |
-          /var/tmp/zloop/*/vdev/
+          /mnt/zloop/cores/*/vdev/
         if-no-files-found: ignore

From d411ea2e4d986645163deed3e87fcca793ca8a17 Mon Sep 17 00:00:00 2001
From: Meriel Luna Mittelbach <lunarlambda@gmail.com>
Date: Thu, 3 Jul 2025 23:24:07 +0200
Subject: [PATCH 08/72] Add templated zfs-mount@.service

Runs `zfs mount -R <dataset>` at boot, after `zfs mount -a`.
Intended to replace `mountpoint=legacy` in certain mount setups.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Meriel Luna Mittelbach <lunarlambda@gmail.com>
Closes #17483
---
 contrib/debian/openzfs-zfsutils.install  |  1 +
 etc/Makefile.am                          |  1 +
 etc/systemd/system/zfs-mount@.service.in | 26 ++++++++++++++++++++++++
 rpm/generic/zfs.spec.in                  |  2 +-
 4 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 etc/systemd/system/zfs-mount@.service.in

diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install
index 4573cc77ea7..37284a78ad1 100644
--- a/contrib/debian/openzfs-zfsutils.install
+++ b/contrib/debian/openzfs-zfsutils.install
@@ -8,6 +8,7 @@ lib/systemd/system/zfs-import-scan.service
 lib/systemd/system/zfs-import.target
 lib/systemd/system/zfs-load-key.service
 lib/systemd/system/zfs-mount.service
+lib/systemd/system/zfs-mount@.service
 lib/systemd/system/zfs-scrub-monthly@.timer
 lib/systemd/system/zfs-scrub-weekly@.timer
 lib/systemd/system/zfs-scrub@.service
diff --git a/etc/Makefile.am b/etc/Makefile.am
index 7187762d380..808c729cd96 100644
--- a/etc/Makefile.am
+++ b/etc/Makefile.am
@@ -56,6 +56,7 @@ systemdunit_DATA = \
 	%D%/systemd/system/zfs-import-scan.service \
 	%D%/systemd/system/zfs-import.target \
 	%D%/systemd/system/zfs-mount.service \
+	%D%/systemd/system/zfs-mount@.service \
 	%D%/systemd/system/zfs-scrub-monthly@.timer \
 	%D%/systemd/system/zfs-scrub-weekly@.timer \
 	%D%/systemd/system/zfs-scrub@.service \
diff --git a/etc/systemd/system/zfs-mount@.service.in b/etc/systemd/system/zfs-mount@.service.in
new file mode 100644
index 00000000000..0698fad1207
--- /dev/null
+++ b/etc/systemd/system/zfs-mount@.service.in
@@ -0,0 +1,26 @@
+[Unit]
+Description=Mount ZFS filesystem %I
+Documentation=man:zfs(8)
+DefaultDependencies=no
+After=systemd-udev-settle.service
+After=zfs-import.target
+After=zfs-mount.service
+After=systemd-remount-fs.service
+Before=local-fs.target
+ConditionPathIsDirectory=/sys/module/zfs
+
+# This merely tells the service manager
+# that unmounting everything undoes the
+# effect of this service. No extra logic
+# is run as a result of these settings.
+Conflicts=umount.target
+Before=umount.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+EnvironmentFile=-@initconfdir@/zfs
+ExecStart=@sbindir@/zfs mount -R %I
+
+[Install]
+WantedBy=zfs.target
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 47313a6b5fb..dddc0a6c8f0 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -388,7 +388,7 @@ support for unlocking datasets on user login.
 
 %if 0%{?_systemd}
     %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit
-    %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target
+    %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-mount@.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target
 %else
     %define systemd --enable-sysvinit --disable-systemd
 %endif

From ee0cb4cb89ac3714705fbc716766877321911562 Mon Sep 17 00:00:00 2001
From: Igor Ostapenko <pm@igoro.pro>
Date: Fri, 4 Jul 2025 02:00:13 +0300
Subject: [PATCH 09/72] ztest: Fix false positive of ENOSPC handling

Before running a pass, zs_enospc_count is checked to free up some space
by destroying a random dataset. But the freed space may still not be
reusable during the TXG_DEFER window, breaking the next dataset creation
in ztest_generic_run().

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Igor Ostapenko <igor.ostapenko@klarasystems.com>
Closes #17506
---
 cmd/ztest.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/cmd/ztest.c b/cmd/ztest.c
index 89264c97ff1..c7982c59ff4 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -7813,6 +7813,9 @@ ztest_dataset_open(int d)
 
 	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
+	if (ztest_opts.zo_verbose >= 6)
+		(void) printf("Opening %s\n", name);
+
 	(void) pthread_rwlock_rdlock(&ztest_name_lock);
 
 	error = ztest_dataset_create(name);
@@ -8308,41 +8311,44 @@ static void
 ztest_generic_run(ztest_shared_t *zs, spa_t *spa)
 {
 	kthread_t **run_threads;
-	int t;
+	int i, ndatasets;
 
 	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
 	    UMEM_NOFAIL);
 
+	/*
+	 * Actual number of datasets to be used.
+	 */
+	ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads);
+
+	/*
+	 * Prepare the datasets first.
+	 */
+	for (i = 0; i < ndatasets; i++)
+		VERIFY0(ztest_dataset_open(i));
+
 	/*
 	 * Kick off all the tests that run in parallel.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++) {
-		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
-			umem_free(run_threads, ztest_opts.zo_threads *
-			    sizeof (kthread_t *));
-			return;
-		}
-
-		run_threads[t] = thread_create(NULL, 0, ztest_thread,
-		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
+	for (i = 0; i < ztest_opts.zo_threads; i++) {
+		run_threads[i] = thread_create(NULL, 0, ztest_thread,
+		    (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE,
 		    defclsyspri);
 	}
 
 	/*
 	 * Wait for all of the tests to complete.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++)
-		VERIFY0(thread_join(run_threads[t]));
+	for (i = 0; i < ztest_opts.zo_threads; i++)
+		VERIFY0(thread_join(run_threads[i]));
 
 	/*
 	 * Close all datasets. This must be done after all the threads
 	 * are joined so we can be sure none of the datasets are in-use
 	 * by any of the threads.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++) {
-		if (t < ztest_opts.zo_datasets)
-			ztest_dataset_close(t);
-	}
+	for (i = 0; i < ndatasets; i++)
+		ztest_dataset_close(i);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
@@ -8465,6 +8471,7 @@ ztest_run(ztest_shared_t *zs)
 
 		int d = ztest_random(ztest_opts.zo_datasets);
 		ztest_dataset_destroy(d);
+		txg_wait_synced(spa_get_dsl(spa), 0);
 	}
 	zs->zs_enospc_count = 0;
 

From 92d3b4ee2c083201423c023fd01276add22bbaa4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 5 Jul 2025 13:16:14 +1000
Subject: [PATCH 10/72] zio: rename `io_reexecute` as `io_post`; use it for the
 direct IO checksum error flag

We're not supposed to modify someone else's io_flags, so we need another
way to propagate DIO_CHKSUM_ERR.

If we squint, we can see that io_reexecute is really just recording
exceptional events that a parent (or its parents) will need to do
something about. It just happens that the only things we've had
historically are two forms of reexecution: now or later (suspend).

So, rename it to io_post, as in, post-IO info/events/actions. And now we
have a few spare bits for other conditions.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17507
---
 include/os/linux/zfs/sys/trace_common.h |  8 ++--
 include/sys/zio.h                       | 21 ++++-----
 module/zcommon/zfs_valstr.c             |  1 -
 module/zfs/dmu_direct.c                 |  2 +-
 module/zfs/vdev_indirect.c              |  2 +-
 module/zfs/vdev_mirror.c                |  2 +-
 module/zfs/vdev_raidz.c                 |  6 +--
 module/zfs/zio.c                        | 60 ++++++++++++-------------
 8 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h
index 85cf8cc20b0..e1b6d61099b 100644
--- a/include/os/linux/zfs/sys/trace_common.h
+++ b/include/os/linux/zfs/sys/trace_common.h
@@ -45,7 +45,7 @@
 		__field(zio_flag_t,		zio_orig_flags)		\
 		__field(enum zio_stage,		zio_orig_stage)		\
 		__field(enum zio_stage,		zio_orig_pipeline)	\
-		__field(uint8_t,		zio_reexecute)		\
+		__field(uint8_t,		zio_post)		\
 		__field(uint64_t,		zio_txg)		\
 		__field(int,			zio_error)		\
 		__field(uint64_t,		zio_ena)		\
@@ -74,7 +74,7 @@
 		__entry->zio_orig_flags		= zio->io_orig_flags;	    \
 		__entry->zio_orig_stage		= zio->io_orig_stage;	    \
 		__entry->zio_orig_pipeline	= zio->io_orig_pipeline;    \
-		__entry->zio_reexecute		= zio->io_reexecute;	    \
+		__entry->zio_post		= zio->io_post;		    \
 		__entry->zio_txg		= zio->io_txg;		    \
 		__entry->zio_error		= zio->io_error;	    \
 		__entry->zio_ena		= zio->io_ena;		    \
@@ -92,7 +92,7 @@
 	"zio { type %u prio %u size %llu orig_size %llu "		\
 	"offset %llu timestamp %llu delta %llu delay %llu "		\
 	"flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx "	\
-	"orig_stage 0x%x orig_pipeline 0x%x reexecute %u "		\
+	"orig_stage 0x%x orig_pipeline 0x%x post %u "			\
 	"txg %llu error %d ena %llu prop { checksum %u compress %u "	\
 	"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
 
@@ -102,7 +102,7 @@
 	__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay,	\
 	__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline,	\
 	__entry->zio_orig_flags, __entry->zio_orig_stage,		\
-	__entry->zio_orig_pipeline, __entry->zio_reexecute,		\
+	__entry->zio_orig_pipeline, __entry->zio_post,			\
 	__entry->zio_txg, __entry->zio_error, __entry->zio_ena,		\
 	__entry->zp_checksum, __entry->zp_compress, __entry->zp_type,	\
 	__entry->zp_level, __entry->zp_copies, __entry->zp_dedup,	\
diff --git a/include/sys/zio.h b/include/sys/zio.h
index d91a4eb1e99..01f3babeb4c 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -226,8 +226,7 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_NOPWRITE	(1ULL << 29)
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 30)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 31)
-#define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 32)
-#define	ZIO_FLAG_PREALLOCATED	(1ULL << 33)
+#define	ZIO_FLAG_PREALLOCATED	(1ULL << 32)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
@@ -418,14 +417,16 @@ typedef struct zio_transform {
 typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 
 /*
- * The io_reexecute flags are distinct from io_flags because the child must
- * be able to propagate them to the parent.  The normal io_flags are local
- * to the zio, not protected by any lock, and not modifiable by children;
- * the reexecute flags are protected by io_lock, modifiable by children,
- * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
+ * The io_post flags describe additional actions that a parent IO should
+ * consider or perform on behalf of a child. They are distinct from io_flags
+ * because the child must be able to propagate them to the parent. The normal
+ * io_flags are local to the zio, not protected by any lock, and not modifiable
+ * by children; the io_post flags are protected by io_lock, modifiable by
+ * children, and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
-#define	ZIO_REEXECUTE_NOW	0x01
-#define	ZIO_REEXECUTE_SUSPEND	0x02
+#define	ZIO_POST_REEXECUTE	(1 << 0)
+#define	ZIO_POST_SUSPEND	(1 << 1)
+#define	ZIO_POST_DIO_CHKSUM_ERR	(1 << 2)
 
 /*
  * The io_trim flags are used to specify the type of TRIM to perform.  They
@@ -461,7 +462,7 @@ struct zio {
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
 	zio_priority_t	io_priority;
-	uint8_t		io_reexecute;
+	uint8_t		io_post;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
index c39ac62f654..08813b81cf5 100644
--- a/module/zcommon/zfs_valstr.c
+++ b/module/zcommon/zfs_valstr.c
@@ -221,7 +221,6 @@ _VALSTR_BITFIELD_IMPL(zio_flag,
 	{ '.', "NP", "NOPWRITE" },
 	{ '.', "EX", "REEXECUTED" },
 	{ '.', "DG", "DELEGATED" },
-	{ '.', "DC", "DIO_CHKSUM_ERR" },
 	{ '.', "PA", "PREALLOCATED" },
 )
 
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
index 12b0ffa2c99..930ff101eca 100644
--- a/module/zfs/dmu_direct.c
+++ b/module/zfs/dmu_direct.c
@@ -104,7 +104,7 @@ dmu_write_direct_done(zio_t *zio)
 	dmu_sync_done(zio, NULL, zio->io_private);
 
 	if (zio->io_error != 0) {
-		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
 			ASSERT3U(zio->io_error, ==, EIO);
 
 		/*
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index fac2c3a5f15..9fc71fa0e03 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1842,7 +1842,7 @@ vdev_indirect_io_done(zio_t *zio)
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
 		zio->io_error = ret;
-		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+		zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		ret = 0;
 	}
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index a6aee943706..2b78340cf70 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -779,7 +779,7 @@ vdev_mirror_io_done(zio_t *zio)
 	 * being written out during self healing.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
-	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+	    (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
 		zio_dio_chksum_verify_error_report(zio);
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT3U(zio->io_error, ==, ECKSUM);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 71c4bfbdaf0..7a6a01603da 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2691,7 +2691,7 @@ raidz_checksum_verify(zio_t *zio)
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
 		zio->io_error = ret;
-		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+		zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		zio_checksum_verified(zio);
 		return (0);
@@ -3048,7 +3048,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
-		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
 			return (0);
 
 		/* Reconstruction succeeded - report errors */
@@ -3514,7 +3514,7 @@ vdev_raidz_io_done(zio_t *zio)
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
-			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+			if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
 				goto done;
 
 			for (int i = 0; i < rm->rm_nrows; i++) {
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6d7bce8b0e1..c5f385aeb72 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -850,15 +850,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
-	pio->io_reexecute |= zio->io_reexecute;
+	pio->io_post |= zio->io_post;
 	ASSERT3U(*countp, >, 0);
 
-	/*
-	 * Propogate the Direct I/O checksum verify failure to the parent.
-	 */
-	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
-		pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
-
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
@@ -1649,7 +1643,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 		 * through the mirror during self healing. See comment in
 		 * vdev_mirror_io_done() for more details.
 		 */
-		ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+		ASSERT0(pio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
 	} else if (type == ZIO_TYPE_WRITE &&
 	    pio->io_prop.zp_direct_write == B_TRUE) {
 		/*
@@ -2602,7 +2596,7 @@ zio_reexecute(void *arg)
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
-	pio->io_reexecute = 0;
+	pio->io_post = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
@@ -4722,7 +4716,7 @@ zio_vdev_io_assess(zio_t *zio)
 	 * If a Direct I/O operation has a checksum verify error then this I/O
 	 * should not attempt to be issued again.
 	 */
-	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+	if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) {
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 			ASSERT3U(zio->io_error, ==, EIO);
@@ -5031,7 +5025,7 @@ zio_checksum_verify(zio_t *zio)
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
-	ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+	ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
 	IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
@@ -5040,7 +5034,7 @@ zio_checksum_verify(zio_t *zio)
 		if (error == ECKSUM &&
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			if (zio->io_flags & ZIO_FLAG_DIO_READ) {
-				zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+				zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
 				zio_t *pio = zio_unique_parent(zio);
 				/*
 				 * Any Direct I/O read that has a checksum
@@ -5090,7 +5084,7 @@ zio_dio_checksum_verify(zio_t *zio)
 	if ((error = zio_checksum_error(zio, NULL)) != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM) {
-			zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+			zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
 			zio_dio_chksum_verify_error_report(zio);
 		}
 	}
@@ -5115,7 +5109,7 @@ zio_checksum_verified(zio_t *zio)
 void
 zio_dio_chksum_verify_error_report(zio_t *zio)
 {
-	ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+	ASSERT(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 		return;
@@ -5431,7 +5425,7 @@ zio_done(zio_t *zio)
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd) &&
-		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+		    !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
 			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
 			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
 			if (ret != EALREADY) {
@@ -5446,7 +5440,7 @@ zio_done(zio_t *zio)
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
-		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
+		    !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
@@ -5467,7 +5461,7 @@ zio_done(zio_t *zio)
 		 */
 		if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
 		    zio->io_prop.zp_dedup) {
-			zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+			zio->io_post |= ZIO_POST_REEXECUTE;
 			zio->io_prop.zp_dedup = B_FALSE;
 		}
 		/*
@@ -5479,11 +5473,11 @@ zio_done(zio_t *zio)
 
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
-		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+		    !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
 			if (zio->io_error != ENOSPC)
-				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+				zio->io_post |= ZIO_POST_REEXECUTE;
 			else
-				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+				zio->io_post |= ZIO_POST_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
@@ -5492,10 +5486,11 @@ zio_done(zio_t *zio)
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
-			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+			zio->io_post |= ZIO_POST_SUSPEND;
 
-		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
-			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+		    !(zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)))
+			zio->io_post |= ZIO_POST_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
@@ -5514,7 +5509,8 @@ zio_done(zio_t *zio)
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
-	if ((zio->io_error || zio->io_reexecute) &&
+	if ((zio->io_error ||
+	    (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
@@ -5525,16 +5521,16 @@ zio_done(zio_t *zio)
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
-	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
-		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
+	    (zio->io_post & ZIO_POST_SUSPEND))
+		zio->io_post &= ~ZIO_POST_SUSPEND;
 
-	if (zio->io_reexecute) {
+	if (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)) {
 		/*
 		 * A Direct I/O operation that has a checksum verify error
 		 * should not attempt to reexecute. Instead, the error should
 		 * just be propagated back.
 		 */
-		ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+		ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
 
 		/*
 		 * This is a logical I/O that wants to reexecute.
@@ -5571,7 +5567,7 @@ zio_done(zio_t *zio)
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
-			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+			    (zio->io_post & ZIO_POST_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				/*
 				 * This is a rare code path, so we don't
@@ -5595,13 +5591,14 @@ zio_done(zio_t *zio)
 			 * "next_to_execute".
 			 */
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
-		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+		} else if (zio->io_post & ZIO_POST_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
 		} else {
+			ASSERT(zio->io_post & ZIO_POST_REEXECUTE);
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
@@ -5614,7 +5611,8 @@ zio_done(zio_t *zio)
 	}
 
 	ASSERT(list_is_empty(&zio->io_child_list));
-	ASSERT(zio->io_reexecute == 0);
+	ASSERT0(zio->io_post & ZIO_POST_REEXECUTE);
+	ASSERT0(zio->io_post & ZIO_POST_SUSPEND);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*

From 6af8db61b1ea489ade2d5344f4ae5f09c3d9faad Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 5 Jul 2025 13:22:22 +1000
Subject: [PATCH 11/72] metaslab: don't pass whole zio to throttle reserve APIs

They only need a couple of fields, and passing the whole thing just
invites fiddling around inside it, like modifying flags, which then
makes it much harder to understand the zio state from inside zio.c.

We move the flag update to just after a successful throttle in zio.c.

Rename ZIO_FLAG_IO_ALLOCATING to ZIO_FLAG_ALLOC_THROTTLED. The new name
better describes what it means, and makes it look less like
IO_IS_ALLOCATING, which means something different.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17508
---
 include/sys/metaslab.h      |  7 ++++---
 include/sys/zio.h           |  2 +-
 man/man8/zpool-events.8     |  4 ++--
 module/zcommon/zfs_valstr.c |  2 +-
 module/zfs/metaslab.c       | 31 +++++++++++++++----------------
 module/zfs/zio.c            | 37 +++++++++++++++++++++----------------
 6 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 4d57e52e846..36cbe06bacc 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -110,9 +110,10 @@ void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync);
 void metaslab_class_histogram_verify(metaslab_class_t *);
 uint64_t metaslab_class_fragmentation(metaslab_class_t *);
 uint64_t metaslab_class_expandable_space(metaslab_class_t *);
-boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *,
-    boolean_t, boolean_t *);
-boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
+    uint64_t, boolean_t, boolean_t *);
+boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, int,
+    uint64_t);
 void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
 const char *metaslab_class_get_name(metaslab_class_t *);
 uint64_t metaslab_class_get_alloc(metaslab_class_t *);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 01f3babeb4c..e65ac2803c4 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -196,7 +196,7 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_DONT_RETRY	(1ULL << 10)
 #define	ZIO_FLAG_NODATA		(1ULL << 12)
 #define	ZIO_FLAG_INDUCE_DAMAGE	(1ULL << 13)
-#define	ZIO_FLAG_IO_ALLOCATING	(1ULL << 14)
+#define	ZIO_FLAG_ALLOC_THROTTLED	(1ULL << 14)
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index 7af1917da6d..2d32dce2bb6 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -28,7 +28,7 @@
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\" Copyright (c) 2024, 2025, Klara, Inc.
 .\"
-.Dd May 27, 2025
+.Dd July 3, 2025
 .Dt ZPOOL-EVENTS 8
 .Os
 .
@@ -465,7 +465,7 @@ ZIO_FLAG_DONT_RETRY:0x00000400
 ZIO_FLAG_NODATA:0x00001000
 ZIO_FLAG_INDUCE_DAMAGE:0x00002000
 
-ZIO_FLAG_IO_ALLOCATING:0x00004000
+ZIO_FLAG_ALLOC_THROTTLED:0x00004000
 ZIO_FLAG_IO_RETRY:0x00008000
 ZIO_FLAG_PROBE:0x00010000
 ZIO_FLAG_TRYHARD:0x00020000
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
index 08813b81cf5..0cb9f584acc 100644
--- a/module/zcommon/zfs_valstr.c
+++ b/module/zcommon/zfs_valstr.c
@@ -203,7 +203,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag,
 	{ '?', "??", "[UNUSED 11]" },
 	{ '.', "ND", "NODATA" },
 	{ '.', "ID", "INDUCE_DAMAGE" },
-	{ '.', "AL", "IO_ALLOCATING" },
+	{ '.', "AT", "ALLOC_THROTTLED" },
 	{ '.', "RE", "IO_RETRY" },
 	{ '.', "PR", "PROBE" },
 	{ '.', "TH", "TRYHARD" },
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 43b94eba2d5..23eca0425b8 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5757,21 +5757,21 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 }
 
 /*
- * Reserve some allocation slots. The reservation system must be called
- * before we call into the allocator. If there aren't any available slots
- * then the I/O will be throttled until an I/O completes and its slots are
- * freed up. The function returns true if it was successful in placing
- * the reservation.
+ * Reserve some space for a future allocation. The reservation system must be
+ * called before we call into the allocator. If there isn't enough space
+ * available, the calling I/O will be throttled until another I/O completes and
+ * its reservation is released. The function returns true if it was successful
+ * in placing the reservation.
  */
 boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
-    boolean_t must, boolean_t *more)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int allocator,
+    int copies, uint64_t io_size, boolean_t must, boolean_t *more)
 {
-	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	if (mc->mc_alloc_io_size < zio->io_size) {
-		mc->mc_alloc_io_size = zio->io_size;
+	if (mc->mc_alloc_io_size < io_size) {
+		mc->mc_alloc_io_size = io_size;
 		metaslab_class_balance(mc, B_FALSE);
 	}
 	if (must || mca->mca_reserved <= mc->mc_alloc_max) {
@@ -5782,10 +5782,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
 		 * worst that can happen is few more I/Os get to allocation
 		 * earlier, that is not a problem.
 		 */
-		int64_t delta = slots * zio->io_size;
+		int64_t delta = copies * io_size;
 		*more = (atomic_add_64_nv(&mca->mca_reserved, delta) <=
 		    mc->mc_alloc_max);
-		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		return (B_TRUE);
 	}
 	*more = B_FALSE;
@@ -5793,13 +5792,13 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
 }
 
 boolean_t
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
-    zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int allocator,
+    int copies, uint64_t io_size)
 {
-	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	int64_t delta = slots * zio->io_size;
+	int64_t delta = copies * io_size;
 	return (atomic_add_64_nv(&mca->mca_reserved, -delta) <=
 	    mc->mc_alloc_max);
 }
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index c5f385aeb72..64f3d31f565 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1679,7 +1679,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 	 * If this is a retried I/O then we ignore it since we will
 	 * have already processed the original allocating I/O.
 	 */
-	if (flags & ZIO_FLAG_IO_ALLOCATING &&
+	if (flags & ZIO_FLAG_ALLOC_THROTTLED &&
 	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
 		ASSERT(pio->io_metaslab_class != NULL);
 		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
@@ -1689,7 +1689,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
-		flags &= ~ZIO_FLAG_IO_ALLOCATING;
+		flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
 	}
 
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
@@ -3151,7 +3151,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_GANG_HEADER;
-	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+	if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(has_data);
 
@@ -3186,10 +3186,11 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	zio_gang_inherit_allocator(pio, zio);
-	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+	if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
 		boolean_t more;
-		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies,
-		    zio, B_TRUE, &more));
+		VERIFY(metaslab_class_throttle_reserve(mc, zio->io_allocator,
+		    gbh_copies, zio->io_size, B_TRUE, &more));
+		zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
 	}
 
 	/*
@@ -4072,9 +4073,11 @@ zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more)
 	 * reserve then we throttle.
 	 */
 	if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
-	    zio->io_prop.zp_copies, zio, B_FALSE, more)) {
+	    zio->io_allocator, zio->io_prop.zp_copies, zio->io_size,
+	    B_FALSE, more)) {
 		return (NULL);
 	}
+	zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
 
 	avl_remove(&mca->mca_tree, zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
@@ -4230,13 +4233,14 @@ zio_dva_allocate(zio_t *zio)
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
 		 */
-		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
 			if (metaslab_class_throttle_unreserve(mc,
-			    zio->io_prop.zp_copies, zio)) {
+			    zio->io_allocator, zio->io_prop.zp_copies,
+			    zio->io_size)) {
 				zio_allocate_dispatch(zio->io_metaslab_class,
 				    zio->io_allocator);
 			}
-			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+			zio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
 		}
 
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
@@ -5196,7 +5200,7 @@ zio_ready(zio_t *zio)
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
@@ -5207,8 +5211,8 @@ zio_ready(zio_t *zio)
 			 * issue the next I/O to allocate.
 			 */
 			if (metaslab_class_throttle_unreserve(
-			    zio->io_metaslab_class, zio->io_prop.zp_copies,
-			    zio)) {
+			    zio->io_metaslab_class, zio->io_allocator,
+			    zio->io_prop.zp_copies, zio->io_size)) {
 				zio_allocate_dispatch(zio->io_metaslab_class,
 				    zio->io_allocator);
 			}
@@ -5267,7 +5271,7 @@ zio_dva_throttle_done(zio_t *zio)
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
-	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+	ASSERT(zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED);
 
 	/*
 	 * Parents of gang children can have two flavors -- ones that allocated
@@ -5291,7 +5295,8 @@ zio_dva_throttle_done(zio_t *zio)
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
 	    pio->io_allocator, flags, pio->io_size, tag);
 
-	if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) {
+	if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
+	    pio->io_allocator, 1, pio->io_size)) {
 		zio_allocate_dispatch(zio->io_metaslab_class,
 		    pio->io_allocator);
 	}
@@ -5322,7 +5327,7 @@ zio_done(zio_t *zio)
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
-	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+	if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED &&
 	    zio->io_child_type == ZIO_CHILD_VDEV)
 		zio_dva_throttle_done(zio);
 

From 523d9d6007effb01bf6a44f9942158f47a65e2b2 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Wed, 9 Jul 2025 07:10:00 +0500
Subject: [PATCH 12/72] Validate mountpoint on path-based unmount using statx

Use statx to verify that path-based unmounts proceed only if the
mountpoint reported by statx matches the MNTTAB entry reported by
libzfs, aborting the operation if they differ. Align
`zfs umount /path` behavior with `zfs umount dataset`.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17481
---
 cmd/zfs/zfs_main.c                     | 19 ++++++++++++++
 config/user-statx.m4                   | 34 ++++++++++++++++++++++++
 config/user.m4                         |  1 +
 lib/libspl/include/os/linux/sys/stat.h |  5 ++++
 lib/libspl/os/linux/getmntany.c        | 36 +++++++++++++++++++++-----
 5 files changed, 89 insertions(+), 6 deletions(-)
 create mode 100644 config/user-statx.m4

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 665e183485f..842e5d088d4 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -7729,6 +7729,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 	struct extmnttab entry;
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	ino_t path_inode;
+	char *zfs_mntpnt, *entry_mntpnt;
 
 	/*
 	 * Search for the given (major,minor) pair in the mount table.
@@ -7770,6 +7771,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 		goto out;
 	}
 
+	/*
+	 * If the filesystem is mounted, check that its mountpoint matches
+	 * the mnttab entry found for the provided path. If it doesn't,
+	 * then we should not proceed further.
+	 */
+	entry_mntpnt = strdup(entry.mnt_mountp);
+	if (zfs_is_mounted(zhp, &zfs_mntpnt)) {
+		if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) {
+			(void) fprintf(stderr, gettext("cannot %s '%s': "
+			    "not an original mountpoint\n"), cmdname, path);
+			free(zfs_mntpnt);
+			free(entry_mntpnt);
+			goto out;
+		}
+		free(zfs_mntpnt);
+	}
+	free(entry_mntpnt);
+
 	if (op == OP_SHARE) {
 		char nfs_mnt_prop[ZFS_MAXPROPLEN];
 		char smbshare_prop[ZFS_MAXPROPLEN];
diff --git a/config/user-statx.m4 b/config/user-statx.m4
new file mode 100644
index 00000000000..0315f93e0c2
--- /dev/null
+++ b/config/user-statx.m4
@@ -0,0 +1,34 @@
+dnl #
+dnl # Check for statx() function and STATX_MNT_ID availability
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
+	AC_CHECK_HEADERS([linux/stat.h],
+		[have_stat_headers=yes],
+		[have_stat_headers=no])
+
+	AS_IF([test "x$have_stat_headers" = "xyes"], [
+		AC_CHECK_FUNC([statx], [
+			AC_DEFINE([HAVE_STATX], [1], [statx() is available])
+
+			dnl Check for STATX_MNT_ID availability
+			AC_MSG_CHECKING([for STATX_MNT_ID])
+			AC_COMPILE_IFELSE([
+				AC_LANG_PROGRAM([[
+					#include <linux/stat.h>
+				]], [[
+					struct statx stx;
+					int mask = STATX_MNT_ID;
+					(void)mask;
+					(void)stx.stx_mnt_id;
+				]])
+			], [
+				AC_MSG_RESULT([yes])
+				AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available])
+			], [
+				AC_MSG_RESULT([no])
+			])
+		])
+	], [
+		AC_MSG_WARN([linux/stat.h not found; skipping statx support])
+	])
+])  dnl end AC_DEFUN
diff --git a/config/user.m4 b/config/user.m4
index badd920d2b8..62e59ed9443 100644
--- a/config/user.m4
+++ b/config/user.m4
@@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 		ZFS_AC_CONFIG_USER_LIBUDEV
 		ZFS_AC_CONFIG_USER_LIBUUID
 		ZFS_AC_CONFIG_USER_LIBBLKID
+		ZFS_AC_CONFIG_USER_STATX
 	])
 	ZFS_AC_CONFIG_USER_LIBTIRPC
 	ZFS_AC_CONFIG_USER_LIBCRYPTO
diff --git a/lib/libspl/include/os/linux/sys/stat.h b/lib/libspl/include/os/linux/sys/stat.h
index 488554f4e84..a605af962a6 100644
--- a/lib/libspl/include/os/linux/sys/stat.h
+++ b/lib/libspl/include/os/linux/sys/stat.h
@@ -31,6 +31,11 @@
 
 #include <sys/mount.h> /* for BLKGETSIZE64 */
 
+#ifdef HAVE_STATX
+#include <fcntl.h>
+#include <linux/stat.h>
+#endif
+
 /*
  * Emulate Solaris' behavior of returning the block device size in fstat64().
  */
diff --git a/lib/libspl/os/linux/getmntany.c b/lib/libspl/os/linux/getmntany.c
index dcdf7b3d6fc..ee1cdf59b9e 100644
--- a/lib/libspl/os/linux/getmntany.c
+++ b/lib/libspl/os/linux/getmntany.c
@@ -85,13 +85,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp)
 }
 
 static int
-getextmntent_impl(FILE *fp, struct extmnttab *mp)
+getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id)
 {
 	int ret;
 	struct stat64 st;
 
+	*mnt_id = 0;
 	ret = _sol_getmntent(fp, (struct mnttab *)mp);
 	if (ret == 0) {
+#ifdef HAVE_STATX_MNT_ID
+		struct statx stx;
+		if (statx(AT_FDCWD, mp->mnt_mountp,
+		    AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
+		    STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID))
+			*mnt_id = stx.stx_mnt_id;
+#endif
 		if (stat64(mp->mnt_mountp, &st) != 0) {
 			mp->mnt_major = 0;
 			mp->mnt_minor = 0;
@@ -110,6 +118,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 	struct stat64 st;
 	FILE *fp;
 	int match;
+	boolean_t have_mnt_id = B_FALSE;
+	uint64_t target_mnt_id = 0;
+	uint64_t entry_mnt_id;
+#ifdef HAVE_STATX_MNT_ID
+	struct statx stx;
+#endif
 
 	if (strlen(path) >= MAXPATHLEN) {
 		(void) fprintf(stderr, "invalid object; pathname too long\n");
@@ -128,6 +142,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 		return (-1);
 	}
 
+#ifdef HAVE_STATX_MNT_ID
+	if (statx(AT_FDCWD, path, AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
+	    STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) {
+		have_mnt_id = B_TRUE;
+		target_mnt_id = stx.stx_mnt_id;
+	}
+#endif
 
 	if ((fp = fopen(MNTTAB, "re")) == NULL) {
 		(void) fprintf(stderr, "cannot open %s\n", MNTTAB);
@@ -139,12 +160,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 	 */
 
 	match = 0;
-	while (getextmntent_impl(fp, entry) == 0) {
-		if (makedev(entry->mnt_major, entry->mnt_minor) ==
-		    statbuf->st_dev) {
-			match = 1;
-			break;
+	while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) {
+		if (have_mnt_id) {
+			match = (entry_mnt_id == target_mnt_id);
+		} else {
+			match = makedev(entry->mnt_major, entry->mnt_minor) ==
+			    statbuf->st_dev;
 		}
+		if (match)
+			break;
 	}
 	(void) fclose(fp);
 

From 4c2a7f85d52f4b249aeb4c76a53ebb6e72346d0d Mon Sep 17 00:00:00 2001
From: rmacklem <64620010+rmacklem@users.noreply.github.com>
Date: Tue, 8 Jul 2025 19:11:22 -0700
Subject: [PATCH 13/72] FreeBSD: Add support for _PC_HAS_HIDDENSYSTEM

In FreeBSD there is now a pathconf name _PC_HAS_HIDDENSYSTEM.
This patch adds support for it to OpenZFS.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rick Macklem <rmacklem@uoguelph.ca>
Closes #17518
---
 module/os/freebsd/zfs/zfs_vnops_os.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index f53c94d3a0c..25409ceaf56 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -5428,6 +5428,11 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 			return (0);
 		}
 		return (EINVAL);
+#ifdef _PC_HAS_HIDDENSYSTEM
+	case _PC_HAS_HIDDENSYSTEM:
+		*ap->a_retval = 1;
+		return (0);
+#endif
 	default:
 		return (vop_stdpathconf(ap));
 	}

From e845be28e72c8386686ea5634d048288a3f0605b Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Date: Thu, 3 Jul 2025 11:03:30 -0700
Subject: [PATCH 14/72] Add no-upgrade featureflag

Adds a feature flag that is not enabled during upgrades unless listed
explicitly. This is useful for features that could cause issues unless
applied carefully; for example, a feature that could make a root pool
unbootable if bootloaders don't yet have support for it.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17004
---
 cmd/zpool/zpool_main.c     | 13 ++++++++++---
 include/zfeature_common.h  | 10 +++++++++-
 lib/libzfs/libzfs_pool.c   |  3 ++-
 lib/libzfs/libzfs_status.c |  3 ++-
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index e62441894cd..f7cd73085f0 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -11330,7 +11330,8 @@ upgrade_enable_all(zpool_handle_t *zhp, int *countp)
 		const char *fname = spa_feature_table[i].fi_uname;
 		const char *fguid = spa_feature_table[i].fi_guid;
 
-		if (!spa_feature_table[i].fi_zfs_mod_supported)
+		if (!spa_feature_table[i].fi_zfs_mod_supported ||
+		    (spa_feature_table[i].fi_flags & ZFEATURE_FLAG_NO_UPGRADE))
 			continue;
 
 		if (!nvlist_exists(enabled, fguid) && requested_features[i]) {
@@ -11485,7 +11486,11 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
 					    "Note that the pool "
 					    "'compatibility' feature can be "
 					    "used to inhibit\nfeature "
-					    "upgrades.\n\n"));
+					    "upgrades.\n\n"
+					    "Features marked with (*) are not "
+					    "applied automatically on upgrade, "
+					    "and\nmust be applied explicitly "
+					    "with zpool-set(7).\n\n"));
 					(void) printf(gettext("POOL  "
 					    "FEATURE\n"));
 					(void) printf(gettext("------"
@@ -11499,7 +11504,9 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
 					poolfirst = B_FALSE;
 				}
 
-				(void) printf(gettext("      %s\n"), fname);
+				(void) printf(gettext("      %s%s\n"), fname,
+				    spa_feature_table[i].fi_flags &
+				    ZFEATURE_FLAG_NO_UPGRADE ? "(*)" : "");
 			}
 			/*
 			 * If they did "zpool upgrade -a", then we could
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 85537c1ae96..5d37bb95685 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -103,7 +103,15 @@ typedef enum zfeature_flags {
 	/* Activate this feature at the same time it is enabled. */
 	ZFEATURE_FLAG_ACTIVATE_ON_ENABLE =	(1 << 2),
 	/* Each dataset has a field set if it has ever used this feature. */
-	ZFEATURE_FLAG_PER_DATASET =		(1 << 3)
+	ZFEATURE_FLAG_PER_DATASET =		(1 << 3),
+	/*
+	 * This feature isn't enabled by zpool upgrade; it must be explicitly
+	 * listed to be enabled. It will also be applied if listed in an
+	 * explicitly provided compatibility list. This flag can be removed
+	 * from a given feature once support is sufficiently widespread, or
+	 * worries about backwards compatibility are no longer relevant.
+	 */
+	ZFEATURE_FLAG_NO_UPGRADE = 		(1 << 4)
 } zfeature_flags_t;
 
 typedef enum zfeature_type {
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index fb18c430975..dc2fb1a8c09 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5093,9 +5093,10 @@ zpool_load_compat(const char *compat, boolean_t *features, char *report,
 	/* special cases (unset), "" and "off" => enable all features */
 	if (compat == NULL || compat[0] == '\0' ||
 	    strcmp(compat, ZPOOL_COMPAT_OFF) == 0) {
-		if (features != NULL)
+		if (features != NULL) {
 			for (uint_t i = 0; i < SPA_FEATURES; i++)
 				features[i] = B_TRUE;
+		}
 		if (report != NULL)
 			strlcpy(report, gettext("all features enabled"), rlen);
 		return (ZPOOL_COMPATIBILITY_OK);
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
index 1ee70396823..bdddefb9216 100644
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@@ -484,7 +484,8 @@ check_status(nvlist_t *config, boolean_t isimport,
 		}
 		for (i = 0; i < SPA_FEATURES; i++) {
 			zfeature_info_t *fi = &spa_feature_table[i];
-			if (!fi->fi_zfs_mod_supported)
+			if (!fi->fi_zfs_mod_supported ||
+			    (fi->fi_flags & ZFEATURE_FLAG_NO_UPGRADE))
 				continue;
 			if (c_features[i] && !nvlist_exists(feat, fi->fi_guid))
 				return (ZPOOL_STATUS_FEAT_DISABLED);

From a981cb69e44fc736d94f64f432941ba247143687 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Date: Thu, 23 Jan 2025 16:26:09 -0800
Subject: [PATCH 15/72] Implement dynamic gang header sizes

ZFS gang block headers are currently fixed at 512 bytes. This is
increasingly wasteful in the era of larger disk sector sizes. This PR
allows any size allocation to work as a gang header. It also contains
supporting changes to ZDB to make gang headers easier to work with.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17004
---
 cmd/zdb/zdb.c                                 |   6 +-
 include/sys/vdev.h                            |   2 +-
 include/sys/zio.h                             |  45 ++++--
 include/zfeature_common.h                     |   1 +
 lib/libzfs/libzfs.abi                         |  12 +-
 man/man7/zpool-features.7                     |  13 ++
 module/zcommon/zfeature_common.c              |   6 +
 module/zfs/metaslab.c                         |  14 +-
 module/zfs/zio.c                              | 143 +++++++++++++-----
 module/zfs/zio_checksum.c                     |  25 ++-
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   3 +
 .../zpool_create_features_001_pos.ksh         |   2 +-
 .../zpool_create_features_005_pos.ksh         |   3 +
 .../cli_root/zpool_get/zpool_get.cfg          |   1 +
 .../gang_blocks_dyn_header_neg.ksh            |  53 +++++++
 .../gang_blocks_dyn_header_pos.ksh            |  73 +++++++++
 .../gang_blocks/gang_blocks_dyn_multi.ksh     |  54 +++++++
 18 files changed, 387 insertions(+), 72 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 45eb9c78365..d6f144c0e20 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -8588,9 +8588,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 }
 
 static void
-zdb_dump_gbh(void *buf, int flags)
+zdb_dump_gbh(void *buf, uint64_t size, int flags)
 {
-	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+	zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags);
 }
 
 static void
@@ -9073,7 +9073,7 @@ zdb_read_block(char *thing, spa_t *spa)
 		zdb_dump_indirect((blkptr_t *)buf,
 		    orig_lsize / sizeof (blkptr_t), flags);
 	else if (flags & ZDB_FLAG_GBH)
-		zdb_dump_gbh(buf, flags);
+		zdb_dump_gbh(buf, lsize, flags);
 	else
 		zdb_dump_block(thing, buf, lsize, flags);
 
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7f457c3a0b7..7f5a9aaef1b 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -148,7 +148,7 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 static inline uint64_t
 vdev_gang_header_asize(vdev_t *vd)
 {
-	return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0));
+	return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0));
 }
 
 extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index e65ac2803c4..b139c9de485 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -59,21 +59,36 @@ typedef struct zio_eck {
 
 /*
  * Gang block headers are self-checksumming and contain an array
- * of block pointers.
+ * of block pointers. The old gang block size has enough room for 3 blkptrs,
+ * while new gang blocks can store more.
+ *
+ * Layout:
+ * +--------+--------+--------+-----+---------+-----------+
+ * |        |        |        |     |         |           |
+ * | blkptr | blkptr | blkptr | ... | padding | zio_eck_t |
+ * |   1    |   2    |   3    |     |         |           |
+ * +--------+--------+--------+-----+---------+-----------+
+ *   128B     128B     128B             88B        40B
  */
-#define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
-#define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_eck_t)) / sizeof (blkptr_t))
-#define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_eck_t) - \
-	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
-	sizeof (uint64_t))
+#define	SPA_OLD_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
+typedef void zio_gbh_phys_t;
 
-typedef struct zio_gbh {
-	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
-	uint64_t		zg_filler[SPA_GBH_FILLER];
-	zio_eck_t		zg_tail;
-} zio_gbh_phys_t;
+static inline uint64_t
+gbh_nblkptrs(uint64_t size) {
+	ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
+	return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t));
+}
+
+static inline zio_eck_t *
+gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) {
+	ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t)));
+	return ((zio_eck_t *)((uintptr_t)gbh + size - sizeof (zio_eck_t)));
+}
+
+static inline blkptr_t *
+gbh_bp(zio_gbh_phys_t *gbh, int bp) {
+	return (&((blkptr_t *)gbh)[bp]);
+}
 
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
@@ -398,7 +413,9 @@ typedef struct zio_vsd_ops {
 
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
-	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
+	uint64_t		gn_gangblocksize;
+	uint64_t		gn_allocsize;
+	struct zio_gang_node	*gn_child[];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 5d37bb95685..53e1ecae379 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -87,6 +87,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_FAST_DEDUP,
 	SPA_FEATURE_LONGNAME,
 	SPA_FEATURE_LARGE_MICROZAP,
+	SPA_FEATURE_DYNAMIC_GANG_HEADER,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 35ecdca767d..ecfd40efc42 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -631,7 +631,7 @@
     <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2464' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2520' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6210,7 +6210,8 @@
       <enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
       <enumerator name='SPA_FEATURE_LONGNAME' value='42'/>
       <enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/>
-      <enumerator name='SPA_FEATURES' value='44'/>
+      <enumerator name='SPA_FEATURE_DYNAMIC_GANG_HEADER' value='44'/>
+      <enumerator name='SPA_FEATURES' value='45'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@@ -9394,8 +9395,8 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='19712' id='fd4573e5'>
-      <subrange length='44' type-id='7359adad' id='cf8ba455'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='20160' id='b948da70'>
+      <subrange length='45' type-id='7359adad' id='cb8ddca0'/>
     </array-type-def>
     <enum-decl name='zfeature_flags' id='6db816a4'>
       <underlying-type type-id='9cac1fee'/>
@@ -9403,6 +9404,7 @@
       <enumerator name='ZFEATURE_FLAG_MOS' value='2'/>
       <enumerator name='ZFEATURE_FLAG_ACTIVATE_ON_ENABLE' value='4'/>
       <enumerator name='ZFEATURE_FLAG_PER_DATASET' value='8'/>
+      <enumerator name='ZFEATURE_FLAG_NO_UPGRADE' value='16'/>
     </enum-decl>
     <typedef-decl name='zfeature_flags_t' type-id='6db816a4' id='fc329033'/>
     <enum-decl name='zfeature_type' id='c4fa2355'>
@@ -9472,7 +9474,7 @@
     <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
     <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
     <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
-    <var-decl name='spa_feature_table' type-id='fd4573e5' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
+    <var-decl name='spa_feature_table' type-id='b948da70' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
     <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
     <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 8ae1b2b3b92..7ec27116440 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -493,6 +493,19 @@ vdev type, or when adding a new
 .Sy draid
 vdev to an existing pool.
 .
+.feature com.klarasystems dynamic_gang_header no
+This feature enables larger gang headers based on the sector size of the pool.
+When enabled, gang headers will use the entire space allocated for them, instead
+of always restricting themselves to 512 bytes.
+This can reduce the need for nested gang trees in extreme fragmentation
+scenarios.
+.Pp
+This feature becomes active when a gang header is written that is larger than
+512 bytes.
+This feature is not enabled by
+.Xr zpool-upgrade 8 .
+Instead, it must be manually enabled, or be part of a compatibility file.
+.
 .feature org.illumos edonr no extensible_dataset
 This feature enables the use of the Edon-R hash algorithm for checksum,
 including for nopwrite
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 0362d82efbc..8ac1c7cabd6 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -786,6 +786,12 @@ zpool_feature_init(void)
 		    ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures);
 	}
 
+	zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER,
+	    "com.klarasystems:dynamic_gang_header", "dynamic_gang_header",
+	    "Support for dynamically sized gang headers",
+	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE,
+	    ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
+
 	zfs_mod_list_supported_free(sfeatures);
 }
 
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 23eca0425b8..082d379cded 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5974,12 +5974,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
-	uint64_t cur_psize = 0;
-
+	uint64_t smallest_psize = UINT64_MAX;
 	for (int d = 0; d < ndvas; d++) {
-		error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
-		    dva, d, hintdva, txg, flags, zal, allocator,
-		    actual_psize ? &cur_psize : NULL);
+		uint64_t cur_psize = 0;
+		error = metaslab_alloc_dva_range(spa, mc, psize,
+		    MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
+		    flags, zal, allocator, actual_psize ? &cur_psize : NULL);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -5999,13 +5999,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 			    DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
 			    tag);
 			if (actual_psize)
-				max_psize = MIN(cur_psize, max_psize);
+				smallest_psize = MIN(cur_psize, smallest_psize);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 	if (actual_psize)
-		*actual_psize = max_psize;
+		*actual_psize = smallest_psize;
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 64f3d31f565..67ee3d5ba2e 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2743,11 +2743,14 @@ zio_resume_wait(spa_t *spa)
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
- * A gang block consists of a gang header (zio_gbh_phys_t) and up to
- * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
- * an indirect block: it's an array of block pointers.  It consumes
- * only one sector and hence is allocatable regardless of fragmentation.
- * The gang header's bps point to its gang members, which hold the data.
+ * A gang block consists of a gang header and up to gbh_nblkptrs(size)
+ * gang members. The gang header is like an indirect block: it's an array
+ * of block pointers, though the header has a small tail (a zio_eck_t)
+ * that stores an embedded checksum. It is allocated using only a single
+ * sector as the requested size, and hence is allocatable regardless of
+ * fragmentation. Its size is determined by the smallest allocatable
+ * asize of the vdevs it was allocated on. The gang header's bps point
+ * to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
@@ -2826,10 +2829,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
 
 	if (gn != NULL) {
 		abd_t *gbh_abd =
-		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+		    abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
-		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
-		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+		    gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
+		    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
@@ -2900,14 +2903,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 static zio_gang_node_t *
-zio_gang_node_alloc(zio_gang_node_t **gnpp)
+zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
 {
 	zio_gang_node_t *gn;
 
 	ASSERT(*gnpp == NULL);
 
-	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
-	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+	gn = kmem_zalloc(sizeof (*gn) +
+	    (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
+	gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
+	gn->gn_gbh = zio_buf_alloc(gangblocksize);
 	*gnpp = gn;
 
 	return (gn);
@@ -2918,11 +2923,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
-	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+	for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
 		ASSERT(gn->gn_child[g] == NULL);
 
-	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
-	kmem_free(gn, sizeof (*gn));
+	zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
+	kmem_free(gn, sizeof (*gn) +
+	    (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
 	*gnpp = NULL;
 }
 
@@ -2934,7 +2940,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
 	if (gn == NULL)
 		return;
 
-	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+	for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
 		zio_gang_tree_free(&gn->gn_child[g]);
 
 	zio_gang_node_free(gnpp);
@@ -2943,13 +2949,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
-	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
-	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+	uint64_t gangblocksize = UINT64_MAX;
+	if (spa_feature_is_active(gio->io_spa,
+	    SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+		spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
+		for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
+			vdev_t *vd = vdev_lookup_top(gio->io_spa,
+			    DVA_GET_VDEV(&bp->blk_dva[dva]));
+			uint64_t asize = vdev_gang_header_asize(vd);
+			gangblocksize = MIN(gangblocksize, asize);
+		}
+		spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
+	} else {
+		gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+	}
+	ASSERT3U(gangblocksize, !=, UINT64_MAX);
+	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
+	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
-	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
 	    zio_gang_tree_assemble_done, gn, gio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
@@ -2972,13 +2993,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
-	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+	/*
+	 * If this was an old-style gangblock, the gangblocksize should have
+	 * been updated in zio_checksum_error to reflect that.
+	 */
+	ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
+	    ==, ZEC_MAGIC);
 
 	abd_free(zio->io_abd);
 
-	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
-		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+	for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+		blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
 		if (!BP_IS_GANG(gbp))
 			continue;
 		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@@ -3003,10 +3028,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
-		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+		ASSERT3U(gbh_eck(gn->gn_gbh,
+		    gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
 
-		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
-			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+		for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+			blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
 			if (BP_IS_HOLE(gbp))
 				continue;
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@@ -3113,6 +3139,13 @@ zio_write_gang_done(zio_t *zio)
 		abd_free(zio->io_abd);
 }
 
+static void
+zio_update_feature(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx);
+}
+
 static zio_t *
 zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
@@ -3158,13 +3191,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		flags |= METASLAB_ASYNC_ALLOC;
 	}
 
-	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+	uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+	uint64_t candidate = gangblocksize;
+	error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
-	    &pio->io_alloc_list, pio->io_allocator, pio);
+	    &pio->io_alloc_list, pio->io_allocator, pio, &candidate);
 	if (error) {
 		pio->io_error = error;
 		return (pio);
 	}
+	if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+		gangblocksize = candidate;
 
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
@@ -3173,15 +3210,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
-	gn = zio_gang_node_alloc(gnpp);
+	gn = zio_gang_node_alloc(gnpp, gangblocksize);
 	gbh = gn->gn_gbh;
-	memset(gbh, 0, SPA_GANGBLOCKSIZE);
-	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+	memset(gbh, 0, gangblocksize);
+	gbh_abd = abd_get_from_buf(gbh, gangblocksize);
 
 	/*
 	 * Create the gang header.
 	 */
-	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
@@ -3198,7 +3235,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	 * opportunistic allocations. If that fails to generate enough
 	 * space, we fall back to normal zio_write calls for nested gang.
 	 */
-	for (int g = 0; resid != 0; g++) {
+	int g;
+	boolean_t any_failed = B_FALSE;
+	for (g = 0; resid != 0; g++) {
 		flags &= METASLAB_ASYNC_ALLOC;
 		flags |= METASLAB_GANG_CHILD;
 		zp.zp_checksum = gio->io_prop.zp_checksum;
@@ -3219,9 +3258,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
 		uint64_t min_size = zio_roundup_alloc_size(spa,
-		    resid / (SPA_GBH_NBLKPTRS - g));
+		    resid / (gbh_nblkptrs(gangblocksize) - g));
 		min_size = MIN(min_size, resid);
-		bp = &gbh->zg_blkptr[g];
+		bp = &((blkptr_t *)gbh)[g];
 
 		zio_alloc_list_t cio_list;
 		metaslab_trace_init(&cio_list);
@@ -3231,6 +3270,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		    flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
 
 		boolean_t allocated = error == 0;
+		any_failed |= !allocated;
 
 		uint64_t psize = allocated ? MIN(resid, allocated_size) :
 		    min_size;
@@ -3262,6 +3302,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		zio_nowait(cio);
 	}
 
+	/*
+	 * If we used more gang children than the old limit, we must already be
+	 * using the new headers. No need to update anything, just move on.
+	 *
+	 * Otherwise, we might be in a case where we need to turn on the new
+	 * feature, so we check that. We enable the new feature if we didn't
+	 * manage to fit everything into 3 gang children and we could have
+	 * written more than that.
+	 */
+	if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
+		ASSERT(spa_feature_is_active(spa,
+		    SPA_FEATURE_DYNAMIC_GANG_HEADER));
+	} else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
+	    spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
+	    !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+		dmu_tx_t *tx =
+		    dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
+		dsl_sync_task_nowait(spa->spa_dsl_pool,
+		    zio_update_feature,
+		    (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
+		dmu_tx_commit(tx);
+	}
+
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
@@ -4331,9 +4394,9 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 	}
 
 	if (gn != NULL) {
-		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+		for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
 			zio_dva_unallocate(zio, gn->gn_child[g],
-			    &gn->gn_gbh->zg_blkptr[g]);
+			    gbh_bp(gn->gn_gbh, g));
 		}
 	}
 }
@@ -5262,6 +5325,7 @@ zio_dva_throttle_done(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	int flags = METASLAB_ASYNC_ALLOC;
 	const void *tag = pio;
+	uint64_t size = pio->io_size;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@@ -5277,10 +5341,13 @@ zio_dva_throttle_done(zio_t *zio)
 	 * Parents of gang children can have two flavors -- ones that allocated
 	 * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
 	 * allocated the constituent blocks.  The first use their parent as tag.
+	 * We set the size to match the original allocation call for that case.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG &&
-	    (pio->io_flags & ZIO_FLAG_IO_REWRITE))
+	    (pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
 		tag = zio_unique_parent(pio);
+		size = SPA_OLD_GANGBLOCKSIZE;
+	}
 
 	ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
 	    (pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@@ -5293,7 +5360,7 @@ zio_dva_throttle_done(zio_t *zio)
 	ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
 
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
-	    pio->io_allocator, flags, pio->io_size, tag);
+	    pio->io_allocator, flags, size, tag);
 
 	if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
 	    pio->io_allocator, 1, pio->io_size)) {
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index a91775b04af..8cec3a6f562 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -545,14 +545,35 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 	int error;
-	uint64_t size = (bp == NULL ? zio->io_size :
-	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size;
 	uint64_t offset = zio->io_offset;
 	abd_t *data = zio->io_abd;
 	spa_t *spa = zio->io_spa;
 
+	if (bp && BP_IS_GANG(bp)) {
+		if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+			size = zio->io_size;
+		else
+			size = SPA_OLD_GANGBLOCKSIZE;
+	}
+
 	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
 	    offset, info);
+	if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) {
+		/*
+		 * It's possible that this is an old gang block. Rerun
+		 * the checksum with the old size; if that passes, then
+		 * update the gangblocksize appropriately.
+		 */
+		error = zio_checksum_error_impl(spa, bp, checksum, data,
+		    SPA_OLD_GANGBLOCKSIZE, offset, info);
+		if (error == 0) {
+			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+			zio_t *pio = zio_unique_parent(zio);
+			zio_gang_node_t *gn = pio->io_private;
+			gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+		}
+	}
 
 	if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
 		error = zio_handle_fault_injection(zio, ECKSUM);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 376518e9f37..214fa70fe58 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -739,7 +739,8 @@ tags = ['functional', 'features', 'large_dnode']
 
 [tests/functional/gang_blocks]
 tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
-    'gang_blocks_ddt_copies']
+    'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos',
+    'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi']
 tags = ['functional', 'gang_blocks']
 
 [tests/functional/grow]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 20a17a53110..8813f262753 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1579,6 +1579,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/gang_blocks/gang_blocks_001_pos.ksh \
 	functional/gang_blocks/gang_blocks_ddt_copies.ksh \
 	functional/gang_blocks/gang_blocks_redundant.ksh \
+	functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \
+	functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \
+	functional/gang_blocks/gang_blocks_dyn_multi.ksh \
 	functional/gang_blocks/setup.ksh \
 	functional/grow/grow_pool_001_pos.ksh \
 	functional/grow/grow_replicas_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
index f96d291ccb1..94ccabeb80a 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh
@@ -50,7 +50,7 @@ function cleanup
 
 function check_features
 {
-	for state in $(zpool get all $TESTPOOL | \
+	for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \
 	    awk '$2 ~ /feature@/ { print $3 }'); do
 		if [[ "$state" != "enabled" && "$state" != "active" ]]; then
 			log_fail "some features are not enabled on new pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
index 7366a46f9c8..676aca1a20a 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh
@@ -58,6 +58,9 @@ function check_features
 				return 1;
 			fi
 		else
+			if [[ "feature@dynamic_gang_header" == "${2}" ]]; then
+				continue
+			fi
 			# Failure other features must be enabled or active.
 			if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then
 				return 2;
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index cf5e0961f9f..6de0869765a 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -91,6 +91,7 @@ typeset -a properties=(
     "feature@device_rebuild"
     "feature@draid"
     "feature@redaction_list_spill"
+    "feature@dynamic_gang_header"
 )
 
 if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh
new file mode 100755
index 00000000000..e9cb1d2a034
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh
@@ -0,0 +1,53 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that we don't use larger gang headers on ashift=9 pools
+#
+# Strategy:
+# 1. Create an ashift=9 pool with dynamic gang headers enabled.
+# 2. Set metaslab_force_ganging to force multi-level ganging.
+# 3. Verify that a large file gangs multi-level and the feature stays enabled.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we don't use large gang headers on small-ashift pools".
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 200000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200)
+gangs=$(echo "$leaves" | grep -c gang)
+[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool"
+log_pass "We don't use large gang headers on small-ashift pools".
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
new file mode 100755
index 00000000000..e6d6629e9e1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
@@ -0,0 +1,73 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that we use larger gang headers on ashift=12 pools
+#
+# Strategy:
+# 1. Create an ashift=12 pool with dynamic gang headers enabled.
+# 2. Set metaslab_force_ganging to force ganging.
+# 3. Verify that a first large file has at most 3 gang leaves but makes
+#    the feature active, and that a second one has more than 3 leaves.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we use large gang headers on large-ashift pools".
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 200000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+# Before the first write the feature is expected to be merely enabled.
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled"
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
+first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
+check_not_gang_dva $first_dva
+
+num_leaves=$(echo "$leaves" | wc -l)
+[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\""
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+# The feature is active now, so the second file must use a larger header.
+path="${mountpoint}/file2"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
+leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
+first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
+check_not_gang_dva $first_dva
+
+num_leaves=$(echo "$leaves" | wc -l)
+[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\""
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+log_pass "We use large gang headers on large-ashift pools".
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
new file mode 100755
index 00000000000..2ffe24968f1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
@@ -0,0 +1,54 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that multi-level ganging still works with dynamic headers
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers and ashift=12.
+# 2. Set metaslab_force_ganging to force multi-level ganging.
+# 3. Verify that a large file gangs multi-level and the feature is active.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we can still multi-level gang with large headers."
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 50000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=16M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200)
+gangs=$(echo "$leaves" | grep -c gang)
+[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+
+log_pass "We can still multi-level gang with large headers."

From ea38787f2ec95515ef7c971f5cc3b9316fc9affd Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 9 Jul 2025 14:34:02 -0700
Subject: [PATCH 16/72] Revert "Fix incorrect expected error in ztest"

This reverts commit 2076011e0c4c2d8ad6a59534a4784a6aa5f4f3df.  The
comment which explains EINVAL should be expected for this case was
wrong, not the code.  The kernel will return ENOTSUP when attaching
a distributed spare to the wrong top-level dRAID vdev.  See the
check for this in spa_vdev_attach().

Reviewed-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17503
---
 cmd/ztest.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/ztest.c b/cmd/ztest.c
index c7982c59ff4..e334641fef1 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -3882,7 +3882,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 *
 	 * If newvd is a distributed spare and it's being attached to a
-	 * dRAID which is not its parent it should fail with EINVAL.
+	 * dRAID which is not its parent it should fail with ENOTSUP.
 	 */
 	if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
@@ -3901,7 +3901,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
-		expected_error = EINVAL;
+		expected_error = ENOTSUP;
 	else
 		expected_error = 0;
 

From f66b57c87d6ea98144f0661926ee7d5d2bdb96ee Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 9 Jul 2025 17:38:32 -0400
Subject: [PATCH 17/72] CI: Switch from FreeBSD 13.4 to 13.5

FreeBSD 13.4 has been EOL since June 30, 2025.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Closes #17519
---
 .github/workflows/scripts/qemu-2-start.sh | 8 --------
 .github/workflows/zfs-qemu.yml            | 4 ++--
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 7e20a98c2fa..885a64037f8 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -71,14 +71,6 @@ case "$OS" in
     OSv="fedora-unknown"
     URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
     ;;
-  freebsd13-4r)
-    FreeBSD="13.4-RELEASE"
-    OSNAME="FreeBSD $FreeBSD"
-    OSv="freebsd13.0"
-    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
-    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
-    NIC="rtl8139"
-    ;;
   freebsd13-5r)
     FreeBSD="13.5-RELEASE"
     OSNAME="FreeBSD $FreeBSD"
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 035d8be7e22..ea17014a117 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -39,7 +39,7 @@ jobs:
       - name: Generate OS config and CI type
         id: os
         run: |
-          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
+          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
           QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
           # determine CI type when running on PR
           ci_type="full"
@@ -85,7 +85,7 @@ jobs:
         # debian:  debian11, debian12, ubuntu22, ubuntu24
         # misc:    archlinux, tumbleweed
         # FreeBSD variants of 2025-06:
-        # FreeBSD Release: freebsd13-4r, freebsd13-5r, freebsd14-1r, freebsd14-2r, freebsd14-3r
+        # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
         # FreeBSD Stable:  freebsd13-5s, freebsd14-3s
         # FreeBSD Current: freebsd15-0c
         os: ${{ fromJson(needs.test-config.outputs.test_os) }}

From d6dcae31660c3256964e763f32d623b782c4b8b2 Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Wed, 9 Jul 2025 23:40:32 +0200
Subject: [PATCH 18/72] ZTS: Fix FreeBSD 15.0 ksh errors

The ksh93 package has been replaced by ksh.
This also works on FreeBSD 13 and 14.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #17523
---
 .github/workflows/scripts/qemu-3-deps-vm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh
index a581b13c2f5..904fbfbf1e1 100755
--- a/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -51,7 +51,7 @@ function freebsd() {
 
   echo "##[group]Install Development Tools"
   sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \
-    gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \
+    gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \
     pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash
   sudo pkg install -xy \
     '^samba4[[:digit:]]+$' \

From 2461e6f636562421fb5d6362e3f3384f1835779d Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Fri, 11 Jul 2025 17:49:06 +0200
Subject: [PATCH 19/72] Delete unused .cirrus.yml

Cirrus CI was planned for testing FreeBSD, but I think it was never
really used. It is no longer needed, so remove it.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #17155
Closes #17535
---
 .cirrus.yml | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 .cirrus.yml

diff --git a/.cirrus.yml b/.cirrus.yml
deleted file mode 100644
index 366bb87fbb1..00000000000
--- a/.cirrus.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-env:
-  CIRRUS_CLONE_DEPTH: 1
-  ARCH: amd64
-
-build_task:
-  matrix:
-    freebsd_instance:
-      image_family: freebsd-13-5
-    freebsd_instance:
-      image_family: freebsd-14-2
-    freebsd_instance:
-      image_family: freebsd-15-0-snap
-  prepare_script:
-    - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py311-packaging py311-cffi py311-sysctl
-  configure_script:
-    - env MAKE=gmake ./autogen.sh
-    - env MAKE=gmake ./configure --with-config="user" --with-python=3.11
-  build_script:
-    - gmake -j `sysctl -n kern.smp.cpus`
-  install_script:
-    - gmake install

From 8de8e0df9fea1a787d10792f4ed3b3ff88b55043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= <attila@fueloep.org>
Date: Tue, 15 Jul 2025 00:10:02 +0200
Subject: [PATCH 20/72] objtool wrapper: use absolute path to call the wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Older kernel versions run make outside of the build directory. This
works since all paths are absolute. Relative paths will fail in such
a scenario.

Use an absolute path to the objtool wrapper as well, since the
relative path breaks the build on older kernels.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #17541
---
 module/Makefile.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/Makefile.in b/module/Makefile.in
index e9a26812176..859ba8649dd 100644
--- a/module/Makefile.in
+++ b/module/Makefile.in
@@ -57,7 +57,7 @@ modules-Linux:
 		$(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \
 		$(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \
 		$(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \
-		$(if @OBJTOOL_DISABLE_WERROR@,objtool=@top_builddir@/scripts/objtool-wrapper) \
+		$(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \
 		M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules
 
 modules-FreeBSD:

From fe3b2b76cf1a6e02a103e0a01413d5c3984c5705 Mon Sep 17 00:00:00 2001
From: Carl George <carlwgeorge@gmail.com>
Date: Tue, 15 Jul 2025 12:00:35 -0500
Subject: [PATCH 21/72] CI: Add CentOS Stream 9/10 to the FULL_OS runner list

Testing on CentOS Stream provides several months advance notice of
changes coming to the RHEL kernel.  This should help OpenZFS be
proactive instead of reactive to new RHEL minor versions.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Carl George <carlwgeorge@gmail.com>
ZFS-CI-Type: full
Closes #16904
Closes #17526
---
 .github/workflows/zfs-qemu.yml | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index ea17014a117..cda62031318 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -5,16 +5,6 @@ on:
   pull_request:
   workflow_dispatch:
     inputs:
-      include_stream9:
-        type: boolean
-        required: false
-        default: false
-        description: 'Test on CentOS 9 stream'
-      include_stream10:
-        type: boolean
-        required: false
-        default: false
-        description: 'Test on CentOS 10 stream'
       fedora_kernel_ver:
         type: string
         required: false
@@ -39,7 +29,7 @@ jobs:
       - name: Generate OS config and CI type
         id: os
         run: |
-          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
+          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
           QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
           # determine CI type when running on PR
           ci_type="full"
@@ -63,14 +53,6 @@ jobs:
               os_json=$(echo ${os_selection} | jq -c)
           fi
 
-          # Add optional runners
-          if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then
-            os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]')
-          fi
-          if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then
-            os_json=$(echo $os_json | jq -c '. += ["centos-stream10"]')
-          fi
-
           echo $os_json
           echo "os=$os_json" >> $GITHUB_OUTPUT
           echo "ci_type=$ci_type" >> $GITHUB_OUTPUT

From 3a494c6d2ac1cd42aba87d5d0bb1dd1cc8f1365f Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 12 Jul 2025 09:28:36 +1000
Subject: [PATCH 22/72] mod.h: make consistent across all three platforms

mod.h only exists to include the platform-specific mod_os.h, so we can
get rid of it and just call the platform header mod.h.

Then, create a libspl mod.h, and move the relevant items to it so we can
start building on it.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17537
---
 include/Makefile.am                           |  1 -
 include/os/freebsd/Makefile.am                |  2 +-
 .../os/freebsd/spl/sys/{mod_os.h => mod.h}    |  0
 include/os/linux/Makefile.am                  |  2 +-
 include/os/linux/spl/sys/{mod_os.h => mod.h}  |  0
 include/sys/mod.h                             | 36 -----------------
 include/sys/vdev_impl.h                       |  2 +-
 include/sys/zfs_context.h                     | 12 ------
 lib/libspl/include/Makefile.am                |  1 +
 lib/libspl/include/sys/mod.h                  | 39 +++++++++++++++++++
 scripts/spdxcheck.pl                          |  1 -
 11 files changed, 43 insertions(+), 53 deletions(-)
 rename include/os/freebsd/spl/sys/{mod_os.h => mod.h} (100%)
 rename include/os/linux/spl/sys/{mod_os.h => mod.h} (100%)
 delete mode 100644 include/sys/mod.h
 create mode 100644 lib/libspl/include/sys/mod.h

diff --git a/include/Makefile.am b/include/Makefile.am
index a9258deabfd..a0427ae6a47 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -69,7 +69,6 @@ COMMON_H = \
 	sys/metaslab_impl.h \
 	sys/mmp.h \
 	sys/mntent.h \
-	sys/mod.h \
 	sys/multilist.h \
 	sys/nvpair.h \
 	sys/nvpair_impl.h \
diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am
index d975c4fe69f..d6b6923d033 100644
--- a/include/os/freebsd/Makefile.am
+++ b/include/os/freebsd/Makefile.am
@@ -33,7 +33,7 @@ noinst_HEADERS = \
 	%D%/spl/sys/list_impl.h \
 	%D%/spl/sys/lock.h \
 	%D%/spl/sys/misc.h \
-	%D%/spl/sys/mod_os.h \
+	%D%/spl/sys/mod.h \
 	%D%/spl/sys/mode.h \
 	%D%/spl/sys/mount.h \
 	%D%/spl/sys/mutex.h \
diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod.h
similarity index 100%
rename from include/os/freebsd/spl/sys/mod_os.h
rename to include/os/freebsd/spl/sys/mod.h
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index 4fe6705defe..e156ca183db 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -75,7 +75,7 @@ kernel_spl_sys_HEADERS = \
 	%D%/spl/sys/kstat.h \
 	%D%/spl/sys/list.h \
 	%D%/spl/sys/misc.h \
-	%D%/spl/sys/mod_os.h \
+	%D%/spl/sys/mod.h \
 	%D%/spl/sys/mutex.h \
 	%D%/spl/sys/param.h \
 	%D%/spl/sys/proc.h \
diff --git a/include/os/linux/spl/sys/mod_os.h b/include/os/linux/spl/sys/mod.h
similarity index 100%
rename from include/os/linux/spl/sys/mod_os.h
rename to include/os/linux/spl/sys/mod.h
diff --git a/include/sys/mod.h b/include/sys/mod.h
deleted file mode 100644
index 4122889ab75..00000000000
--- a/include/sys/mod.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
- *
- *  This file is part of the SPL, Solaris Porting Layer.
- *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef _SYS_MOD_H
-#define	_SYS_MOD_H
-
-#ifdef _KERNEL
-#include <sys/mod_os.h>
-#else
-/*
- * Exported symbols
- */
-#define	EXPORT_SYMBOL(x)
-#endif
-
-#endif /* SYS_MOD_H */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 385d7224f2c..c925eb490cd 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -645,7 +645,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
 int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
 void vdev_metaslab_group_create(vdev_t *vd);
 uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
-#if defined(__linux__)
+#if defined(__linux__) && defined(_KERNEL)
 int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
 #endif
 int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 256c9c2cc2d..e155f2daa39 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -204,18 +204,6 @@ extern void vpanic(const char *, va_list)
 #endif	/* DTRACE_PROBE4 */
 #define	DTRACE_PROBE4(a, b, c, d, e, f, g, h, i)
 
-/*
- * Tunables.
- */
-typedef struct zfs_kernel_param {
-	const char *name;	/* unused stub */
-} zfs_kernel_param_t;
-
-#define	ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc)
-#define	ZFS_MODULE_PARAM_ARGS void
-#define	ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \
-	getfunc, perm, desc)
-
 /*
  * Threads.
  */
diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am
index 8c286142f29..e17119e968e 100644
--- a/lib/libspl/include/Makefile.am
+++ b/lib/libspl/include/Makefile.am
@@ -45,6 +45,7 @@ libspl_sys_HEADERS = \
 	%D%/sys/list_impl.h \
 	%D%/sys/mhd.h \
 	%D%/sys/mkdev.h \
+	%D%/sys/mod.h \
 	%D%/sys/policy.h \
 	%D%/sys/poll.h \
 	%D%/sys/priv.h \
diff --git a/lib/libspl/include/sys/mod.h b/lib/libspl/include/sys/mod.h
new file mode 100644
index 00000000000..b1a39e91309
--- /dev/null
+++ b/lib/libspl/include/sys/mod.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_MOD_H
+#define	_SYS_MOD_H
+
+#define	ZFS_MODULE_PARAM(scope, prefix, name, type, perm, desc)
+#define	ZFS_MODULE_PARAM_ARGS void
+#define	ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \
+	getfunc, perm, desc)
+
+#define	EXPORT_SYMBOL(x)
+
+#endif
diff --git a/scripts/spdxcheck.pl b/scripts/spdxcheck.pl
index 47128402f7b..88f5a235d70 100755
--- a/scripts/spdxcheck.pl
+++ b/scripts/spdxcheck.pl
@@ -253,7 +253,6 @@ my %override_file_license_tags = (
 	'GPL-2.0-or-later' => [qw(
 		include/os/freebsd/spl/sys/kstat.h
 		include/os/freebsd/spl/sys/sunddi.h
-		include/sys/mod.h
 	)],
 	'CDDL-1.0' => [qw(
 		include/os/linux/spl/sys/errno.h

From 967ce75669254851da443aeb927242c3d12d3b46 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 12 Jul 2025 11:31:17 +1000
Subject: [PATCH 23/72] libspl: implement ZFS_MODULE_PARAM for userspace

For each tunable declaration, we create a zfs_tunable_t with its
details, and then a pointer to it in the 'zfs_tunables' ELF section,
that we can access later with a little support from the linker.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17537
---
 lib/libspl/Makefile.am            |  1 +
 lib/libspl/include/Makefile.am    |  1 +
 lib/libspl/include/sys/mod.h      | 19 +++++++-
 lib/libspl/include/sys/tunables.h | 52 ++++++++++++++++++++++
 lib/libspl/tunables.c             | 72 +++++++++++++++++++++++++++++++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 lib/libspl/include/sys/tunables.h
 create mode 100644 lib/libspl/tunables.c

diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index f8943572bf2..6640ecd582a 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -20,6 +20,7 @@ libspl_la_SOURCES = \
 	%D%/strlcat.c \
 	%D%/strlcpy.c \
 	%D%/timestamp.c \
+	%D%/tunables.c \
 	%D%/include/sys/list.h \
 	%D%/include/sys/list_impl.h
 
diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am
index e17119e968e..21f0c70db9e 100644
--- a/lib/libspl/include/Makefile.am
+++ b/lib/libspl/include/Makefile.am
@@ -59,6 +59,7 @@ libspl_sys_HEADERS = \
 	%D%/sys/time.h \
 	%D%/sys/trace_spl.h \
 	%D%/sys/trace_zfs.h \
+	%D%/sys/tunables.h \
 	%D%/sys/types.h \
 	%D%/sys/types32.h \
 	%D%/sys/uio.h \
diff --git a/lib/libspl/include/sys/mod.h b/lib/libspl/include/sys/mod.h
index b1a39e91309..ad19b6607a4 100644
--- a/lib/libspl/include/sys/mod.h
+++ b/lib/libspl/include/sys/mod.h
@@ -24,12 +24,29 @@
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 #ifndef _SYS_MOD_H
 #define	_SYS_MOD_H
 
-#define	ZFS_MODULE_PARAM(scope, prefix, name, type, perm, desc)
+#include <sys/tunables.h>
+
+#define	ZFS_MODULE_PARAM(scope, prefix, name, type, perm, desc)		\
+	static const zfs_tunable_t _zfs_tunable_##prefix##name = {	\
+		.zt_name = #prefix#name,				\
+		.zt_varp = &prefix##name,				\
+		.zt_varsz = sizeof (prefix##name),			\
+		.zt_type = ZFS_TUNABLE_TYPE_##type,			\
+		.zt_perm = ZFS_TUNABLE_PERM_##perm,			\
+		.zt_desc = desc						\
+	};								\
+	static const zfs_tunable_t *					\
+	__zfs_tunable_##prefix##name					\
+	__attribute__((__section__("zfs_tunables")))			\
+	__attribute__((__used__))					\
+	= &_zfs_tunable_##prefix##name;
+
 #define	ZFS_MODULE_PARAM_ARGS void
 #define	ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \
 	getfunc, perm, desc)
diff --git a/lib/libspl/include/sys/tunables.h b/lib/libspl/include/sys/tunables.h
new file mode 100644
index 00000000000..9c7036791ee
--- /dev/null
+++ b/lib/libspl/include/sys/tunables.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
+#ifndef _SYS_TUNABLES_H
+#define	_SYS_TUNABLES_H
+
+typedef enum {
+	ZFS_TUNABLE_TYPE_INT,
+	ZFS_TUNABLE_TYPE_UINT,
+	ZFS_TUNABLE_TYPE_ULONG,
+	ZFS_TUNABLE_TYPE_U64,
+	ZFS_TUNABLE_TYPE_STRING,
+} zfs_tunable_type_t;
+
+typedef enum {
+	ZFS_TUNABLE_PERM_ZMOD_RW,
+	ZFS_TUNABLE_PERM_ZMOD_RD,
+} zfs_tunable_perm_t;
+
+typedef struct zfs_tunable {
+	const char		*zt_name;
+	void			*zt_varp;
+	size_t			zt_varsz;
+	zfs_tunable_type_t	zt_type;
+	zfs_tunable_perm_t	zt_perm;
+	const char		*zt_desc;
+} zfs_tunable_t;
+
+#endif
diff --git a/lib/libspl/tunables.c b/lib/libspl/tunables.c
new file mode 100644
index 00000000000..8ad74fc95ef
--- /dev/null
+++ b/lib/libspl/tunables.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
+#include <stddef.h>
+#include <sys/tunables.h>
+
+/*
+ * Userspace tunables.
+ *
+ * Tunables are external pointers to global variables that are wired up to the
+ * host environment in some way that allows the operator to directly change
+ * their values "under the hood".
+ *
+ * In userspace, the "host environment" is the program using libzpool.so. So
+ * that it can manipulate tunables if it wants, we provide an API to access
+ * them.
+ *
+ * Tunables are declared through the ZFS_MODULE_PARAM* macros, which associate
+ * a global variable with some metadata we can use to describe and access the
+ * tunable. This is done by creating a uniquely-named zfs_tunable_t.
+ *
+ * At runtime, we need a way to discover these zfs_tunable_t items. Since they
+ * are declared globally, all over the codebase, there's no central place to
+ * record or list them. So, we take advantage of the compiler's "linker set"
+ * feature.
+ *
+ * In the ZFS_MODULE_PARAM macro, after we create the zfs_tunable_t, we also
+ * create a zfs_tunable_t* pointing to it. That pointer is forced into the
+ * "zfs_tunables" ELF section in compiled object. At link time, the linker will
+ * collect all these pointers into one single big "zfs_tunables" section, and
+ * will generate two new symbols in the final object: __start_zfs_tunables and
+ * __stop_zfs_tunables. These mark the first item in that section and the
+ * position just past the last one, which allows us to access the pointers in
+ * that section like an array, and through those pointers access the tunable
+ * metadata, and from there the actual C variable that the tunable describes.
+ */
+
+extern const zfs_tunable_t *__start_zfs_tunables;
+extern const zfs_tunable_t *__stop_zfs_tunables;
+
+/*
+ * Because there are no tunables in libspl itself, the above symbols will not
+ * be generated, which will stop libspl being linked at all. To work around
+ * that, we force a symbol into that section, and then when iterating, skip
+ * any NULL pointers.
+ */
+static void *__zfs_tunable__placeholder
+	__attribute__((__section__("zfs_tunables")))
+	__attribute__((__used__)) = NULL;

From cb9742e5328cd34844b3e85708b355b4a87f9015 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 12 Jul 2025 11:32:44 +1000
Subject: [PATCH 24/72] libspl: add API for manipulating tunables

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17537
---
 lib/libspl/include/sys/tunables.h |   8 +
 lib/libspl/tunables.c             | 247 ++++++++++++++++++++++++++++++
 lib/libuutil/libuutil.abi         | 196 +++++++++++++++++++++++-
 lib/libzfs/libzfs.abi             | 204 +++++++++++++++++++++++-
 lib/libzfs_core/libzfs_core.abi   | 206 ++++++++++++++++++++++++-
 5 files changed, 843 insertions(+), 18 deletions(-)

diff --git a/lib/libspl/include/sys/tunables.h b/lib/libspl/include/sys/tunables.h
index 9c7036791ee..5d9bb3d71a4 100644
--- a/lib/libspl/include/sys/tunables.h
+++ b/lib/libspl/include/sys/tunables.h
@@ -49,4 +49,12 @@ typedef struct zfs_tunable {
 	const char		*zt_desc;
 } zfs_tunable_t;
 
+int zfs_tunable_set(const zfs_tunable_t *tunable, const char *val);
+int zfs_tunable_get(const zfs_tunable_t *tunable, char *val, size_t valsz);
+
+const zfs_tunable_t *zfs_tunable_lookup(const char *name);
+
+typedef int (*zfs_tunable_iter_t)(const zfs_tunable_t *tunable, void *arg);
+void zfs_tunable_iter(zfs_tunable_iter_t cb, void *arg);
+
 #endif
diff --git a/lib/libspl/tunables.c b/lib/libspl/tunables.c
index 8ad74fc95ef..67dc9710dee 100644
--- a/lib/libspl/tunables.c
+++ b/lib/libspl/tunables.c
@@ -25,6 +25,12 @@
  */
 
 #include <stddef.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <limits.h>
+#include <inttypes.h>
 #include <sys/tunables.h>
 
 /*
@@ -70,3 +76,244 @@ extern const zfs_tunable_t *__stop_zfs_tunables;
 static void *__zfs_tunable__placeholder
 	__attribute__((__section__("zfs_tunables")))
 	__attribute__((__used__)) = NULL;
+
+/*
+ * Find the named tunable by walking through the linker set and comparing
+ * names, as described above. This is not particularly efficient but it's a
+ * fairly rare task, so it shouldn't be a big deal.
+ */
+const zfs_tunable_t *
+zfs_tunable_lookup(const char *name)
+{
+	for (const zfs_tunable_t **ztp = &__start_zfs_tunables;
+	    ztp != &__stop_zfs_tunables; ztp++) {
+		const zfs_tunable_t *zt = *ztp;
+		if (zt == NULL)
+			continue;
+		if (strcmp(name, zt->zt_name) == 0)
+			return (zt);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Like zfs_tunable_lookup, but call the provided callback for each tunable.
+ */
+void
+zfs_tunable_iter(zfs_tunable_iter_t cb, void *arg)
+{
+	for (const zfs_tunable_t **ztp = &__start_zfs_tunables;
+	    ztp != &__stop_zfs_tunables; ztp++) {
+		const zfs_tunable_t *zt = *ztp;
+		if (zt == NULL)
+			continue;
+		if (cb(zt, arg))
+			return;
+	}
+}
+
+/*
+ * Parse a string into an int or uint. It's easier to have a pair of "generic"
+ * functions that range-check against a given min and max rather than have
+ * multiple functions for each width of type.
+ */
+static int
+zfs_tunable_parse_int(const char *val, intmax_t *np,
+    intmax_t min, intmax_t max)
+{
+	intmax_t n;
+	char *end;
+	errno = 0;
+	n = strtoimax(val, &end, 0);
+	if (errno != 0)
+		return (errno);
+	if (*end != '\0')
+		return (EINVAL);
+	if (n < min || n > max)
+		return (ERANGE);
+	*np = n;
+	return (0);
+}
+
+static int
+zfs_tunable_parse_uint(const char *val, uintmax_t *np,
+    uintmax_t min, uintmax_t max)
+{
+	uintmax_t n;
+	char *end;
+	errno = 0;
+	n = strtoumax(val, &end, 0);
+	if (errno != 0)
+		return (errno);
+	if (*end != '\0')
+		return (EINVAL);
+	if (strchr(val, '-'))
+		return (ERANGE);
+	if (n < min || n > max)
+		return (ERANGE);
+	*np = n;
+	return (0);
+}
+
+/*
+ * Set helpers for each tunable type. Parses the string, and if it produces a
+ * valid value for the tunable, sets it. No effort is made to make sure the
+ * tunable is of the right type; that's done in zfs_tunable_set() below.
+ */
+static int
+zfs_tunable_set_int(const zfs_tunable_t *zt, const char *val)
+{
+	intmax_t n;
+	int err = zfs_tunable_parse_int(val, &n, INT_MIN, INT_MAX);
+	if (err != 0)
+		return (err);
+	*(int *)zt->zt_varp = n;
+	return (0);
+}
+
+static int
+zfs_tunable_set_uint(const zfs_tunable_t *zt, const char *val)
+{
+	uintmax_t n;
+	int err = zfs_tunable_parse_uint(val, &n, 0, UINT_MAX);
+	if (err != 0)
+		return (err);
+	*(unsigned int *)zt->zt_varp = n;
+	return (0);
+}
+
+static int
+zfs_tunable_set_ulong(const zfs_tunable_t *zt, const char *val)
+{
+	uintmax_t n;
+	int err = zfs_tunable_parse_uint(val, &n, 0, ULONG_MAX);
+	if (err != 0)
+		return (err);
+	*(unsigned long *)zt->zt_varp = n;
+	return (0);
+}
+
+static int
+zfs_tunable_set_u64(const zfs_tunable_t *zt, const char *val)
+{
+	uintmax_t n;
+	int err = zfs_tunable_parse_uint(val, &n, 0, UINT64_MAX);
+	if (err != 0)
+		return (err);
+	*(uint64_t *)zt->zt_varp = n;
+	return (0);
+}
+
+static int
+zfs_tunable_set_string(const zfs_tunable_t *zt, const char *val)
+{
+	(void) zt, (void) val;
+	/*
+	 * We can't currently handle strings. String tunables are pointers
+	 * into read-only memory, so we can update the pointer, but not the
+	 * contents. That would mean taking an allocation, but we don't have
+	 * an obvious place to free it.
+	 *
+	 * For now, it's no big deal as there's only a couple of string
+	 * tunables anyway.
+	 */
+	return (ENOTSUP);
+}
+
+/*
+ * Get helpers for each tunable type. Converts the value to a string if
+ * necessary and writes it into the provided buffer. The type is assumed to
+ * be correct; zfs_tunable_get() below will call the correct function for the
+ * type.
+ */
+static int
+zfs_tunable_get_int(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	snprintf(val, valsz, "%d", *(int *)zt->zt_varp);
+	return (0);
+}
+
+static int
+zfs_tunable_get_uint(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	snprintf(val, valsz, "%u", *(unsigned int *)zt->zt_varp);
+	return (0);
+}
+
+static int
+zfs_tunable_get_ulong(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	snprintf(val, valsz, "%lu", *(unsigned long *)zt->zt_varp);
+	return (0);
+}
+
+static int
+zfs_tunable_get_u64(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	snprintf(val, valsz, "%"PRIu64, *(uint64_t *)zt->zt_varp);
+	return (0);
+}
+
+static int
+zfs_tunable_get_string(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	strlcpy(val, *(char **)zt->zt_varp, valsz);
+	return (0);
+}
+
+/* The public set function. Delegates to the type-specific version. */
+int
+zfs_tunable_set(const zfs_tunable_t *zt, const char *val)
+{
+	int err;
+	switch (zt->zt_type) {
+	case ZFS_TUNABLE_TYPE_INT:
+		err = zfs_tunable_set_int(zt, val);
+		break;
+	case ZFS_TUNABLE_TYPE_UINT:
+		err = zfs_tunable_set_uint(zt, val);
+		break;
+	case ZFS_TUNABLE_TYPE_ULONG:
+		err = zfs_tunable_set_ulong(zt, val);
+		break;
+	case ZFS_TUNABLE_TYPE_U64:
+		err = zfs_tunable_set_u64(zt, val);
+		break;
+	case ZFS_TUNABLE_TYPE_STRING:
+		err = zfs_tunable_set_string(zt, val);
+		break;
+	default:
+		err = EOPNOTSUPP;
+		break;
+	}
+	return (err);
+}
+
+/* The public get function. Delegates to the type-specific version. */
+int
+zfs_tunable_get(const zfs_tunable_t *zt, char *val, size_t valsz)
+{
+	int err;
+	switch (zt->zt_type) {
+	case ZFS_TUNABLE_TYPE_INT:
+		err = zfs_tunable_get_int(zt, val, valsz);
+		break;
+	case ZFS_TUNABLE_TYPE_UINT:
+		err = zfs_tunable_get_uint(zt, val, valsz);
+		break;
+	case ZFS_TUNABLE_TYPE_ULONG:
+		err = zfs_tunable_get_ulong(zt, val, valsz);
+		break;
+	case ZFS_TUNABLE_TYPE_U64:
+		err = zfs_tunable_get_u64(zt, val, valsz);
+		break;
+	case ZFS_TUNABLE_TYPE_STRING:
+		err = zfs_tunable_get_string(zt, val, valsz);
+		break;
+	default:
+		err = EOPNOTSUPP;
+		break;
+	}
+	return (err);
+}
diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi
index 0052f0d47a7..744b5312762 100644
--- a/lib/libuutil/libuutil.abi
+++ b/lib/libuutil/libuutil.abi
@@ -244,6 +244,10 @@
     <elf-symbol name='uu_strerror' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='uu_strndup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='uu_zalloc' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
   </elf-function-symbols>
   <abi-instr address-size='64' path='lib/libspl/assert.c' language='LANG_C99'>
     <typedef-decl name='__pid_t' type-id='95e97e5e' id='3629bad8'/>
@@ -612,7 +616,6 @@
     <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'>
       <subrange length='23' type-id='7359adad' id='fdd0f594'/>
     </array-type-def>
-    <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
     <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'>
       <subrange length='4' type-id='7359adad' id='16fe7105'/>
     </array-type-def>
@@ -978,8 +981,6 @@
   </abi-instr>
   <abi-instr address-size='64' path='lib/libspl/os/linux/gethostid.c' language='LANG_C99'>
     <type-decl name='long long unsigned int' size-in-bits='64' id='3a47d82b'/>
-    <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/>
-    <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
     <function-decl name='fclose' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='822cd80b'/>
       <return type-id='95e97e5e'/>
@@ -1019,6 +1020,13 @@
     <array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'>
       <subrange length='3' type-id='7359adad' id='56f209d2'/>
     </array-type-def>
+    <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
+      <subrange length='1' type-id='7359adad' id='52f813b4'/>
+    </array-type-def>
+    <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
+      <subrange length='12' type-id='7359adad' id='84827bdc'/>
+    </array-type-def>
+    <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
     <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/>
@@ -1053,6 +1061,93 @@
         <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/>
       </data-member>
     </class-decl>
+    <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
+    <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
+    <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
+    <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
+    <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
+    <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='96'>
+        <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='32'>
+        <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='160'>
+        <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='240'>
+        <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='320'>
+        <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='384'>
+        <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='448'>
+        <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='512'>
+        <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='640'>
+        <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='768'>
+        <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='896'>
+        <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1024'>
+        <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1056'>
+        <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1088'>
+        <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1120'>
+        <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1152'>
+        <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1216'>
+        <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1280'>
+        <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
+      </data-member>
+    </class-decl>
     <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/>
@@ -1142,6 +1237,8 @@
     <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
     <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/>
     <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/>
+    <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
+    <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
     <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='e75a27e9'/>
       <parameter type-id='3cad23cd'/>
@@ -1157,6 +1254,14 @@
       <parameter type-id='95e97e5e'/>
       <return type-id='26a90f95'/>
     </function-decl>
+    <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='f0981eeb'/>
+      <parameter type-id='31d265b7'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='__fprintf_chk' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='e75a27e9'/>
       <parameter type-id='95e97e5e'/>
@@ -1307,6 +1412,91 @@
       <return type-id='48b5725f'/>
     </function-decl>
   </abi-instr>
+  <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'>
+    <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/>
+    <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/>
+    <class-decl name='zfs_tunable' size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='zt_name' type-id='80f4b756' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='zt_type' type-id='f50b1525' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/>
+    <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/>
+    <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/>
+    <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/>
+    <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/>
+    <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/>
+    <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/>
+    <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
+    <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/>
+    <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/>
+    <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/>
+    <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='e104d842'/>
+    </function-decl>
+    <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='f8b828c9'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'>
+      <parameter type-id='80f4b756' name='name'/>
+      <return type-id='a27af98c'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='80f4b756' name='val'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='26a90f95' name='val'/>
+      <parameter type-id='b59d7dce' name='valsz'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
+    <function-type size-in-bits='64' id='92f86508'>
+      <parameter type-id='a27af98c'/>
+      <parameter type-id='eaa32e2f'/>
+      <return type-id='95e97e5e'/>
+    </function-type>
+  </abi-instr>
   <abi-instr address-size='64' path='lib/libuutil/uu_alloc.c' language='LANG_C99'>
     <type-decl name='char' size-in-bits='8' id='a84c031d'/>
     <type-decl name='unsigned int' size-in-bits='32' id='f0981eeb'/>
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index ecfd40efc42..06e74387f4b 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -451,6 +451,10 @@
     <elf-symbol name='zfs_strip_partition' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_strip_path' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_truncate_shares' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_type_to_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_unmount' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_unmountall' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1450,8 +1454,103 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'>
+    <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
+      <subrange length='1' type-id='7359adad' id='52f813b4'/>
+    </array-type-def>
+    <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
+      <subrange length='12' type-id='7359adad' id='84827bdc'/>
+    </array-type-def>
+    <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
+    <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
+    <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
+    <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
+    <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
+    <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='96'>
+        <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='32'>
+        <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='160'>
+        <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='240'>
+        <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='320'>
+        <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='384'>
+        <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='448'>
+        <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='512'>
+        <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='640'>
+        <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='768'>
+        <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='896'>
+        <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1024'>
+        <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1056'>
+        <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1088'>
+        <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1120'>
+        <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1152'>
+        <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1216'>
+        <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1280'>
+        <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
+      </data-member>
+    </class-decl>
     <pointer-type-def type-id='56fe4a37' size-in-bits='64' id='b6b61d2f'/>
     <qualified-type-def type-id='b6b61d2f' restrict='yes' id='3cad23cd'/>
+    <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
+    <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
     <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='e75a27e9'/>
       <parameter type-id='3cad23cd'/>
@@ -1463,6 +1562,14 @@
       <parameter type-id='822cd80b'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='f0981eeb'/>
+      <parameter type-id='31d265b7'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='lib/libspl/timestamp.c' language='LANG_C99'>
     <typedef-decl name='nl_item' type-id='95e97e5e' id='03b79a94'/>
@@ -1487,6 +1594,89 @@
       <return type-id='48b5725f'/>
     </function-decl>
   </abi-instr>
+  <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'>
+    <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/>
+    <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/>
+    <class-decl name='zfs_tunable' size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='zt_name' type-id='80f4b756' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='zt_type' type-id='f50b1525' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/>
+    <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/>
+    <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/>
+    <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/>
+    <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/>
+    <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/>
+    <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/>
+    <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/>
+    <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/>
+    <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='e104d842'/>
+    </function-decl>
+    <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='f8b828c9'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'>
+      <parameter type-id='80f4b756' name='name'/>
+      <return type-id='a27af98c'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='80f4b756' name='val'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='26a90f95' name='val'/>
+      <parameter type-id='b59d7dce' name='valsz'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
+    <function-type size-in-bits='64' id='92f86508'>
+      <parameter type-id='a27af98c'/>
+      <parameter type-id='eaa32e2f'/>
+      <return type-id='95e97e5e'/>
+    </function-type>
+  </abi-instr>
   <abi-instr address-size='64' path='lib/libtpool/thread_pool.c' language='LANG_C99'>
     <array-type-def dimensions='1' type-id='49ef3ffd' size-in-bits='1024' id='a14403f5'>
       <subrange length='16' type-id='7359adad' id='848d0938'/>
@@ -4135,13 +4325,6 @@
       <parameter type-id='58603c44'/>
       <return type-id='9c313c2d'/>
     </function-decl>
-    <function-decl name='zpool_prop_get_feature' mangled-name='zpool_prop_get_feature' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_get_feature'>
-      <parameter type-id='4c81de99'/>
-      <parameter type-id='80f4b756'/>
-      <parameter type-id='26a90f95'/>
-      <parameter type-id='b59d7dce'/>
-      <return type-id='95e97e5e'/>
-    </function-decl>
     <function-decl name='zfs_iter_snapshots_v2' mangled-name='zfs_iter_snapshots_v2' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_iter_snapshots_v2'>
       <parameter type-id='9200a744'/>
       <parameter type-id='95e97e5e'/>
@@ -6630,6 +6813,13 @@
       <parameter type-id='e4378506' name='plp'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_prop_get_feature' mangled-name='zpool_prop_get_feature' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_get_feature'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='80f4b756' name='propname'/>
+      <parameter type-id='26a90f95' name='buf'/>
+      <parameter type-id='b59d7dce' name='len'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_get_state' mangled-name='zpool_get_state' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_get_state'>
       <parameter type-id='4c81de99' name='zhp'/>
       <return type-id='95e97e5e'/>
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 63904f447e8..2af20894853 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -222,6 +222,10 @@
     <elf-symbol name='spl_pagesize' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='strlcat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='strlcpy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
   </elf-function-symbols>
   <abi-instr address-size='64' path='lib/libspl/assert.c' language='LANG_C99'>
     <class-decl name='__va_list_tag' size-in-bits='192' is-struct='yes' visibility='default' id='d5027220'>
@@ -613,7 +617,6 @@
     <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'>
       <subrange length='23' type-id='7359adad' id='fdd0f594'/>
     </array-type-def>
-    <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
     <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'>
       <subrange length='4' type-id='7359adad' id='16fe7105'/>
     </array-type-def>
@@ -974,8 +977,6 @@
   </abi-instr>
   <abi-instr address-size='64' path='lib/libspl/os/linux/gethostid.c' language='LANG_C99'>
     <type-decl name='long long unsigned int' size-in-bits='64' id='3a47d82b'/>
-    <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/>
-    <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
     <function-decl name='strtoull' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='9d26089a'/>
       <parameter type-id='8c85230f'/>
@@ -987,6 +988,13 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'>
+    <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'>
+      <subrange length='1' type-id='7359adad' id='52f813b4'/>
+    </array-type-def>
+    <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'>
+      <subrange length='12' type-id='7359adad' id='84827bdc'/>
+    </array-type-def>
+    <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/>
     <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/>
@@ -1021,6 +1029,93 @@
         <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/>
       </data-member>
     </class-decl>
+    <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/>
+    <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/>
+    <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/>
+    <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/>
+    <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/>
+    <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='tv_sec' type-id='49659421' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='96'>
+        <var-decl name='__reserved' type-id='3158a266' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='32'>
+        <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='160'>
+        <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='stx_mode' type-id='d315442e' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='240'>
+        <var-decl name='__spare0' type-id='811205dc' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='stx_ino' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='320'>
+        <var-decl name='stx_size' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='384'>
+        <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='448'>
+        <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='512'>
+        <var-decl name='stx_atime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='640'>
+        <var-decl name='stx_btime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='768'>
+        <var-decl name='stx_ctime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='896'>
+        <var-decl name='stx_mtime' type-id='94101016' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1024'>
+        <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1056'>
+        <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1088'>
+        <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1120'>
+        <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1152'>
+        <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1216'>
+        <var-decl name='__spare2' type-id='d3130597' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1280'>
+        <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/>
+      </data-member>
+    </class-decl>
     <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/>
@@ -1096,6 +1191,8 @@
     <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
     <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/>
     <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/>
+    <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/>
+    <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/>
     <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='e75a27e9'/>
       <parameter type-id='3cad23cd'/>
@@ -1107,15 +1204,18 @@
       <parameter type-id='822cd80b'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='strcmp' visibility='default' binding='global' size-in-bits='64'>
-      <parameter type-id='80f4b756'/>
-      <parameter type-id='80f4b756'/>
-      <return type-id='95e97e5e'/>
-    </function-decl>
     <function-decl name='strerror' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='95e97e5e'/>
       <return type-id='26a90f95'/>
     </function-decl>
+    <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='95e97e5e'/>
+      <parameter type-id='f0981eeb'/>
+      <parameter type-id='31d265b7'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='stat64' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='9d26089a'/>
       <parameter type-id='f1cadedf'/>
@@ -1258,6 +1358,96 @@
       <return type-id='48b5725f'/>
     </function-decl>
   </abi-instr>
+  <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'>
+    <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/>
+      <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/>
+    <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/>
+      <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/>
+    </enum-decl>
+    <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/>
+    <class-decl name='zfs_tunable' size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='zt_name' type-id='80f4b756' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='64'>
+        <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='128'>
+        <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='192'>
+        <var-decl name='zt_type' type-id='f50b1525' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='224'>
+        <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='256'>
+        <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/>
+    <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/>
+    <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/>
+    <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/>
+    <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/>
+    <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/>
+    <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/>
+    <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
+    <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/>
+    <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/>
+    <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/>
+    <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='e104d842'/>
+    </function-decl>
+    <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='8c85230f'/>
+      <parameter type-id='95e97e5e'/>
+      <return type-id='f8b828c9'/>
+    </function-decl>
+    <function-decl name='strcmp' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='80f4b756'/>
+      <parameter type-id='80f4b756'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'>
+      <parameter type-id='80f4b756' name='name'/>
+      <return type-id='a27af98c'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='80f4b756' name='val'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'>
+      <parameter type-id='a27af98c' name='zt'/>
+      <parameter type-id='26a90f95' name='val'/>
+      <parameter type-id='b59d7dce' name='valsz'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
+    <function-type size-in-bits='64' id='92f86508'>
+      <parameter type-id='a27af98c'/>
+      <parameter type-id='eaa32e2f'/>
+      <return type-id='95e97e5e'/>
+    </function-type>
+  </abi-instr>
   <abi-instr address-size='64' path='lib/libzfs_core/libzfs_core.c' language='LANG_C99'>
     <array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'>
       <subrange length='3' type-id='7359adad' id='56f209d2'/>

From fce18e04d5300026df9224ad9edb2c62ecefe27c Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 12 Jul 2025 11:33:19 +1000
Subject: [PATCH 25/72] libzpool: tunable-based option interface for zdb/ztest

Remove the old dlsym()-based option setter and add a new function,
handle_tunable_option(), that can set, get and list all the tunables
in the system. Then wire it up to zdb and ztest.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17537
---
 cmd/zdb/zdb.c             |  16 +++-
 cmd/ztest.c               |   6 +-
 include/sys/zfs_context.h |   2 +-
 lib/libzpool/Makefile.am  |   2 +-
 lib/libzpool/util.c       | 188 +++++++++++++++++++++-----------------
 man/man1/ztest.1          |   8 +-
 man/man8/zdb.8            |  13 ++-
 7 files changed, 133 insertions(+), 102 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index d6f144c0e20..037a7681a31 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -797,8 +797,8 @@ usage(void)
 	    "[default is 200]\n");
 	(void) fprintf(stderr, "        -K --key=KEY                 "
 	    "decryption key for encrypted dataset\n");
-	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
-	    "set global variable to an unsigned 32-bit integer\n");
+	(void) fprintf(stderr, "        -o --option=\"NAME=VALUE\" "
+	    "set the named tunable to the given value\n");
 	(void) fprintf(stderr, "        -p --path==PATH              "
 	    "use one or more with -e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -P --parseable               "
@@ -9377,9 +9377,11 @@ main(int argc, char **argv)
 			while (*optarg != '\0') { *optarg++ = '*'; }
 			break;
 		case 'o':
-			error = set_global_var(optarg);
+			dump_opt[c]++;
+			dump_all = 0;
+			error = handle_tunable_option(optarg, B_FALSE);
 			if (error != 0)
-				usage();
+				zdb_exit(1);
 			break;
 		case 'p':
 			if (searchdirs == NULL) {
@@ -9545,6 +9547,12 @@ main(int argc, char **argv)
 			error = 0;
 			goto fini;
 		}
+		if (dump_opt['o'])
+			/*
+			 * Avoid blasting tunable options off the top of the
+			 * screen.
+			 */
+			zdb_exit(1);
 		usage();
 	}
 
diff --git a/cmd/ztest.c b/cmd/ztest.c
index e334641fef1..ec1efd638f1 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -809,8 +809,8 @@ static ztest_option_t option_table[] = {
 	{ 'X', "raidz-expansion", NULL,
 	    "Perform a dedicated raidz expansion test",
 	    NO_DEFAULT, NULL},
-	{ 'o',	"option", "\"OPTION=INTEGER\"",
-	    "Set global variable to an unsigned 32-bit integer value",
+	{ 'o',	"option", "\"NAME=VALUE\"",
+	    "Set the named tunable to the given value",
 	    NO_DEFAULT, NULL},
 	{ 'G',	"dump-debug-msg", NULL,
 	    "Dump zfs_dbgmsg buffer before exiting due to an error",
@@ -7069,7 +7069,7 @@ ztest_set_global_vars(void)
 		char *kv = ztest_opts.zo_gvars[i];
 		VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN);
 		VERIFY3U(strlen(kv), >, 0);
-		int err = set_global_var(kv);
+		int err = handle_tunable_option(kv, B_TRUE);
 		if (ztest_opts.zo_verbose > 0) {
 			(void) printf("setting global var %s ... %s\n", kv,
 			    err ? "failed" : "ok");
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index e155f2daa39..31edab919f0 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -661,7 +661,7 @@ extern void random_fini(void);
 
 struct spa;
 extern void show_pool_stats(struct spa *);
-extern int set_global_var(char const *arg);
+extern int handle_tunable_option(const char *, boolean_t);
 
 typedef struct callb_cpr {
 	kmutex_t	*cc_lockp;
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 8553b377a76..5cdb6a3eb24 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -199,7 +199,7 @@ libzpool_la_LIBADD = \
 	libzstd.la \
 	libzutil.la
 
-libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm
+libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -lm
 
 libzpool_la_LDFLAGS = -pthread
 
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index a297daedbd4..1d0d1a1e56d 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -36,7 +36,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_refcount.h>
 #include <sys/zfs_ioctl.h>
-#include <dlfcn.h>
+#include <sys/tunables.h>
 #include <libzutil.h>
 
 /*
@@ -151,97 +151,119 @@ show_pool_stats(spa_t *spa)
 	nvlist_free(config);
 }
 
-/* *k_out must be freed by the caller */
-static int
-set_global_var_parse_kv(const char *arg, char **k_out, u_longlong_t *v_out)
-{
-	int err;
-	VERIFY(arg);
-	char *d = strdup(arg);
-
-	char *save = NULL;
-	char *k = strtok_r(d, "=", &save);
-	char *v_str = strtok_r(NULL, "=", &save);
-	char *follow = strtok_r(NULL, "=", &save);
-	if (k == NULL || v_str == NULL || follow != NULL) {
-		err = EINVAL;
-		goto err_free;
-	}
-
-	u_longlong_t val = strtoull(v_str, NULL, 0);
-	if (val > UINT32_MAX) {
-		fprintf(stderr, "Value for global variable '%s' must "
-		    "be a 32-bit unsigned integer, got '%s'\n", k, v_str);
-		err = EOVERFLOW;
-		goto err_free;
-	}
-
-	*k_out = strdup(k);
-	*v_out = val;
-	free(d);
-	return (0);
-
-err_free:
-	free(d);
-
-	return (err);
-}
-
 /*
- * Sets given global variable in libzpool to given unsigned 32-bit value.
- * arg: "<variable>=<value>"
+ * Common helper for working with libzpool tunables from the command line.
+ *
+ * Valid inputs:
+ *
+ *   <name>		show named tunable and value
+ *   <name>=<value>	set tunable value
+ *
+ *   show		show all tunables and values
+ *   show=<name>	show named tunable and value
+ *   info		show info about all tunables
+ *   info=<name>	show info about named tunable
  */
-int
-set_global_var(char const *arg)
+
+typedef enum { SHOW, INFO, SET } tunable_mode_t;
+
+static int
+list_tunables_cb(const zfs_tunable_t *tunable, void *arg)
 {
-	void *zpoolhdl;
-	char *varname;
-	u_longlong_t val;
-	int ret;
+	const tunable_mode_t *mode = arg;
 
-#ifndef _ZFS_LITTLE_ENDIAN
-	/*
-	 * On big endian systems changing a 64-bit variable would set the high
-	 * 32 bits instead of the low 32 bits, which could cause unexpected
-	 * results.
-	 */
-	fprintf(stderr, "Setting global variables is only supported on "
-	    "little-endian systems\n");
-	ret = ENOTSUP;
-	goto out_ret;
-#endif
-
-	if ((ret = set_global_var_parse_kv(arg, &varname, &val)) != 0) {
-		goto out_ret;
-	}
-
-	zpoolhdl = dlopen("libzpool.so", RTLD_LAZY);
-	if (zpoolhdl != NULL) {
-		uint32_t *var;
-		var = dlsym(zpoolhdl, varname);
-		if (var == NULL) {
-			fprintf(stderr, "Global variable '%s' does not exist "
-			    "in libzpool.so\n", varname);
-			ret = EINVAL;
-			goto out_dlclose;
-		}
-		*var = (uint32_t)val;
+	static const char *type[] = {
+		"int", "uint", "ulong", "u64", "str",
+	};
+	static const char *perm[] = {
+		"rw", "rd",
+	};
 
+	if (*mode == SHOW) {
+		char val[64];
+		int err = zfs_tunable_get(tunable, val, sizeof (val));
+		if (err == 0)
+			printf("%s: %s\n", tunable->zt_name, val);
+		else
+			printf("%s: [error getting tunable value: %s]\n",
+			    tunable->zt_name, strerror(err));
 	} else {
-		fprintf(stderr, "Failed to open libzpool.so to set global "
-		    "variable\n");
-		ret = EIO;
-		goto out_free;
+		printf("%s [%s %s]: %s\n", tunable->zt_name,
+		    type[tunable->zt_type], perm[tunable->zt_perm],
+		    tunable->zt_desc);
 	}
 
-	ret = 0;
+	return (0);
+}
+int
+handle_tunable_option(const char *_arg, boolean_t quiet)
+{
+	int err = 0;
+	char *arg = strdup(_arg);
+	char *k, *v;
 
-out_dlclose:
-	dlclose(zpoolhdl);
-out_free:
-	free(varname);
-out_ret:
-	return (ret);
+	v = arg;
+	k = strsep(&v, "=");
+
+	tunable_mode_t mode;
+
+	if (strcmp(k, "show") == 0) {
+		mode = SHOW;
+		k = v;
+	} else if (strcmp(k, "info") == 0) {
+		mode = INFO;
+		k = v;
+	} else if (v == NULL) {
+		mode = SHOW;
+	} else {
+		mode = SET;
+	}
+
+	if (quiet && mode != SET) {
+		err = EINVAL;
+		goto out;
+	}
+
+	if (mode == SET) {
+		const zfs_tunable_t *tunable = zfs_tunable_lookup(k);
+		if (tunable == NULL) {
+			err = ENOENT;
+			goto out;
+		}
+
+		char vold[256], vnew[256];
+		if (zfs_tunable_get(tunable, vold, sizeof (vold)) != 0)
+			strcpy(vold, "???");
+		err = zfs_tunable_set(tunable, v);
+		if (err != 0)
+			goto out;
+		if (zfs_tunable_get(tunable, vnew, sizeof (vnew)) != 0)
+			strcpy(vnew, "???");
+
+		if (!quiet)
+			printf("%s: %s -> %s\n", k, vold, vnew);
+	} else if (k != NULL) {
+		const zfs_tunable_t *tunable = zfs_tunable_lookup(k);
+		if (tunable == NULL) {
+			err = ENOENT;
+			goto out;
+		}
+		list_tunables_cb(tunable, &mode);
+	} else {
+		zfs_tunable_iter(list_tunables_cb, &mode);
+	}
+
+out:
+	if (!quiet) {
+		if (err == ENOENT)
+			fprintf(stderr, "no such tunable: %s\n", k);
+		else if (err != 0)
+			fprintf(stderr, "couldn't set tunable '%s': %s\n",
+			    k, strerror(err));
+	}
+
+	free(arg);
+	return (err);
 }
 
 static nvlist_t *
diff --git a/man/man1/ztest.1 b/man/man1/ztest.1
index 0cbb58e40dd..febbb62b166 100644
--- a/man/man1/ztest.1
+++ b/man/man1/ztest.1
@@ -188,12 +188,8 @@ i.e. given
 will be loaded.
 .It Fl C , -vdev-class-state Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy random No (default : Sy random  )
 The vdev allocation class state.
-.It Fl o , -option Ns = Ns Ar variable Ns = Ns Ar value
-Set global
-.Ar variable
-to an unsigned 32-bit integer
-.Ar value
-(little-endian only).
+.It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns …
+Set the given tunable to the provided value.
 .It Fl G , -dump-debug
 Dump zfs_dbgmsg buffer before exiting due to an error.
 .It Fl V , -verbose
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index 8bfd0dcdc38..3984aaac586 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -474,10 +474,15 @@ as it runs.
 Exercise extreme caution when using this option in shared or uncontrolled
 environments.
 .It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns …
-Set the given global libzpool variable to the provided value.
-The value must be an unsigned 32-bit integer.
-Currently only little-endian systems are supported to avoid accidentally setting
-the high 32 bits of 64-bit variables.
+Set the given tunable to the provided value.
+.It Fl o , -option Ns = Ns Ar var Ns …
+Show the value of the given tunable.
+.It Fl o , -option Ns = Ns show
+Show all tunables and their values.
+.It Fl o , -option Ns = Ns info Ns = Ns Ar var Ns …
+Show info about a tunable, including its name, type and description.
+.It Fl o , -option Ns = Ns info
+Show info about all tunables.
 .It Fl P , -parseable
 Print numbers in an unscaled form more amenable to parsing, e.g.\&
 .Sy 1000000

From 1b84bd1dff931c5217b856fe668a8e270d7efd78 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 12 Jul 2025 15:39:38 +1000
Subject: [PATCH 26/72] ZTS: test that zdb can work with libzpool tunables

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17537
---
 tests/runfiles/common.run                     |  3 +-
 tests/zfs-tests/tests/Makefile.am             |  1 +
 .../functional/cli_root/zdb/zdb_tunables.ksh  | 71 +++++++++++++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 214fa70fe58..3eda5d4d904 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -170,7 +170,8 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
     'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
     'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
-    'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
+    'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup',
+    'zdb_tunables']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 8813f262753..194ae4169e4 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -645,6 +645,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zdb/zdb_objset_id.ksh \
 	functional/cli_root/zdb/zdb_recover_2.ksh \
 	functional/cli_root/zdb/zdb_recover.ksh \
+	functional/cli_root/zdb/zdb_tunables.ksh \
 	functional/cli_root/zfs_bookmark/cleanup.ksh \
 	functional/cli_root/zfs_bookmark/setup.ksh \
 	functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh
new file mode 100755
index 00000000000..46965aa7cc3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh
@@ -0,0 +1,71 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+log_assert "zdb can work with libzpool tunables"
+
+# a tunable name by itself, or with the "show" command, produces name and value
+log_must eval 'zdb -o zfs_recover | grep -qE "^zfs_recover: 0$"'
+log_must eval 'zdb -o show=zfs_recover | grep -qE "^zfs_recover: 0$"'
+
+# info about a tunable shows a different format
+log_must eval 'zdb -o info=zfs_recover | grep -qE "^zfs_recover \[[[:alnum:]_]+ r[dw]]: .+"'
+
+# "show" by itself shows all the tunables and their values
+# this test limits output to 50 tunables, and then counts the number that match
+# the format, which should be all of them
+log_must test $(zdb -o show | head -50 | grep -cE "^[[:alnum:]_]+: .+") -eq 50
+
+# "info" by itself shows info about all tunables
+# like previous test, we limit and then count
+log_must test $(zdb -o info | head -50 | grep -cE "^[[:alnum:]_]+ \[[[:alnum:]_]+ r[dw]]: .+") -eq 50
+
+# can't lookup nonexistent tunables
+log_mustnot_expect 'no such tunable: hello' zdb -o hello
+log_mustnot_expect 'no such tunable: hello' zdb -o show=hello
+log_mustnot_expect 'no such tunable: hello' zdb -o info=hello
+
+# setting a tunable shows the old and the new value
+log_must eval 'zdb -o zfs_recover=1 | grep -qE "^zfs_recover: 0 -> 1$"'
+
+# replacing a value still sets it
+log_must eval 'zdb -o zfs_recover=0 | grep -qE "^zfs_recover: 0 -> 0$"'
+
+# can't set the "magic" commands
+log_mustnot_expect 'no such tunable: 0' zdb -o show=0
+log_mustnot_expect 'no such tunable: 1' zdb -o info=1
+
+# can set multiple in same command
+log_must eval 'zdb -o zfs_recover=1 -o zfs_flags=512 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_flags: 4294965758 -> 512$"'
+
+# can set and show in same command
+log_must eval 'zdb -o zfs_recover=1 -o zfs_recover -o zfs_recover=0 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_recover: 1 zfs_recover: 1 -> 0$"'
+
+log_pass "zdb can work with libzpool tunables"

From ee2a2d941aff6aa40f2ed6effb23f6e308825fc3 Mon Sep 17 00:00:00 2001
From: Mark Johnston <markj@FreeBSD.org>
Date: Sat, 28 Jun 2025 02:32:16 +0000
Subject: [PATCH 27/72] Revert "FreeBSD: zfs_putpages: don't undirty pages
 until after write completes"

This causes async putpages to leave the pages sbusied for a long time,
which hurts concurrency.  Revert for now until we have a better
approach.

This reverts commit 238eab7dc16932edbe9bcc990e8e5376bfe5b2ba.

Reported by:    Ihor Antonov <ngor@hugpoint.tech>
Discussed with: Rob Norris <rob.norris@klarasystems.com>

References: freebsd/freebsd-src@738a9a7
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Johnston <markj@FreeBSD.org>
Ported-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17533
---
 include/os/freebsd/spl/sys/vm.h      |  1 -
 module/os/freebsd/spl/spl_vm.c       |  1 -
 module/os/freebsd/zfs/zfs_vnops_os.c | 60 +++++++---------------------
 3 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h
index d36bee881d0..454078f0fe7 100644
--- a/include/os/freebsd/spl/sys/vm.h
+++ b/include/os/freebsd/spl/sys/vm.h
@@ -35,7 +35,6 @@
 extern const int zfs_vm_pagerret_bad;
 extern const int zfs_vm_pagerret_error;
 extern const int zfs_vm_pagerret_ok;
-extern const int zfs_vm_pagerret_pend;
 extern const int zfs_vm_pagerput_sync;
 extern const int zfs_vm_pagerput_inval;
 
diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c
index 9d5f025423a..733c2bd07eb 100644
--- a/module/os/freebsd/spl/spl_vm.c
+++ b/module/os/freebsd/spl/spl_vm.c
@@ -43,7 +43,6 @@
 const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
 const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
 const int zfs_vm_pagerret_ok = VM_PAGER_OK;
-const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
 const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
 const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
 
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 25409ceaf56..5cc4ca02c18 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,7 +25,6 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
- * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -4300,33 +4299,6 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
 	    ap->a_rahead));
 }
 
-typedef struct {
-	uint_t		pca_npages;
-	vm_page_t	pca_pages[];
-} putpage_commit_arg_t;
-
-static void
-zfs_putpage_commit_cb(void *arg)
-{
-	putpage_commit_arg_t *pca = arg;
-	vm_object_t object = pca->pca_pages[0]->object;
-
-	zfs_vmobject_wlock(object);
-
-	for (uint_t i = 0; i < pca->pca_npages; i++) {
-		vm_page_t pp = pca->pca_pages[i];
-		vm_page_undirty(pp);
-		vm_page_sunbusy(pp);
-	}
-
-	vm_object_pip_wakeupn(object, pca->pca_npages);
-
-	zfs_vmobject_wunlock(object);
-
-	kmem_free(pca,
-	    offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
-}
-
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
@@ -4428,12 +4400,10 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
-		vm_ooffset_t woff = off;
-		size_t wlen = len;
-		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
-			tocopy = MIN(PAGE_SIZE, wlen);
+		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
 			va = zfs_map_page(ma[i], &sf);
-			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
+			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
@@ -4454,19 +4424,19 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
+		/*
+		 * XXX we should be passing a callback to undirty
+		 * but that would make the locking messier
+		 */
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
+		    len, commit, B_FALSE, NULL, NULL);
 
-		putpage_commit_arg_t *pca = kmem_alloc(
-		    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
-		    KM_SLEEP);
-		pca->pca_npages = ncount;
-		memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
-
-		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp,
-		    off, len, commit, B_FALSE, zfs_putpage_commit_cb, pca);
-
-		for (i = 0; i < ncount; i++)
-			rtvals[i] = zfs_vm_pagerret_pend;
-
+		zfs_vmobject_wlock(object);
+		for (i = 0; i < ncount; i++) {
+			rtvals[i] = zfs_vm_pagerret_ok;
+			vm_page_undirty(ma[i]);
+		}
+		zfs_vmobject_wunlock(object);
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}

From d323fbf49c1239ecfe25f1dc66a682314f33221e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 1 Jul 2025 09:24:23 +1000
Subject: [PATCH 28/72] FreeBSD: zfs_putpages: don't undirty pages until after
 write completes

In syncing mode, zfs_putpages() would put the entire range of pages onto
the ZIL, then return VM_PAGER_OK for each page to the kernel. However,
an associated zil_commit() or txg sync had not happened at this point,
so the write may not actually be on disk.

So, we rework that case to use a ZIL commit callback, and do the
post-write work of undirtying the page and signaling completion there.
We return VM_PAGER_PEND to the kernel instead so it knows that we will
take care of it.

The original version of this (238eab7dc1) copied the Linux model and did
the cleanup in a ZIL callback for both async and sync. This was a
mistake, as FreeBSD does not have a separate "busy for writeback" flag
like Linux which keeps the page usable. The full sbusy flag locks the
entire page out until the itx callback fires, which for async is after
txg sync, which could be literal seconds in the future.

For the async case, the data is already on the DMU and the in-memory
ZIL, which is sufficient for async writeback, so the old method of
logging it without a callback, undirtying the page and returning is more
than sufficient and reclaims that lost performance.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Johnston <markj@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17533
---
 include/os/freebsd/spl/sys/vm.h      |  1 +
 module/os/freebsd/spl/spl_vm.c       |  1 +
 module/os/freebsd/zfs/zfs_vnops_os.c | 87 +++++++++++++++++++++++-----
 3 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h
index 454078f0fe7..d36bee881d0 100644
--- a/include/os/freebsd/spl/sys/vm.h
+++ b/include/os/freebsd/spl/sys/vm.h
@@ -35,6 +35,7 @@
 extern const int zfs_vm_pagerret_bad;
 extern const int zfs_vm_pagerret_error;
 extern const int zfs_vm_pagerret_ok;
+extern const int zfs_vm_pagerret_pend;
 extern const int zfs_vm_pagerput_sync;
 extern const int zfs_vm_pagerput_inval;
 
diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c
index 733c2bd07eb..9d5f025423a 100644
--- a/module/os/freebsd/spl/spl_vm.c
+++ b/module/os/freebsd/spl/spl_vm.c
@@ -43,6 +43,7 @@
 const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
 const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
 const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
 const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
 const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
 
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 5cc4ca02c18..da6a1cc85b6 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -4299,6 +4300,33 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
 	    ap->a_rahead));
 }
 
+typedef struct {
+	uint_t		pca_npages;
+	vm_page_t	pca_pages[];
+} putpage_commit_arg_t;
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+	putpage_commit_arg_t *pca = arg;
+	vm_object_t object = pca->pca_pages[0]->object;
+
+	zfs_vmobject_wlock(object);
+
+	for (uint_t i = 0; i < pca->pca_npages; i++) {
+		vm_page_t pp = pca->pca_pages[i];
+		vm_page_undirty(pp);
+		vm_page_sunbusy(pp);
+	}
+
+	vm_object_pip_wakeupn(object, pca->pca_npages);
+
+	zfs_vmobject_wunlock(object);
+
+	kmem_free(pca,
+	    offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
+}
+
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
@@ -4400,10 +4428,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
-		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
-			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+		vm_ooffset_t woff = off;
+		size_t wlen = len;
+		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
+			tocopy = MIN(PAGE_SIZE, wlen);
 			va = zfs_map_page(ma[i], &sf);
-			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
@@ -4424,19 +4454,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
-		/*
-		 * XXX we should be passing a callback to undirty
-		 * but that would make the locking messier
-		 */
-		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
-		    len, commit, B_FALSE, NULL, NULL);
 
-		zfs_vmobject_wlock(object);
-		for (i = 0; i < ncount; i++) {
-			rtvals[i] = zfs_vm_pagerret_ok;
-			vm_page_undirty(ma[i]);
+		if (commit) {
+			/*
+			 * Caller requested that we commit immediately. We set
+			 * a callback on the log entry, to be called once it's
+			 * on disk after the call to zil_commit() below. The
+			 * pages will be undirtied and unbusied there.
+			 */
+			putpage_commit_arg_t *pca = kmem_alloc(
+			    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+			    KM_SLEEP);
+			pca->pca_npages = ncount;
+			memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
+
+			for (i = 0; i < ncount; i++)
+				rtvals[i] = zfs_vm_pagerret_pend;
+		} else {
+			/*
+			 * Caller just wants the page written back somewhere,
+			 * but doesn't need it committed yet. We've already
+			 * written it back to the DMU, so we just need to put
+			 * it on the async log, then undirty the page and
+			 * return.
+			 *
+			 * We cannot use a callback here, because it would keep
+			 * the page busy (locked) until it is eventually
+			 * written down at txg sync.
+			 */
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_FALSE, B_FALSE, NULL, NULL);
+
+			zfs_vmobject_wlock(object);
+			for (i = 0; i < ncount; i++) {
+				rtvals[i] = zfs_vm_pagerret_ok;
+				vm_page_undirty(ma[i]);
+			}
+			zfs_vmobject_wunlock(object);
 		}
-		zfs_vmobject_wunlock(object);
+
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}

From b21e04e8d9bdd134f1e7dac50285d8d35b6ae6ba Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Tue, 15 Jul 2025 17:01:49 -0700
Subject: [PATCH 29/72] Fix zdb pool/ with -k

When examining the root dataset with zdb -k, we get into a mismatched
state. main() knows we are not examining the whole pool, but it strips
off the trailing slash. import_checkpointed_state() then thinks we are
examining the whole pool, and does not update the target path
appropriately. The fix is to directly inform import_checkpointed_state
that we are examining a filesystem, and not the whole pool.

Sponsored-by: Klara, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17536
---
 cmd/zdb/zdb.c                                 | 20 ++++++++++---------
 .../pool_checkpoint/checkpoint_zdb.ksh        |  2 ++
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 037a7681a31..75b54ab4ea5 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -7706,7 +7706,8 @@ zdb_set_skip_mmp(char *target)
  * applies to the new_path parameter if allocated.
  */
 static char *
-import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa,
+    char **new_path)
 {
 	int error = 0;
 	char *poolname, *bogus_name = NULL;
@@ -7714,11 +7715,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 
 	/* If the target is not a pool, the extract the pool name */
 	char *path_start = strchr(target, '/');
-	if (path_start != NULL) {
+	if (target_is_spa || path_start == NULL) {
+		poolname = target;
+	} else {
 		size_t poolname_len = path_start - target;
 		poolname = strndup(target, poolname_len);
-	} else {
-		poolname = target;
 	}
 
 	if (cfg == NULL) {
@@ -7749,10 +7750,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 		    "with error %d\n", bogus_name, error);
 	}
 
-	if (new_path != NULL && path_start != NULL) {
-		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+	if (new_path != NULL && !target_is_spa) {
+		if (asprintf(new_path, "%s%s", bogus_name,
+		    path_start != NULL ? path_start : "") == -1) {
 			free(bogus_name);
-			if (path_start != NULL)
+			if (!target_is_spa && path_start != NULL)
 				free(poolname);
 			return (NULL);
 		}
@@ -7981,7 +7983,7 @@ verify_checkpoint_blocks(spa_t *spa)
 	 * name) so we can do verification on it against the current state
 	 * of the pool.
 	 */
-	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
+	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE,
 	    NULL);
 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
 
@@ -9705,7 +9707,7 @@ main(int argc, char **argv)
 	char *checkpoint_target = NULL;
 	if (dump_opt['k']) {
 		checkpoint_pool = import_checkpointed_state(target, cfg,
-		    &checkpoint_target);
+		    target_is_spa, &checkpoint_target);
 
 		if (checkpoint_target != NULL)
 			target = checkpoint_target;
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh
index cd4573b2e4d..b364a5cb4bd 100755
--- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh
+++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh
@@ -63,6 +63,7 @@ log_must eval "zdb $TESTPOOL | grep -q \"Checkpointed uberblock found\""
 log_mustnot eval "zdb -k $TESTPOOL | grep -q \"Checkpointed uberblock found\""
 log_mustnot eval "zdb $TESTPOOL | grep \"Dataset $FS1\""
 log_must eval "zdb -k $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\""
+log_must eval "zdb -k $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\""
 
 log_must zpool export $TESTPOOL
 
@@ -70,6 +71,7 @@ log_must eval "zdb -e $TESTPOOL | grep \"Checkpointed uberblock found\""
 log_mustnot eval "zdb -k -e $TESTPOOL | grep \"Checkpointed uberblock found\""
 log_mustnot eval "zdb -e $TESTPOOL | grep \"Dataset $FS1\""
 log_must eval "zdb -k -e $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\""
+log_must eval "zdb -k -e $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\""
 
 log_must zpool import $TESTPOOL
 

From c1e51c55f5c6b17d6fd9a249b1a3ccb956a2e623 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Wed, 16 Jul 2025 10:20:57 -0700
Subject: [PATCH 30/72] Correct weight recalculation of space-based metaslabs

Currently, after a failed allocation, the metaslab code recalculates the
weight for a metaslab. However, for space-based metaslabs, it uses the
maximum free segment size instead of the normal weighting
algorithm. This is presumably because the normal metaslab weight is
(roughly) intended to estimate the size of the largest free segment, but
it doesn't do that reliably at most fragmentation levels. This means
that recalculated metaslabs are forced to a weight that isn't really
using the same units as the rest of them, resulting in undesirable
behaviors. We switch this to use the normal space-weighting function.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Sponsored-by: Wasabi Technology, Inc.
Sponsored-by: Klara, Inc.
Closes #17531
---
 module/zfs/metaslab.c | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 082d379cded..5e002af4fd9 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5199,29 +5199,16 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
-		 * a new weight for this metaslab. Now that we have loaded
-		 * the metaslab we can provide a better hint to the metaslab
-		 * selector.
-		 *
-		 * For space-based metaslabs, we use the maximum block size.
-		 * This information is only available when the metaslab
-		 * is loaded and is more accurate than the generic free
-		 * space weight that was calculated by metaslab_weight().
-		 * This information allows us to quickly compare the maximum
-		 * available allocation in the metaslab to the allocation
-		 * size being requested.
-		 *
-		 * For segment-based metaslabs, determine the new weight
-		 * based on the highest bucket in the range tree. We
-		 * explicitly use the loaded segment weight (i.e. the range
-		 * tree histogram) since it contains the space that is
-		 * currently available for allocation and is accurate
-		 * even within a sync pass.
+		 * a new weight for this metaslab. The weight was last
+		 * recalculated either when we loaded it (if this is the first
+		 * TXG it's been loaded in), or the last time a txg was synced
+		 * out.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
-			weight = metaslab_largest_allocatable(msp);
-			WEIGHT_SET_SPACEBASED(weight);
+			metaslab_set_fragmentation(msp, B_TRUE);
+			weight = metaslab_space_weight(msp) &
+			    ~METASLAB_ACTIVE_MASK;
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
@@ -5233,13 +5220,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
-			 *
-			 * Note that we could attempt to use something like
-			 * metaslab_recalculate_weight_and_sort() that
-			 * retains the activation mask here. That function
-			 * uses metaslab_weight() to set the weight though
-			 * which is not as accurate as the calculations
-			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);

From d7ab07dfb45e32f7bb1cc763b1a00dc652472eb6 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 17 Jul 2025 18:31:19 -0400
Subject: [PATCH 31/72] ZIL: Force writing of open LWB on suspend

Under parallel workloads ZIL may delay writes of open LWBs that
are not full enough.  On suspend we do not expect anything new to
appear since zil_get_commit_list() will not let it pass, only
returning TXG number to wait for.  But I suspect that waiting for
the TXG commit without having the last LWB issued may not wait for
its completion, resulting in panic described in #17509.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #17521
---
 module/zfs/zil.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 00059b2c6de..139f147d193 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -2902,19 +2902,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
-	/*
-	 * Return if there's nothing to commit before we dirty the fs by
-	 * calling zil_create().
-	 */
-	if (list_is_empty(&zilog->zl_itx_commit_list))
-		return;
-
-	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
-	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
-	    offsetof(zil_commit_waiter_t, zcw_node));
-
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
+		/*
+		 * Return if there's nothing to commit before we dirty the fs.
+		 */
+		if (list_is_empty(&zilog->zl_itx_commit_list))
+			return;
+
 		lwb = zil_create(zilog);
 	} else {
 		/*
@@ -2942,6 +2937,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		}
 	}
 
+	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
@@ -3111,7 +3110,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
-		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+		if (lwb->lwb_state == LWB_STATE_OPENED &&
+		    (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
 			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);

From 2669b00f133e510f777c1c03c4f4714a09824a0b Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Fri, 18 Jul 2025 08:45:13 -0700
Subject: [PATCH 32/72] Define sops->free_inode() to prevent use-after-free
 during lookup

On Linux, when doing path lookup with LOOKUP_RCU, dentry and inode can
be dereferenced without refcounts and locks. For this reason, dentry and
inode must only be freed after RCU grace period.

However, zfs currently frees inode in zfs_inode_destroy synchronously
and we can't use GPL-only call_rcu() in zfs directly. Fortunately, on
Linux 5.2 and after, if we define sops->free_inode(), the kernel will do
call_rcu() for us.

This issue may be triggered more easily with init_on_free=1 boot
parameter:

BUG: kernel NULL pointer dereference, address: 0000000000000020
RIP: 0010:selinux_inode_permission+0x10e/0x1c0
Call Trace:
 ? show_trace_log_lvl+0x1be/0x2d9
 ? show_trace_log_lvl+0x1be/0x2d9
 ? show_trace_log_lvl+0x1be/0x2d9
 ? security_inode_permission+0x37/0x60
 ? __die_body.cold+0x8/0xd
 ? no_context+0x113/0x220
 ? exc_page_fault+0x6d/0x130
 ? asm_exc_page_fault+0x1e/0x30
 ? selinux_inode_permission+0x10e/0x1c0
 security_inode_permission+0x37/0x60
 link_path_walk.part.0.constprop.0+0xb5/0x360
 ? path_init+0x27d/0x3c0
 path_lookupat+0x3e/0x1a0
 filename_lookup+0xc0/0x1d0
 ? __check_object_size.part.0+0x123/0x150
 ? strncpy_from_user+0x4e/0x130
 ? getname_flags.part.0+0x4b/0x1c0
 vfs_statx+0x72/0x120
 ? ioctl_has_perm.constprop.0.isra.0+0xbd/0x120
 __do_sys_newlstat+0x39/0x70
 ? __x64_sys_ioctl+0x8d/0xd0
 do_syscall_64+0x30/0x40
 entry_SYSCALL_64_after_hwframe+0x62/0xc7

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Co-authored-by: Chunwei Chen <david.chen@nutanix.com>
Closes #17546
---
 config/kernel-free-inode.m4               | 24 +++++++++++++++++++++++
 config/kernel.m4                          |  2 ++
 include/os/linux/zfs/sys/zfs_znode_impl.h |  1 +
 module/os/linux/zfs/zfs_znode_os.c        | 17 ++++++++++++++--
 module/os/linux/zfs/zpl_super.c           | 12 ++++++++++++
 5 files changed, 54 insertions(+), 2 deletions(-)
 create mode 100644 config/kernel-free-inode.m4

diff --git a/config/kernel-free-inode.m4 b/config/kernel-free-inode.m4
new file mode 100644
index 00000000000..baa1c34845b
--- /dev/null
+++ b/config/kernel-free-inode.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # Linux 5.2 API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [
+	ZFS_LINUX_TEST_SRC([super_operations_free_inode], [
+		#include <linux/fs.h>
+
+		static void free_inode(struct inode *) { }
+
+		static struct super_operations sops __attribute__ ((unused)) = {
+			.free_inode = free_inode,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [
+	AC_MSG_CHECKING([whether sops->free_inode() exists])
+	ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 7e6af62dede..e3e7625db7d 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -134,6 +134,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
 	ZFS_AC_KERNEL_SRC_TIMER
 	ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR
+	ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -252,6 +253,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_PIN_USER_PAGES
 	ZFS_AC_KERNEL_TIMER
 	ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR
+	ZFS_AC_KERNEL_SOPS_FREE_INODE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h
index b38847b2046..6a77e40abe1 100644
--- a/include/os/linux/zfs/sys/zfs_znode_impl.h
+++ b/include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -157,6 +157,7 @@ struct znode;
 
 extern int	zfs_sync(struct super_block *, int, cred_t *);
 extern int	zfs_inode_alloc(struct super_block *, struct inode **ip);
+extern void	zfs_inode_free(struct inode *);
 extern void	zfs_inode_destroy(struct inode *);
 extern void	zfs_mark_inode_dirty(struct inode *);
 extern boolean_t zfs_relatime_need_update(const struct inode *);
diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c
index 54e60b4820f..5692868c9dc 100644
--- a/module/os/linux/zfs/zfs_znode_os.c
+++ b/module/os/linux/zfs/zfs_znode_os.c
@@ -371,6 +371,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 	return (0);
 }
 
+void
+zfs_inode_free(struct inode *ip)
+{
+	kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
 /*
  * Called in multiple places when an inode should be destroyed.
  */
@@ -395,8 +401,15 @@ zfs_inode_destroy(struct inode *ip)
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
-
-	kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+	/*
+	 * inode needs to be freed in RCU callback.  If we have
+	 * super_operations->free_inode, Linux kernel will do call_rcu
+	 * for us.  But if we don't have it, since call_rcu is GPL-only
+	 * symbol, we can only free synchronously and accept the risk.
+	 */
+	zfs_inode_free(ip);
+#endif
 }
 
 static void
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index a682bfd33c3..94dcdd0b887 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb)
 	return (ip);
 }
 
+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+	ASSERT(atomic_read(&ip->i_count) == 0);
+	zfs_inode_free(ip);
+}
+#endif
+
 static void
 zpl_inode_destroy(struct inode *ip)
 {
@@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
 
 const struct super_operations zpl_super_operations = {
 	.alloc_inode		= zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+	.free_inode		= zpl_inode_free,
+#endif
 	.destroy_inode		= zpl_inode_destroy,
 	.dirty_inode		= zpl_dirty_inode,
 	.write_inode		= NULL,

From be1e991a1a83f50c84608709c63d20740fae762f Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 18 Jul 2025 21:44:14 -0400
Subject: [PATCH 33/72] Allow and prefer special vdevs as ZIL

Before this change ZIL blocks were allocated only from normal or
SLOG vdevs.  In the typical situation where special vdevs are SSDs
and normal vdevs are HDDs, this could cause weird inversions in which
data blocks are written to SSDs while the ZIL blocks referencing them
are written to HDDs.

This change assumes that special vdevs typically have much better
(or at least not worse) latency than normal vdevs, and so in the
absence of SLOGs should store ZIL blocks.  This means introducing a
special embedded log allocation class, similar to the one for normal
vdevs, and updating the allocation fallback order to: SLOG -> special
embedded log -> special -> normal embedded log -> normal.

The code tries to guess whether a data block is going to be written
to a normal or a special vdev (this cannot be done precisely before
compression) and prefers indirect writes for blocks written to a
special vdev to avoid double writes.  For blocks that are going to
be written to a normal vdev, the special vdev by default acts as a
SLOG, reducing write latency at the cost of higher special vdev wear,
but this is tunable via a module parameter.

This should allow HDD pools with decent SSD as special vdev to
work under synchronous workloads without requiring additional
SLOG SSD, impractical in many scenarios.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #17505
---
 cmd/zdb/zdb.c          | 20 +++++++++++++-
 include/sys/spa.h      |  1 +
 include/sys/spa_impl.h |  1 +
 include/sys/zil.h      |  4 +++
 man/man4/zfs.4         | 16 +++++++++---
 module/zfs/metaslab.c  |  9 ++++---
 module/zfs/spa.c       | 15 ++++++++++-
 module/zfs/spa_misc.c  |  9 +++++++
 module/zfs/vdev.c      | 21 ++++++++++-----
 module/zfs/zfs_log.c   | 16 ++----------
 module/zfs/zil.c       | 59 ++++++++++++++++++++++++++++++++++++++++++
 module/zfs/zio.c       | 20 ++++++++++++++
 module/zfs/zvol.c      | 17 ++----------
 13 files changed, 164 insertions(+), 44 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 75b54ab4ea5..2dcf6404c09 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -6750,6 +6750,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
+	spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops;
 
 	zcb->zcb_vd_obsolete_counts =
 	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
@@ -6887,7 +6888,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
-			    spa_embedded_log_class(spa)) ?
+			    spa_embedded_log_class(spa) ||
+			    msp->ms_group->mg_class ==
+			    spa_special_embedded_log_class(spa)) ?
 			    vd->vdev_log_mg : vd->vdev_mg);
 
 			/*
@@ -7121,6 +7124,8 @@ dump_block_stats(spa_t *spa)
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
 	zcb->zcb_totalasize +=
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
+	zcb->zcb_totalasize +=
+	    metaslab_class_get_alloc(spa_special_embedded_log_class(spa));
 	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
 
@@ -7169,6 +7174,7 @@ dump_block_stats(spa_t *spa)
 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
+	    metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
@@ -7252,6 +7258,18 @@ dump_block_stats(spa_t *spa)
 		    100.0 * alloc / space);
 	}
 
+	if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor
+	    != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_special_embedded_log_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_special_embedded_log_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Special embedded log", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb->zcb_embedded_blocks[i] == 0)
 			continue;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index a3e36c1f59a..e5ec39b64dc 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1065,6 +1065,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
+extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
 extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio);
 extern boolean_t spa_special_has_ddt(spa_t *spa);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 8c52f751a81..a596235ce01 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -246,6 +246,7 @@ struct spa {
 	metaslab_class_t *spa_log_class;	/* intent log data class */
 	metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
 	metaslab_class_t *spa_special_class;	/* special allocation class */
+	metaslab_class_t *spa_special_embedded_log_class; /* log on special */
 	metaslab_class_t *spa_dedup_class;	/* dedup allocation class */
 	uint64_t	spa_first_txg;		/* first txg after spa_open() */
 	uint64_t	spa_final_txg;		/* txg of export/destroy */
diff --git a/include/sys/zil.h b/include/sys/zil.h
index fa7945d8ab8..9d1fb47e2df 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -635,6 +635,8 @@ extern void	zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
 extern uint64_t	zil_max_copied_data(zilog_t *zilog);
 extern uint64_t	zil_max_log_data(zilog_t *zilog, size_t hdrsize);
+extern itx_wr_state_t zil_write_state(zilog_t *zilog, uint64_t size,
+    uint32_t blocksize, boolean_t o_direct, boolean_t commit);
 
 extern void zil_sums_init(zil_sums_t *zs);
 extern void zil_sums_fini(zil_sums_t *zs);
@@ -642,6 +644,8 @@ extern void zil_kstat_values_update(zil_kstat_values_t *zs,
     zil_sums_t *zil_sums);
 
 extern int zil_replay_disable;
+extern uint_t zfs_immediate_write_sz;
+extern int zil_special_is_slog;
 
 #ifdef	__cplusplus
 }
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 67b2cef46e8..fba91d1e28b 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1713,10 +1713,18 @@ Similar to
 but for cleanup of old indirection records for removed vdevs.
 .
 .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64
-Largest data block to write to the ZIL.
-Larger blocks will be treated as if the dataset being written to had the
-.Sy logbias Ns = Ns Sy throughput
-property set.
+Largest write size to store the data directly into the ZIL if
+.Sy logbias Ns = Ns Sy latency .
+Larger writes may be written indirectly similar to
+.Sy logbias Ns = Ns Sy throughput .
+In presence of SLOG this parameter is ignored, as if it was set to infinity,
+storing all written data into ZIL to not depend on regular vdev latency.
+.
+.It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int
+When enabled, and written blocks go to normal vdevs, treat present special
+vdevs as SLOGs, storing all synchronously written data into ZIL directly.
+Disabling this forces the indirect writes to preserve special vdev write
+throughput and endurance, likely at the cost of normal vdev latency.
 .
 .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64
 Pattern written to vdev free space by
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 5e002af4fd9..2f91f2bb364 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -750,7 +750,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 		}
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
@@ -1288,7 +1289,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 		mc->mc_histogram[i + ashift] +=
@@ -1316,7 +1318,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		mg->mg_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 6b52c6cb1f9..46794cc62e3 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -417,11 +417,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv)
 		alloc += metaslab_class_get_alloc(spa_special_class(spa));
 		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
+		alloc += metaslab_class_get_alloc(
+		    spa_special_embedded_log_class(spa));
 
 		size = metaslab_class_get_space(mc);
 		size += metaslab_class_get_space(spa_special_class(spa));
 		size += metaslab_class_get_space(spa_dedup_class(spa));
 		size += metaslab_class_get_space(spa_embedded_log_class(spa));
+		size += metaslab_class_get_space(
+		    spa_special_embedded_log_class(spa));
 
 		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1679,6 +1683,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
 	    "embedded_log", msp, B_TRUE);
 	spa->spa_special_class = metaslab_class_create(spa, "special",
 	    msp, B_FALSE);
+	spa->spa_special_embedded_log_class = metaslab_class_create(spa,
+	    "special_embedded_log", msp, B_TRUE);
 	spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
 	    msp, B_FALSE);
 
@@ -1853,6 +1859,9 @@ spa_deactivate(spa_t *spa)
 	metaslab_class_destroy(spa->spa_special_class);
 	spa->spa_special_class = NULL;
 
+	metaslab_class_destroy(spa->spa_special_embedded_log_class);
+	spa->spa_special_embedded_log_class = NULL;
+
 	metaslab_class_destroy(spa->spa_dedup_class);
 	spa->spa_dedup_class = NULL;
 
@@ -9092,6 +9101,8 @@ spa_async_thread(void *arg)
 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
 		old_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
+		old_space += metaslab_class_get_space(
+		    spa_special_embedded_log_class(spa));
 
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
@@ -9100,6 +9111,8 @@ spa_async_thread(void *arg)
 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
 		new_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
+		new_space += metaslab_class_get_space(
+		    spa_special_embedded_log_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
@@ -10309,7 +10322,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 
 	metaslab_class_evict_old(spa->spa_normal_class, txg);
 	metaslab_class_evict_old(spa->spa_log_class, txg);
-	/* spa_embedded_log_class has only one metaslab per vdev. */
+	/* Embedded log classes have only one metaslab per vdev. */
 	metaslab_class_evict_old(spa->spa_special_class, txg);
 	metaslab_class_evict_old(spa->spa_dedup_class, txg);
 
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f054e4290bb..d2ba1f954e9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1308,6 +1308,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
 	metaslab_class_validate(spa_log_class(spa));
 	metaslab_class_validate(spa_embedded_log_class(spa));
 	metaslab_class_validate(spa_special_class(spa));
+	metaslab_class_validate(spa_special_embedded_log_class(spa));
 	metaslab_class_validate(spa_dedup_class(spa));
 
 	spa_config_exit(spa, SCL_ALL, spa);
@@ -1896,6 +1897,8 @@ spa_get_slop_space(spa_t *spa)
 	 */
 	uint64_t embedded_log =
 	    metaslab_class_get_dspace(spa_embedded_log_class(spa));
+	embedded_log += metaslab_class_get_dspace(
+	    spa_special_embedded_log_class(spa));
 	slop -= MIN(embedded_log, slop >> 1);
 
 	/*
@@ -2000,6 +2003,12 @@ spa_special_class(spa_t *spa)
 	return (spa->spa_special_class);
 }
 
+metaslab_class_t *
+spa_special_embedded_log_class(spa_t *spa)
+{
+	return (spa->spa_special_embedded_log_class);
+}
+
 metaslab_class_t *
 spa_dedup_class(spa_t *spa)
 {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 01758b0c54c..aa4038a7526 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -282,12 +282,15 @@ vdev_getops(const char *type)
  * Given a vdev and a metaslab class, find which metaslab group we're
  * interested in. All vdevs may belong to two different metaslab classes.
  * Dedicated slog devices use only the primary metaslab group, rather than a
- * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ * separate log group.  For embedded slogs, vdev_log_mg will be non-NULL and
+ * will point to a metaslab group of either embedded_log_class (for normal
+ * vdevs) or special_embedded_log_class (for special vdevs).
  */
 metaslab_group_t *
 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 {
-	if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+	if ((mc == spa_embedded_log_class(vd->vdev_spa) ||
+	    mc == spa_special_embedded_log_class(vd->vdev_spa)) &&
 	    vd->vdev_log_mg != NULL)
 		return (vd->vdev_log_mg);
 	else
@@ -1508,8 +1511,13 @@ vdev_metaslab_group_create(vdev_t *vd)
 		vd->vdev_mg = metaslab_group_create(mc, vd);
 
 		if (!vd->vdev_islog) {
-			vd->vdev_log_mg = metaslab_group_create(
-			    spa_embedded_log_class(spa), vd);
+			if (mc == spa_special_class(spa)) {
+				vd->vdev_log_mg = metaslab_group_create(
+				    spa_special_embedded_log_class(spa), vd);
+			} else {
+				vd->vdev_log_mg = metaslab_group_create(
+				    spa_embedded_log_class(spa), vd);
+			}
 		}
 
 		/*
@@ -1624,9 +1632,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 	/*
 	 * Find the emptiest metaslab on the vdev and mark it for use for
 	 * embedded slog by moving it from the regular to the log metaslab
-	 * group.
+	 * group.  This works for normal and special vdevs.
 	 */
-	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+	if ((vd->vdev_mg->mg_class == spa_normal_class(spa) ||
+	    vd->vdev_mg->mg_class == spa_special_class(spa)) &&
 	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
 	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
 		uint64_t slog_msid = 0;
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 2ce25b72b28..2f61ecfd9b3 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
  * called as soon as the write is on stable storage (be it via a DMU sync or a
  * ZIL commit).
  */
-static uint_t zfs_immediate_write_sz = 32768;
-
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
@@ -626,15 +624,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		return;
 	}
 
-	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
-		write_state = WR_INDIRECT;
-	else if (!spa_has_slogs(zilog->zl_spa) &&
-	    resid >= zfs_immediate_write_sz)
-		write_state = WR_INDIRECT;
-	else if (commit)
-		write_state = WR_COPIED;
-	else
-		write_state = WR_NEED_COPY;
+	write_state = zil_write_state(zilog, resid, blocksize, o_direct,
+	    commit);
 
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
 	    sizeof (gen));
@@ -938,6 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 		len -= partlen;
 	}
 }
-
-ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
-	"Largest data block to write to zil");
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 139f147d193..3aa188a9581 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -2095,6 +2095,19 @@ zil_max_waste_space(zilog_t *zilog)
  */
 static uint_t zil_maxcopied = 7680;
 
+/*
+ * Largest write size to store the data directly into ZIL.
+ */
+uint_t zfs_immediate_write_sz = 32768;
+
+/*
+ * When enabled and blocks go to normal vdev, treat special vdevs as SLOG,
+ * writing data to ZIL (WR_COPIED/WR_NEED_COPY).  Disabling this forces the
+ * indirect writes (WR_INDIRECT) to preserve special vdev throughput and
+ * endurance, likely at the cost of normal vdev latency.
+ */
+int zil_special_is_slog = 1;
+
 uint64_t
 zil_max_copied_data(zilog_t *zilog)
 {
@@ -2102,6 +2115,46 @@ zil_max_copied_data(zilog_t *zilog)
 	return (MIN(max_data, zil_maxcopied));
 }
 
+/*
+ * Determine the appropriate write state for ZIL transactions based on
+ * pool configuration, data placement, write size, and logbias settings.
+ */
+itx_wr_state_t
+zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize,
+    boolean_t o_direct, boolean_t commit)
+{
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
+		return (WR_INDIRECT);
+
+	/*
+	 * Don't use indirect for too small writes to reduce overhead.
+	 * Don't use indirect if written less than a half of a block if
+	 * we are going to commit it immediately, since next write might
+	 * rewrite the same block again, causing inflation.  If commit
+	 * is not planned, then next writes might coalesce, and so the
+	 * indirect may be perfect.
+	 */
+	boolean_t indirect = (size >= zfs_immediate_write_sz &&
+	    (size >= blocksize / 2 || !commit));
+
+	if (spa_has_slogs(zilog->zl_spa)) {
+		/* Dedicated slogs: never use indirect */
+		indirect = B_FALSE;
+	} else if (spa_has_special(zilog->zl_spa)) {
+		/* Special vdevs: only when beneficial */
+		boolean_t on_special = (blocksize <=
+		    zilog->zl_os->os_zpl_special_smallblock);
+		indirect &= (on_special || !zil_special_is_slog);
+	}
+
+	if (indirect)
+		return (WR_INDIRECT);
+	else if (commit)
+		return (WR_COPIED);
+	else
+		return (WR_NEED_COPY);
+}
+
 static uint64_t
 zil_itx_record_size(itx_t *itx)
 {
@@ -4418,3 +4471,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
 	"Limit in bytes WR_COPIED size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
+	"Largest write size to store data into ZIL");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW,
+	"Treat special vdevs as SLOG");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 67ee3d5ba2e..c3d96c049d3 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4433,14 +4433,34 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object)
 	    % spa->spa_alloc_count;
 	ZIOSTAT_BUMP(ziostat_total_allocations);
+
+	/* Try log class (dedicated slog devices) first */
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, allocator, NULL);
 	*slog = (error == 0);
+
+	/* Try special_embedded_log class (reserved on special vdevs) */
+	if (error != 0) {
+		error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
+		    size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
+		    allocator, NULL);
+	}
+
+	/* Try special class (general special vdev allocation) */
+	if (error != 0) {
+		error = metaslab_alloc(spa, spa_special_class(spa), size,
+		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+		    NULL);
+	}
+
+	/* Try embedded_log class (reserved on normal vdevs) */
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
 		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
 		    NULL);
 	}
+
+	/* Finally fall back to normal class */
 	if (error != 0) {
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 3568d4f43fc..4116e16133b 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -859,13 +859,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
 };
 
 /*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
+ * zvol_log_write() handles TX_WRITE transactions.
  */
-static const ssize_t zvol_immediate_write_sz = 32768;
-
 void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
     uint64_t size, boolean_t commit)
@@ -878,15 +873,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 	if (zil_replaying(zilog, tx))
 		return;
 
-	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-		write_state = WR_INDIRECT;
-	else if (!spa_has_slogs(zilog->zl_spa) &&
-	    size >= blocksize && blocksize > zvol_immediate_write_sz)
-		write_state = WR_INDIRECT;
-	else if (commit)
-		write_state = WR_COPIED;
-	else
-		write_state = WR_NEED_COPY;
+	write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit);
 
 	while (size) {
 		itx_t *itx;

From a7a144e655850b4160943e4ba315eb9a5dc2b2fe Mon Sep 17 00:00:00 2001
From: shodanshok <g.danti@assyoma.it>
Date: Mon, 21 Jul 2025 19:32:01 +0200
Subject: [PATCH 34/72] enforce arc_dnode_limit

Linux kernel shrinker in the context of null/root memcg does not scan
dentry and inode caches added by a task running in non-root memcg. For
ZFS this means that dnode cache routinely overflows, evicting valuable
meta/data and putting additional memory pressure on the system.

This patch restores zfs_prune_aliases as fallback when the kernel
shrinker does nothing, enabling zfs to actually free dnodes. Moreover,
it (indirectly) calls arc_evict when dnode_size > dnode_limit.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Gionatan Danti <g.danti@assyoma.it>
Closes #17487
Closes #17542
---
 include/sys/arc_impl.h           |  2 +-
 module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++
 module/zfs/arc.c                 | 22 ++++++-----
 3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 1b30389107c..b55d5da3378 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -954,7 +954,7 @@ typedef struct arc_sums {
 	wmsum_t arcstat_data_size;
 	wmsum_t arcstat_metadata_size;
 	wmsum_t arcstat_dbuf_size;
-	wmsum_t arcstat_dnode_size;
+	aggsum_t arcstat_dnode_size;
 	wmsum_t arcstat_bonus_size;
 	wmsum_t arcstat_l2_hits;
 	wmsum_t arcstat_l2_misses;
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index a3837f78466..396faef8f64 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1216,6 +1216,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
 	return (error);
 }
 
+/*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+	znode_t **zp_array, *zp;
+	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+	int objects = 0;
+	int i = 0, j = 0;
+
+	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+		if ((i++ > nr_to_scan) || (j >= max_array))
+			break;
+
+		ASSERT(list_link_active(&zp->z_link_node));
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+		/* Skip active znodes and .zfs entries */
+		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+			continue;
+
+		if (igrab(ZTOI(zp)) == NULL)
+			continue;
+
+		zp_array[j] = zp;
+		j++;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	for (i = 0; i < j; i++) {
+		zp = zp_array[i];
+
+		ASSERT3P(zp, !=, NULL);
+		d_prune_aliases(ZTOI(zp));
+
+		if (atomic_read(&ZTOI(zp)->i_count) == 1)
+			objects++;
+
+		zrele(zp);
+	}
+
+	vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+	return (objects);
+}
+
 /*
  * The ARC has requested that the filesystem drop entries from the dentry
  * and inode caches.  This can occur when the ARC needs to free meta data
@@ -1267,6 +1324,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
 #endif
 
+	/*
+	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
+	 * due to dentry and inode caches being referenced by a task running
+	 * in non-root memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
 	zfs_exit(zfsvfs, FTAG);
 
 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 04ca32356a6..a2cb3b8a53e 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, -space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4490,7 +4490,7 @@ arc_evict(void)
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
 	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
 	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
 	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
+	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
 	    zfs_max_recordsize;
+	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
+	    arc_dnode_limit;
 
 	/* Always allow at least one block of overflow. */
-	if (over < 0)
+	if (arc_over < 0 && dn_over <= 0)
 		return (ARC_OVF_NONE);
 
 	/* If we are under memory pressure, report severe overflow. */
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
 	if (use_reserve)
 		overflow *= 3;
-	return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }
 
 static abd_t *
@@ -7326,7 +7328,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 #if defined(COMPAT_FREEBSD11)
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
-	    wmsum_value(&arc_sums.arcstat_dnode_size) +
+	    aggsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif
 
@@ -7368,7 +7370,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 	    &as->arcstat_uncached_evictable_metadata);
 
 	as->arcstat_dnode_size.value.ui64 =
-	    wmsum_value(&arc_sums.arcstat_dnode_size);
+	    aggsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
 	as->arcstat_l2_hits.value.ui64 =
@@ -7738,7 +7740,7 @@ arc_state_init(void)
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
+	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7897,7 +7899,7 @@ arc_state_fini(void)
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
-	wmsum_fini(&arc_sums.arcstat_dnode_size);
+	aggsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);

From cecff09faa9650d91b3093af715edaf4533fb0f3 Mon Sep 17 00:00:00 2001
From: shodanshok <g.danti@assyoma.it>
Date: Wed, 23 Jul 2025 00:06:09 +0200
Subject: [PATCH 35/72] add uncompressed_size to arc_summary

Add uncompressed ARC size to statistics reported by arc_summary.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Gionatan Danti <g.danti@assyoma.it>
Closes #17556
---
 cmd/arc_summary | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmd/arc_summary b/cmd/arc_summary
index c1319573220..e60c6b64e8a 100755
--- a/cmd/arc_summary
+++ b/cmd/arc_summary
@@ -559,6 +559,7 @@ def section_arc(kstats_dict):
     print()
 
     compressed_size = arc_stats['compressed_size']
+    uncompressed_size = arc_stats['uncompressed_size']
     overhead_size = arc_stats['overhead_size']
     bonus_size = arc_stats['bonus_size']
     dnode_size = arc_stats['dnode_size']
@@ -671,6 +672,8 @@ def section_arc(kstats_dict):
     print()
 
     print('ARC misc:')
+    prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size),
+           f_bytes(uncompressed_size))
     prt_i1('Memory throttles:', arc_stats['memory_throttle_count'])
     prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count'])
     prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count'])

From 96d20d7d59143ec481d7932cd1d731df0f094ab4 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 15 Jul 2025 22:43:42 +1000
Subject: [PATCH 36/72] linux/kmem: remove PF_FSTRANS and PF_MEMALLOC_NOIO
 compat

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 include/os/linux/spl/sys/kmem.h   | 34 +++++--------------------------
 include/sys/zfs_context.h         |  1 -
 lib/libzpool/kernel.c             |  6 ------
 module/os/linux/zfs/zfs_file_os.c | 23 ---------------------
 4 files changed, 5 insertions(+), 59 deletions(-)

diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h
index 995236117dd..4c5baa331d0 100644
--- a/include/os/linux/spl/sys/kmem.h
+++ b/include/os/linux/spl/sys/kmem.h
@@ -61,7 +61,7 @@ void *spl_kvmalloc(size_t size, gfp_t flags);
 /*
  * Convert a KM_* flags mask to its Linux GFP_* counterpart.  The conversion
  * function is context aware which means that KM_SLEEP allocations can be
- * safely used in syncing contexts which have set PF_FSTRANS.
+ * safely used in syncing contexts which have set SPL_FSTRANS.
  */
 static inline gfp_t
 kmem_flags_convert(int flags)
@@ -91,25 +91,11 @@ typedef struct {
 } fstrans_cookie_t;
 
 /*
- * Introduced in Linux 3.9, however this cannot be solely relied on before
- * Linux 3.18 as it doesn't turn off __GFP_FS as it should.
+ * SPL_FSTRANS is the set of flags that indicate that the task is in a
+ * filesystem or IO codepath, and so any allocation must not call back into
+ * those codepaths (eg to swap).
  */
-#ifdef PF_MEMALLOC_NOIO
-#define	__SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO)
-#else
-#define	__SPL_PF_MEMALLOC_NOIO (0)
-#endif
-
-/*
- * PF_FSTRANS is removed from Linux 4.12
- */
-#ifdef PF_FSTRANS
-#define	__SPL_PF_FSTRANS (PF_FSTRANS)
-#else
-#define	__SPL_PF_FSTRANS (0)
-#endif
-
-#define	SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO)
+#define	SPL_FSTRANS (PF_MEMALLOC_NOIO)
 
 static inline fstrans_cookie_t
 spl_fstrans_mark(void)
@@ -141,16 +127,6 @@ spl_fstrans_check(void)
 	return (current->flags & SPL_FSTRANS);
 }
 
-/*
- * specifically used to check PF_FSTRANS flag, cannot be relied on for
- * checking spl_fstrans_mark().
- */
-static inline int
-__spl_pf_fstrans_check(void)
-{
-	return (current->flags & __SPL_PF_FSTRANS);
-}
-
 /*
  * Kernel compatibility for GFP flags
  */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 31edab919f0..0f76c7adcf8 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -766,7 +766,6 @@ typedef int fstrans_cookie_t;
 
 extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
-extern int __spl_pf_fstrans_check(void);
 extern int kmem_cache_reap_active(void);
 
 
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index e397fc851cc..072332e41ca 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1024,12 +1024,6 @@ spl_fstrans_unmark(fstrans_cookie_t cookie)
 	(void) cookie;
 }
 
-int
-__spl_pf_fstrans_check(void)
-{
-	return (0);
-}
-
 int
 kmem_cache_reap_active(void)
 {
diff --git a/module/os/linux/zfs/zfs_file_os.c b/module/os/linux/zfs/zfs_file_os.c
index d193eb80dca..c729947369c 100644
--- a/module/os/linux/zfs/zfs_file_os.c
+++ b/module/os/linux/zfs/zfs_file_os.c
@@ -260,24 +260,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
 {
 	int datasync = 0;
 	int error;
-	int fstrans;
 
 	if (flags & O_DSYNC)
 		datasync = 1;
 
-	/*
-	 * May enter XFS which generates a warning when PF_FSTRANS is set.
-	 * To avoid this the flag is cleared over vfs_sync() and then reset.
-	 */
-	fstrans = __spl_pf_fstrans_check();
-	if (fstrans)
-		current->flags &= ~(__SPL_PF_FSTRANS);
-
 	error = -vfs_fsync(filp, datasync);
 
-	if (fstrans)
-		current->flags |= __SPL_PF_FSTRANS;
-
 	return (error);
 }
 
@@ -291,14 +279,6 @@ zfs_file_fsync(zfs_file_t *filp, int flags)
 int
 zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
 {
-	/*
-	 * May enter XFS which generates a warning when PF_FSTRANS is set.
-	 * To avoid this the flag is cleared over vfs_sync() and then reset.
-	 */
-	int fstrans = __spl_pf_fstrans_check();
-	if (fstrans)
-		current->flags &= ~(__SPL_PF_FSTRANS);
-
 	/*
 	 * When supported by the underlying file system preferentially
 	 * use the fallocate() callback to preallocate the space.
@@ -308,9 +288,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
 		error = -fp->f_op->fallocate(fp,
 		    FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
 
-	if (fstrans)
-		current->flags |= __SPL_PF_FSTRANS;
-
 	if (error)
 		return (SET_ERROR(error));
 

From 1c483cf3d0f5d6987677a17ce0a94d16d005bddb Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Wed, 16 Jul 2025 10:46:03 +1000
Subject: [PATCH 37/72] linux/kmem: remove long-obsolete __GFP compat flags

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 include/os/linux/spl/sys/kmem.h | 12 ------------
 module/os/linux/zfs/abd_os.c    |  4 ----
 2 files changed, 16 deletions(-)

diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h
index 4c5baa331d0..3d624f38b8c 100644
--- a/include/os/linux/spl/sys/kmem.h
+++ b/include/os/linux/spl/sys/kmem.h
@@ -127,18 +127,6 @@ spl_fstrans_check(void)
 	return (current->flags & SPL_FSTRANS);
 }
 
-/*
- * Kernel compatibility for GFP flags
- */
-/* < 4.13 */
-#ifndef __GFP_RETRY_MAYFAIL
-#define	__GFP_RETRY_MAYFAIL	__GFP_REPEAT
-#endif
-/* < 4.4 */
-#ifndef __GFP_RECLAIM
-#define	__GFP_RECLAIM		__GFP_WAIT
-#endif
-
 #ifdef HAVE_ATOMIC64_T
 #define	kmem_alloc_used_add(size)	atomic64_add(size, &kmem_alloc_used)
 #define	kmem_alloc_used_sub(size)	atomic64_sub(size, &kmem_alloc_used)
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index e1140b31a97..248c9b7a6d3 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page)
 
 #ifndef CONFIG_HIGHMEM
 
-#ifndef __GFP_RECLAIM
-#define	__GFP_RECLAIM		__GFP_WAIT
-#endif
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is

From 9292071565b96f0945d9f0e794b0cb51856f5cef Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 18 Jul 2025 15:18:33 +1000
Subject: [PATCH 38/72] linux/kmem: remove HAVE_ATOMIC64_T and kmem_alloc_used
 wrappers

Seems like we haven't set it since the SPL was pulled into the main ZFS
tree. In removing the define, I've taken the 64-bit version (i.e. the one
that _hasn't_ been running since back then) because it looks like it's
closer to the intended width by the way it's used.

Since the macros are no longer needed as a selector, pull those too.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 include/os/linux/spl/sys/kmem.h | 15 +--------------
 module/os/linux/spl/spl-kmem.c  | 22 +++++++++-------------
 module/os/linux/spl/spl-proc.c  | 10 +---------
 3 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h
index 3d624f38b8c..fe34de9c179 100644
--- a/include/os/linux/spl/sys/kmem.h
+++ b/include/os/linux/spl/sys/kmem.h
@@ -127,21 +127,8 @@ spl_fstrans_check(void)
 	return (current->flags & SPL_FSTRANS);
 }
 
-#ifdef HAVE_ATOMIC64_T
-#define	kmem_alloc_used_add(size)	atomic64_add(size, &kmem_alloc_used)
-#define	kmem_alloc_used_sub(size)	atomic64_sub(size, &kmem_alloc_used)
-#define	kmem_alloc_used_read()		atomic64_read(&kmem_alloc_used)
-#define	kmem_alloc_used_set(size)	atomic64_set(&kmem_alloc_used, size)
 extern atomic64_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-#else  /* HAVE_ATOMIC64_T */
-#define	kmem_alloc_used_add(size)	atomic_add(size, &kmem_alloc_used)
-#define	kmem_alloc_used_sub(size)	atomic_sub(size, &kmem_alloc_used)
-#define	kmem_alloc_used_read()		atomic_read(&kmem_alloc_used)
-#define	kmem_alloc_used_set(size)	atomic_set(&kmem_alloc_used, size)
-extern atomic_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-#endif /* HAVE_ATOMIC64_T */
+extern uint64_t kmem_alloc_max;
 
 extern unsigned int spl_kmem_alloc_warn;
 extern unsigned int spl_kmem_alloc_max;
diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c
index 337a4bcf76a..9fe008cef86 100644
--- a/module/os/linux/spl/spl-kmem.c
+++ b/module/os/linux/spl/spl-kmem.c
@@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size)
 #ifdef DEBUG_KMEM
 
 /* Shim layer memory accounting */
-#ifdef HAVE_ATOMIC64_T
 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#else  /* HAVE_ATOMIC64_T */
-atomic_t kmem_alloc_used = ATOMIC_INIT(0);
-unsigned long long kmem_alloc_max = 0;
-#endif /* HAVE_ATOMIC64_T */
+uint64_t kmem_alloc_max = 0;
 
 EXPORT_SYMBOL(kmem_alloc_used);
 EXPORT_SYMBOL(kmem_alloc_max);
@@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
 
 	ptr = spl_kmem_alloc_impl(size, flags, node);
 	if (ptr) {
-		kmem_alloc_used_add(size);
-		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
-			kmem_alloc_max = kmem_alloc_used_read();
+		atomic64_add(size, &kmem_alloc_used);
+		if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
+			kmem_alloc_max = atomic64_read(&kmem_alloc_used);
 	}
 
 	return (ptr);
@@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node)
 inline void
 spl_kmem_free_debug(const void *ptr, size_t size)
 {
-	kmem_alloc_used_sub(size);
+	atomic64_sub(size, &kmem_alloc_used);
 	spl_kmem_free_impl(ptr, size);
 }
 
@@ -595,7 +590,7 @@ spl_kmem_init(void)
 {
 
 #ifdef DEBUG_KMEM
-	kmem_alloc_used_set(0);
+	atomic64_set(&kmem_alloc_used, 0);
 
 
 
@@ -617,9 +612,10 @@ spl_kmem_fini(void)
 	 * at that address to aid in debugging.  Performance is not
 	 * a serious concern here since it is module unload time.
 	 */
-	if (kmem_alloc_used_read() != 0)
+	if (atomic64_read(&kmem_alloc_used) != 0)
 		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
-		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+		    (unsigned long)atomic64_read(&kmem_alloc_used),
+		    kmem_alloc_max);
 
 #ifdef DEBUG_KMEM_TRACKING
 	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c
index 4ed0deedd5b..8cdd5fc5cfe 100644
--- a/module/os/linux/spl/spl-proc.c
+++ b/module/os/linux/spl/spl-proc.c
@@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write,
 	if (write) {
 		*ppos += *lenp;
 	} else {
-#ifdef HAVE_ATOMIC64_T
 		val = atomic64_read((atomic64_t *)table->data);
-#else
-		val = atomic_read((atomic_t *)table->data);
-#endif /* HAVE_ATOMIC64_T */
 		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
 	}
 
@@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = {
 	{
 		.procname	= "kmem_used",
 		.data		= &kmem_alloc_used,
-#ifdef HAVE_ATOMIC64_T
 		.maxlen		= sizeof (atomic64_t),
-#else
-		.maxlen		= sizeof (atomic_t),
-#endif /* HAVE_ATOMIC64_T */
 		.mode		= 0444,
 		.proc_handler	= &proc_domemused,
 	},
 	{
 		.procname	= "kmem_max",
 		.data		= &kmem_alloc_max,
-		.maxlen		= sizeof (unsigned long),
+		.maxlen		= sizeof (uint64_t),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,

From 2755e2aa60f278282f1a56238d46554581f8ef0f Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 27 Jun 2025 12:54:47 +1000
Subject: [PATCH 39/72] spa_activity_check: narrow scope of MMP vars

They aren't used outside these very small blocks, and their initial
values are never used at all.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 module/zfs/spa.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 46794cc62e3..c0876c93540 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -3777,20 +3777,17 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
 	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
 	 */
 	if (error == EREMOTEIO) {
-		const char *hostname = "<unknown>";
-		uint64_t hostid = 0;
-
 		if (mmp_label) {
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
-				hostname = fnvlist_lookup_string(mmp_label,
-				    ZPOOL_CONFIG_HOSTNAME);
+				const char *hostname = fnvlist_lookup_string(
+				    mmp_label, ZPOOL_CONFIG_HOSTNAME);
 				fnvlist_add_string(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
 			}
 
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
-				hostid = fnvlist_lookup_uint64(mmp_label,
-				    ZPOOL_CONFIG_HOSTID);
+				uint64_t hostid = fnvlist_lookup_uint64(
+				    mmp_label, ZPOOL_CONFIG_HOSTID);
 				fnvlist_add_uint64(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
 			}

From e9d249d7e415fa84a378833e775ae0f01503ba99 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 27 Jun 2025 12:56:36 +1000
Subject: [PATCH 40/72] test/draid: fix error return

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 tests/zfs-tests/cmd/draid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/cmd/draid.c b/tests/zfs-tests/cmd/draid.c
index 8d0bdc450f8..2c1ab1b9bec 100644
--- a/tests/zfs-tests/cmd/draid.c
+++ b/tests/zfs-tests/cmd/draid.c
@@ -204,7 +204,7 @@ write_map(const char *filename, nvlist_t *allcfgs)
 		error = errno;
 		free(buf);
 		free(tmpname);
-		return (errno);
+		return (error);
 	}
 
 	ssize_t rc, bytes = 0;

From d2b9e66b8805b992012911141c562b8c8d117183 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 27 Jun 2025 12:57:09 +1000
Subject: [PATCH 41/72] vdev_raidz: asize/psize: remove unnecessary var
 initialisation

It would have been optimised away anyway so it doesn't matter, but it
does make things a little tougher to read.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 module/zfs/vdev_raidz.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 7a6a01603da..ecb6c7f50b4 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2249,10 +2249,9 @@ vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t psize;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
-	uint64_t cols = vdrz->vd_original_width;
 	uint64_t nparity = vdrz->vd_nparity;
 
-	cols = vdev_raidz_get_logical_width(vdrz, txg);
+	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
 
 	ASSERT0(asize % (1 << ashift));
 
@@ -2285,10 +2284,9 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t asize;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
-	uint64_t cols = vdrz->vd_original_width;
 	uint64_t nparity = vdrz->vd_nparity;
 
-	cols = vdev_raidz_get_logical_width(vdrz, txg);
+	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
 
 	asize = ((psize - 1) >> ashift) + 1;
 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));

From bf38c15071ea036d4e1c60648c29fdd7c9863121 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 27 Jun 2025 12:56:03 +1000
Subject: [PATCH 42/72] everywhere: misc unnecessary var init/update

These are all cases where we initialise or update a variable, and then
never use it. None of them particularly matter, as the compiler should
optimise them all away during dead store elimination, but some static
analysers complain about them and they are extra work for casual readers
to follow, so worth removing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #17551
---
 cmd/zdb/zdb.c                    |  8 +++-----
 cmd/zed/zed_event.c              |  2 +-
 cmd/zfs/zfs_main.c               | 10 ++++------
 cmd/zpool/zpool_iter.c           |  5 +++--
 cmd/zpool/zpool_main.c           |  9 +++------
 cmd/zpool/zpool_vdev.c           |  5 ++---
 cmd/ztest.c                      |  2 +-
 lib/libspl/include/umem.h        |  2 +-
 lib/libzfs/libzfs_crypto.c       |  2 +-
 lib/libzfs/libzfs_pool.c         |  4 ++--
 lib/libzpool/kernel.c            |  4 ++--
 module/zfs/zap.c                 |  2 +-
 module/zfs/zio.c                 |  2 +-
 module/zstd/zfs_zstd.c           |  3 +--
 tests/zfs-tests/cmd/idmap_util.c |  2 +-
 tests/zfs-tests/cmd/mktree.c     |  2 +-
 tests/zfs-tests/cmd/mmapwrite.c  |  4 ++--
 17 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 2dcf6404c09..6a48658991b 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1991,7 +1991,7 @@ dump_ddt_log(ddt_t *ddt)
 				c += strlcpy(&flagstr[c], " UNKNOWN",
 				    sizeof (flagstr) - c);
 			flagstr[1] = '[';
-			flagstr[c++] = ']';
+			flagstr[c] = ']';
 		}
 
 		uint64_t count = avl_numnodes(&ddl->ddl_tree);
@@ -8800,7 +8800,6 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
 	(void) buf;
 	uint64_t orig_lsize = lsize;
 	boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));
-	boolean_t found = B_FALSE;
 	/*
 	 * We don't know how the data was compressed, so just try
 	 * every decompress function at every inflated blocksize.
@@ -8843,20 +8842,19 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
 		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
 			if (try_decompress_block(pabd, lsize, psize, flags,
 			    *cfuncp, lbuf, lbuf2)) {
-				found = B_TRUE;
+				tryzle = B_FALSE;
 				break;
 			}
 		}
 		if (*cfuncp != 0)
 			break;
 	}
-	if (!found && tryzle) {
+	if (tryzle) {
 		for (lsize = orig_lsize; lsize <= maxlsize;
 		    lsize += SPA_MINBLOCKSIZE) {
 			if (try_decompress_block(pabd, lsize, psize, flags,
 			    ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {
 				*cfuncp = ZIO_COMPRESS_ZLE;
-				found = B_TRUE;
 				break;
 			}
 		}
diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c
index 296c222ca38..ba7cba304b1 100644
--- a/cmd/zed/zed_event.c
+++ b/cmd/zed/zed_event.c
@@ -110,7 +110,7 @@ zed_event_fini(struct zed_conf *zcp)
 static void
 _bump_event_queue_length(void)
 {
-	int zzlm = -1, wr;
+	int zzlm, wr;
 	char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */
 	long int qlen, orig_qlen;
 
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 842e5d088d4..81727224b04 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -1974,9 +1974,8 @@ fill_dataset_info(nvlist_t *list, zfs_handle_t *zhp, boolean_t as_int)
 	}
 
 	if (type == ZFS_TYPE_SNAPSHOT) {
-		char *ds, *snap;
-		ds = snap = strdup(zfs_get_name(zhp));
-		ds = strsep(&snap, "@");
+		char *snap = strdup(zfs_get_name(zhp));
+		char *ds = strsep(&snap, "@");
 		fnvlist_add_string(list, "dataset", ds);
 		fnvlist_add_string(list, "snapshot_name", snap);
 		free(ds);
@@ -2019,8 +2018,7 @@ get_callback(zfs_handle_t *zhp, void *data)
 	nvlist_t *user_props = zfs_get_user_props(zhp);
 	zprop_list_t *pl = cbp->cb_proplist;
 	nvlist_t *propval;
-	nvlist_t *item, *d, *props;
-	item = d = props = NULL;
+	nvlist_t *item, *d = NULL, *props = NULL;
 	const char *strval;
 	const char *sourceval;
 	boolean_t received = is_recvd_column(cbp);
@@ -5879,7 +5877,7 @@ parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
 static inline const char *
 deleg_perm_comment(zfs_deleg_note_t note)
 {
-	const char *str = "";
+	const char *str;
 
 	/* subcommands */
 	switch (note) {
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
index 2ec189b9865..2eec9a95e24 100644
--- a/cmd/zpool/zpool_iter.c
+++ b/cmd/zpool/zpool_iter.c
@@ -379,8 +379,8 @@ process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl)
 static int
 vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)
 {
-	char *col = NULL;
-	char *val = line;
+	char *col;
+	char *val;
 	char *equals;
 	char **tmp;
 
@@ -397,6 +397,7 @@ vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)
 		col = line;
 		val = equals + 1;
 	} else {
+		col = NULL;
 		val = line;
 	}
 
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index f7cd73085f0..b0c060aa5da 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -1788,7 +1788,7 @@ zpool_do_labelclear(int argc, char **argv)
 {
 	char vdev[MAXPATHLEN];
 	char *name = NULL;
-	int c, fd = -1, ret = 0;
+	int c, fd, ret = 0;
 	nvlist_t *config;
 	pool_state_t state;
 	boolean_t inuse = B_FALSE;
@@ -6157,7 +6157,6 @@ static void
 get_interval_count_filter_guids(int *argc, char **argv, float *interval,
     unsigned long *count, iostat_cbdata_t *cb)
 {
-	char **tmpargv = argv;
 	int argc_for_interval = 0;
 
 	/* Is the last arg an interval value?  Or a guid? */
@@ -6181,7 +6180,7 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval,
 	}
 
 	/* Point to our list of possible intervals */
-	tmpargv = &argv[*argc - argc_for_interval];
+	char **tmpargv = &argv[*argc - argc_for_interval];
 
 	*argc = *argc - argc_for_interval;
 	get_interval_count(&argc_for_interval, tmpargv,
@@ -6377,7 +6376,6 @@ zpool_do_iostat(int argc, char **argv)
 	int npools;
 	float interval = 0;
 	unsigned long count = 0;
-	int winheight = 24;
 	zpool_list_t *list;
 	boolean_t verbose = B_FALSE;
 	boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE;
@@ -6673,7 +6671,7 @@ zpool_do_iostat(int argc, char **argv)
 			 * even when terminal window has its height
 			 * changed.
 			 */
-			winheight = terminal_height();
+			int winheight = terminal_height();
 			/*
 			 * Are we connected to TTY? If not, headers_once
 			 * should be true, to avoid breaking scripts.
@@ -10706,7 +10704,6 @@ status_callback_json(zpool_handle_t *zhp, void *data)
 	uint_t c;
 	vdev_stat_t *vs;
 	nvlist_t *item, *d, *load_info, *vds;
-	item = d = NULL;
 
 	/* If dedup stats were requested, also fetch dedupcached. */
 	if (cbp->cb_dedup_stats > 1)
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 9d7a9b74bb8..684b46a2d67 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -574,7 +574,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
 				nvlist_t *cnv = child[c];
 				const char *path;
 				struct stat64 statbuf;
-				int64_t size = -1LL;
 				const char *childtype;
 				int fd, err;
 
@@ -656,7 +655,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
 				    statbuf.st_size == MAXOFFSET_T)
 					continue;
 
-				size = statbuf.st_size;
+				int64_t size = statbuf.st_size;
 
 				/*
 				 * Also make sure that devices and
@@ -1365,7 +1364,7 @@ is_grouping(const char *type, int *mindev, int *maxdev)
 static int
 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
 {
-	uint64_t nparity = 1;
+	uint64_t nparity;
 	uint64_t nspares = 0;
 	uint64_t ndata = UINT64_MAX;
 	uint64_t ngroups = 1;
diff --git a/cmd/ztest.c b/cmd/ztest.c
index ec1efd638f1..2e88ae3e799 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -919,7 +919,7 @@ ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
 {
 	char name[32];
 	char *value;
-	int state = ZTEST_VDEV_CLASS_RND;
+	int state;
 
 	(void) strlcpy(name, input, sizeof (name));
 
diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h
index 6945aae9f3c..3e44610e4e2 100644
--- a/lib/libspl/include/umem.h
+++ b/lib/libspl/include/umem.h
@@ -102,7 +102,7 @@ static inline void *
 umem_alloc_aligned(size_t size, size_t align, int flags)
 {
 	void *ptr = NULL;
-	int rc = EINVAL;
+	int rc;
 
 	do {
 		rc = posix_memalign(&ptr, align, size);
diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c
index 8907802ec25..b34a44c30eb 100644
--- a/lib/libzfs/libzfs_crypto.c
+++ b/lib/libzfs/libzfs_crypto.c
@@ -584,7 +584,7 @@ get_key_material_https(libzfs_handle_t *hdl, const char *uri,
 		goto end;
 	}
 
-	int kfd = -1;
+	int kfd;
 #ifdef O_TMPFILE
 	kfd = open(getenv("TMPDIR") ?: "/tmp",
 	    O_RDWR | O_TMPFILE | O_EXCL | O_CLOEXEC, 0600);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index dc2fb1a8c09..c19e51f0ff5 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -896,7 +896,7 @@ int
 zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
 {
 	zfs_cmd_t zc = {"\0"};
-	int ret = -1;
+	int ret;
 	char errbuf[ERRBUFLEN];
 	nvlist_t *nvl = NULL;
 	nvlist_t *realprops;
@@ -4310,7 +4310,7 @@ zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	nvlist_t *nvl = NULL;
 	zfs_cmd_t zc = {"\0"};
-	int error = -1;
+	int error;
 
 	if (guid != NULL) {
 		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 072332e41ca..48f6b0ca4e1 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1067,8 +1067,8 @@ zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
 int
 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
 {
-	int fd = -1;
-	int dump_fd = -1;
+	int fd;
+	int dump_fd;
 	int err;
 	int old_umask = 0;
 	zfs_file_t *fp;
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 9711c91d7e4..0896690c97e 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -1304,7 +1304,7 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 int
 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 {
-	int err = ENOENT;
+	int err;
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index c3d96c049d3..7e4caaa83ee 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4408,7 +4408,7 @@ int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
-	int error = 1;
+	int error;
 	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
index b42066fdb7c..950fc6f48bf 100644
--- a/module/zstd/zfs_zstd.c
+++ b/module/zstd/zfs_zstd.c
@@ -569,11 +569,10 @@ zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
 	size_t actual_abort_size = zstd_abort_size;
 	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
 	    s_len >= actual_abort_size) {
-		int pass_len = 1;
 		abd_t sabd, dabd;
 		abd_get_from_buf_struct(&sabd, s_start, s_len);
 		abd_get_from_buf_struct(&dabd, d_start, d_len);
-		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
+		int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
 		abd_free(&dabd);
 		abd_free(&sabd);
 		if (pass_len < d_len) {
diff --git a/tests/zfs-tests/cmd/idmap_util.c b/tests/zfs-tests/cmd/idmap_util.c
index 416e80714f9..f332677f520 100644
--- a/tests/zfs-tests/cmd/idmap_util.c
+++ b/tests/zfs-tests/cmd/idmap_util.c
@@ -301,7 +301,7 @@ static int
 write_idmap(pid_t pid, char *buf, size_t buf_size, idmap_type_t type)
 {
 	char path[PATH_MAX];
-	int fd = -EBADF;
+	int fd;
 	int ret;
 
 	(void) snprintf(path, sizeof (path), "/proc/%d/%cid_map",
diff --git a/tests/zfs-tests/cmd/mktree.c b/tests/zfs-tests/cmd/mktree.c
index 297cf6dea41..9a5253468bd 100644
--- a/tests/zfs-tests/cmd/mktree.c
+++ b/tests/zfs-tests/cmd/mktree.c
@@ -152,7 +152,7 @@ getfdname(char *pdir, char type, int level, int dir, int file)
 static void
 crtfile(char *pname)
 {
-	int fd = -1;
+	int fd;
 	int i, size;
 	const char *context = "0123456789ABCDF";
 	char *pbuf;
diff --git a/tests/zfs-tests/cmd/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite.c
index 61fcdc35af1..31d61ffb07d 100644
--- a/tests/zfs-tests/cmd/mmapwrite.c
+++ b/tests/zfs-tests/cmd/mmapwrite.c
@@ -59,7 +59,7 @@ static void *
 normal_writer(void *filename)
 {
 	char *file_path = filename;
-	int fd = -1;
+	int fd;
 	ssize_t write_num = 0;
 	int page_size = getpagesize();
 
@@ -93,7 +93,7 @@ normal_writer(void *filename)
 static void *
 map_writer(void *filename)
 {
-	int fd = -1;
+	int fd;
 	int ret = 0;
 	char *buf = NULL;
 	int page_size = getpagesize();

From 00ce064d8f9c2e1ab8154631add7d4006ea16f70 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Thu, 24 Jul 2025 23:50:23 +1000
Subject: [PATCH 43/72] spa: update blkptr diagram to include vdev padding on
 encrypted blocks

Probably just an oversight in 4d044c4c1d. SPA_VDEVBITS is always 24,
regardless of whether or not the bp is for an encrypted block, and it
wouldn't make sense for it to be different anyway.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #17564
---
 include/sys/spa.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index e5ec39b64dc..e0eed831d30 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -190,11 +190,11 @@ typedef struct zio_cksum_salt {
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|		vdev1		| pad   |	  ASIZE		|
+ * 0	|  pad  |	  vdev1         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|		vdev2		| pad   |	  ASIZE		|
+ * 2	|  pad  |	  vdev2         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+

From 5a9b9c7f87894d44c29fd1301bf576af35369051 Mon Sep 17 00:00:00 2001
From: Coleman Kane <ckane@colemankane.org>
Date: Thu, 24 Jul 2025 18:38:58 -0400
Subject: [PATCH 44/72] linux: Fix out-of-src builds

The Linux kernel modules haven't been building successfully when the
build occurs in a different directory from the source code, which is a
common build pattern in Linux. The root cause was not determined,
but the %.o targets in subdirectories are no longer being matched by the
pattern targets in the Linux Kbuild system. This change fixes the issue
by dynamically creating the missing ones inside our Kbuild.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Coleman Kane <ckane@colemankane.org>
Closes #17517
---
 module/Kbuild.in | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/module/Kbuild.in b/module/Kbuild.in
index 667f061c6e1..ece603fee73 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -494,3 +494,34 @@ UBSAN_SANITIZE_zfs/sa.o := n
 ifeq ($(CONFIG_ALTIVEC),y)
 $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec
 endif
+
+# The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the
+# subdir %.c/%.S -> %.o targets will work as expected. The in-kernel pattern targets do not seem to
+# be working on subdirs since about ~6.10
+zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs))                                             \
+  $(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64)))  \
+  $(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC)))
+
+z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \
+  $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/)                                          \
+  $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/)                                             \
+  $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
+z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/)     \
+  $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/)                                          \
+  $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/)                                             \
+  $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
+
+define ZKMOD_C_O_MAKE_TARGET
+$1%.o: $(src)/$1%.c FORCE
+	$$(call if_changed_rule,cc_o_c)
+	$$(call cmd,force_checksrc)
+endef
+
+define ZKMOD_S_O_MAKE_TARGET
+$1%.o: $(src)/$1%.S FORCE
+	$$(call if_changed_rule,as_o_S)
+	$$(call cmd,force_checksrc)
+endef
+
+$(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target))))
+$(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target))))

From a8646a81865be305b08305d5ab354c49ce8780ec Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Fri, 25 Jul 2025 03:47:46 +0500
Subject: [PATCH 45/72] ZED: Fix device type detection and pool iteration logic

During hotplug REMOVED events, devid matching fails for partition-based
spares because devid information is not stored in pool config for
partitioned devices. However, when devid is populated by the hotplug
event, the original code skipped the search logic entirely, skipping
vdev_guid matching and resulting in wrong device type detection that
caused spares to be incorrectly identified as l2arc devices.
Additionally, fix zfs_agent_iter_pool() to use the return value from
zfs_agent_iter_vdev(), which was previously ignored, instead of
relying on search parameters. Also add pool_guid optimization to enable
targeted pool searching when pool_guid is available.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17545
---
 cmd/zed/agents/zfs_agents.c | 67 ++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c
index 8718dbde03b..c0590edc751 100644
--- a/cmd/zed/agents/zfs_agents.c
+++ b/cmd/zed/agents/zfs_agents.c
@@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
 	 * of blkid cache and L2ARC VDEV does not contain pool guid in its
 	 * blkid, so this is a special case for L2ARC VDEV.
 	 */
-	else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
+	else if (gsp->gs_vdev_guid != 0 &&
 	    nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
 	    gsp->gs_vdev_guid == vdev_guid) {
-		(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
-		    &gsp->gs_devid);
+		if (gsp->gs_devid == NULL) {
+			(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
+			    &gsp->gs_devid);
+		}
 		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
 		    &gsp->gs_vdev_expandtime);
 		return (B_TRUE);
@@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
 	/*
 	 * For each vdev in this pool, look for a match by devid
 	 */
-	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
-		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-		    &nvl) == 0) {
-			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
-		}
-	}
-	/*
-	 * if a match was found then grab the pool guid
-	 */
-	if (gsp->gs_vdev_guid && gsp->gs_devid) {
-		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
-		    &gsp->gs_pool_guid);
-	}
+	boolean_t found = B_FALSE;
+	uint64_t pool_guid;
 
+	/* Get pool configuration and extract pool GUID */
+	if ((config = zpool_get_config(zhp, NULL)) == NULL ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0)
+		goto out;
+
+	/* Skip this pool if we're looking for a specific pool */
+	if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid)
+		goto out;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0)
+		found = zfs_agent_iter_vdev(zhp, nvl, gsp);
+
+	if (found && gsp->gs_pool_guid == 0)
+		gsp->gs_pool_guid = pool_guid;
+
+out:
 	zpool_close(zhp);
-	return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0);
+	return (found);
 }
 
 void
@@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
 		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
 		 * ZFS_EV_POOL_GUID may be missing so find them.
 		 */
-		if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
-			if (devid == NULL)
-				search.gs_vdev_guid = vdev_guid;
-			else
-				search.gs_devid = devid;
-			zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
-			if (devid == NULL)
-				devid = search.gs_devid;
-			if (pool_guid == 0)
-				pool_guid = search.gs_pool_guid;
-			if (vdev_guid == 0)
-				vdev_guid = search.gs_vdev_guid;
-			devtype = search.gs_vdev_type;
-		}
+		search.gs_devid = devid;
+		search.gs_vdev_guid = vdev_guid;
+		search.gs_pool_guid = pool_guid;
+		zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+		if (devid == NULL)
+			devid = search.gs_devid;
+		if (pool_guid == 0)
+			pool_guid = search.gs_pool_guid;
+		if (vdev_guid == 0)
+			vdev_guid = search.gs_vdev_guid;
+		devtype = search.gs_vdev_type;
 
 		/*
 		 * We want to avoid reporting "remove" events coming from

From 4bd7a2eaa587cf2e408af020dddcfdc79f8d3431 Mon Sep 17 00:00:00 2001
From: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Date: Fri, 25 Jul 2025 02:24:15 +0100
Subject: [PATCH 46/72] zdb: fix checksum calculation for decompressed blocks

Currently, when reading compressed blocks with -R and decompressing
them with :d option and specifying lsize, which is normally bigger
than psize for compressed blocks, the checksum is calculated on
decompressed data. But it makes no sense since zfs always calculates
checksum on physical, i.e. compressed data. So reading the same block
produces different checksum results depending on how we read it,
whether we decompress it or not, which, again, makes no sense.

Fix: use psize instead of lsize when calculating the checksum so that
it is always calculated on the physical block size, regardless of
whether the block was compressed or not.

Signed-off-by: Andriy Tkachuk <andriy.tkachuk@seagate.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17547
---
 cmd/zdb/zdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 6a48658991b..06b28670462 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -9138,7 +9138,7 @@ zdb_read_block(char *thing, spa_t *spa)
 				ck_zio->io_offset =
 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
 				ck_zio->io_bp = bp;
-				zio_checksum_compute(ck_zio, ck, pabd, lsize);
+				zio_checksum_compute(ck_zio, ck, pabd, psize);
 				printf(
 				    "%12s\t"
 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",

From cf146460c119d8a7b0286de370341d7054f7171c Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 25 Jul 2025 07:42:23 -0700
Subject: [PATCH 47/72] Default to zfs_bclone_wait_dirty=1

Update the default FICLONE and FICLONERANGE ioctl behavior to wait
on dirty blocks.  While this does remove some control from the
application, in practice ZFS is better positioned to do the optimal
thing and immediately force a TXG sync.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17455
---
 man/man4/zfs.4                                | 15 ++++++-------
 module/zfs/zfs_vnops.c                        | 13 ++++++------
 ...loning_copyfilerange_fallback_same_txg.ksh | 21 +++++++++++++++++++
 .../functional/cp_files/cp_files_002_pos.ksh  |  4 +++-
 4 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index fba91d1e28b..e00b1848b41 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1399,14 +1399,15 @@ If this setting is 0, then even if feature@block_cloning is enabled,
 using functions and system calls that attempt to clone blocks will act as
 though the feature is disabled.
 .
-.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
-When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
-written to disk.
-This allows the clone operation to reliably succeed when a file is
+.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int
+When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
+data to be written to disk before proceeding.
+This ensures that the clone operation reliably succeeds, even if a file is
 modified and then immediately cloned.
-For small files this may be slower than making a copy of the file.
-Therefore, this setting defaults to 0 which causes a clone operation to
-immediately fail when encountering a dirty block.
+Note that for small files this may be slower than simply copying the file.
+When set to 0 the clone operation will immediately fail if it encounters
+any dirty blocks.
+By default waiting is enabled.
 .
 .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
 Select a BLAKE3 implementation.
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 656ca4dc22f..dfffcc4a404 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -67,13 +67,14 @@
 int zfs_bclone_enabled = 1;
 
 /*
- * When set zfs_clone_range() waits for dirty data to be written to disk.
- * This allows the clone operation to reliably succeed when a file is modified
- * and then immediately cloned. For small files this may be slower than making
- * a copy of the file and is therefore not the default.  However, in certain
- * scenarios this behavior may be desirable so a tunable is provided.
+ * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
+ * data to be written to disk before proceeding. This ensures that the clone
+ * operation reliably succeeds, even if a file is modified and then immediately
+ * cloned. Note that for small files this may be slower than simply copying
+ * the file. When set to 0 the clone operation will immediately fail if it
+ * encounters any dirty blocks. By default waiting is enabled.
  */
-int zfs_bclone_wait_dirty = 0;
+int zfs_bclone_wait_dirty = 1;
 
 /*
  * Enable Direct I/O. If this setting is 0, then all I/O requests will be
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh
index 54ffdc75669..4cede26b913 100755
--- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh
@@ -41,16 +41,22 @@ function cleanup
 {
 	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
 	set_tunable64 TXG_TIMEOUT $timeout
+	log_must restore_tunable BCLONE_WAIT_DIRTY
 }
 
 log_onexit cleanup
 
+log_must save_tunable BCLONE_WAIT_DIRTY
+
 log_must set_tunable64 TXG_TIMEOUT 5000
 
 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
 
 log_must sync_pool $TESTPOOL true
 
+# Verify fallback to copy when there are dirty blocks
+log_must set_tunable32 BCLONE_WAIT_DIRTY 0
+
 log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
 
@@ -61,5 +67,20 @@ log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
 typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
 log_must [ "$blocks" = "" ]
 
+log_must rm /$TESTPOOL/file /$TESTPOOL/clone
+
+# Verify blocks are cloned even when there are dirty blocks
+log_must set_tunable32 BCLONE_WAIT_DIRTY 1
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
+log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
+
+log_must sync_pool $TESTPOOL
+
+log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
+
+typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
+log_must [ "$blocks" = "0 1 2 3" ]
+
 log_pass $claim
 
diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
index 8f3e6d12e53..449dedacb30 100755
--- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
@@ -56,7 +56,7 @@ function cleanup
 {
 	datasetexists $TESTPOOL/cp-reflink && \
 	    destroy_dataset $$TESTPOOL/cp-reflink -f
-	log_must set_tunable32 BCLONE_WAIT_DIRTY 0
+	log_must restore_tunable BCLONE_WAIT_DIRTY
 }
 
 function verify_copy
@@ -81,6 +81,8 @@ SRC_SIZE=$((1024 + $RANDOM % 1024))
 # A smaller recordsize is used merely to speed up the test.
 RECORDSIZE=4096
 
+log_must save_tunable BCLONE_WAIT_DIRTY
+
 log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
 CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)
 

From f23e040a37e8db773ae4293486d13a8e0831d294 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 25 Jul 2025 15:47:21 -0700
Subject: [PATCH 48/72] CI: Remove Debian backports

The latest Debian 11 image includes bullseye-backports as a default
repository in the /etc/apt/sources.list.  However, this repository
has gone end of life which effectively breaks the default install.

We shouldn't need anything in backports so let's unconditionally
remove backports on all Debian builders to resolve the issue.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17569
---
 .github/workflows/scripts/qemu-3-deps-vm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh
index 904fbfbf1e1..c41ecd09d52 100755
--- a/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -28,6 +28,7 @@ function debian() {
   export DEBIAN_FRONTEND="noninteractive"
 
   echo "##[group]Running apt-get update+upgrade"
+  sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list
   sudo apt-get update -y
   sudo apt-get upgrade -y
   echo "##[endgroup]"

From fc885f308f52f1e51d815b8c4b9422b1075cdee0 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Tue, 29 Jul 2025 14:28:01 -0700
Subject: [PATCH 49/72] Don't use wrong weight when passivating group

When we're passivating a metaslab group we start by passivating the
metaslabs that have been activated for each of the allocators.  To do
that, we need to provide a weight. However, currently this erroneously
always uses a segment-based weight, even if segment-based weighting is
disabled.

Use the normal weight function, which will decide which type of weight
to use.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #17566
---
 module/zfs/metaslab.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 2f91f2bb364..69484d404ee 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1184,14 +1184,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 	}

From b6e8db509d6c3bbd0950e64d78ead16e1497d963 Mon Sep 17 00:00:00 2001
From: Akash B <akash-b@hpe.com>
Date: Wed, 30 Jul 2025 03:20:44 +0530
Subject: [PATCH 50/72] zpool/zfs: Add '-a|--all' option to scrub, trim,
 initialize

Add support for the '-a | --all' option to perform trim,
scrub, and initialize operations on all pools.
Previously, specifying a pool name was mandatory for
these operations. With this enhancement, users can now
execute these operations across all pools at once,
without needing to manually iterate over each pool
from the command line.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Akash B <akash-b@hpe.com>
Closes #17524
---
 cmd/zpool/zpool_main.c                        | 185 ++++++++++--------
 include/libzfs.h                              |  13 ++
 lib/libuutil/libuutil.abi                     |  10 +-
 lib/libzfs/libzfs.abi                         |  29 ++-
 lib/libzfs/libzfs_pool.c                      |  77 ++++++++
 lib/libzfs_core/libzfs_core.abi               |  10 +-
 man/man8/zpool-initialize.8                   |   7 +-
 man/man8/zpool-scrub.8                        |   9 +-
 man/man8/zpool-trim.8                         |   7 +-
 tests/runfiles/common.run                     |   6 +-
 tests/zfs-tests/tests/Makefile.am             |   3 +
 .../zpool_initialize_multiple_pools.ksh       | 131 +++++++++++++
 .../zpool_scrub/zpool_scrub_001_neg.ksh       |   3 +-
 .../zpool_scrub_multiple_pools.ksh            | 128 ++++++++++++
 .../zpool_trim/zpool_trim_multiple_pools.ksh  | 123 ++++++++++++
 15 files changed, 637 insertions(+), 104 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index b0c060aa5da..23cc590cc30 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -34,7 +34,7 @@
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  * Copyright (c) 2021, 2023, Klara Inc.
- * Copyright [2021] Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP.
  */
 
 #include <assert.h>
@@ -510,16 +510,16 @@ get_usage(zpool_help_t idx)
 	case HELP_REOPEN:
 		return (gettext("\treopen [-n] <pool>\n"));
 	case HELP_INITIALIZE:
-		return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> "
-		    "[<device> ...]\n"));
+		return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> "
+		    "[<device> ...]>\n"));
 	case HELP_SCRUB:
-		return (gettext("\tscrub [-e | -s | -p | -C] [-w] "
-		    "<pool> ...\n"));
+		return (gettext("\tscrub [-e | -s | -p | -C] [-w] <-a | "
+		    "<pool> [<pool> ...]>\n"));
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_TRIM:
-		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> "
-		    "[<device> ...]\n"));
+		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] "
+		    "<-a | <pool> [<device> ...]>\n"));
 	case HELP_STATUS:
 		return (gettext("\tstatus [-DdegiLPpstvx] "
 		    "[-c script1[,script2,...]] ...\n"
@@ -560,33 +560,6 @@ get_usage(zpool_help_t idx)
 	}
 }
 
-static void
-zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
-{
-	uint_t children = 0;
-	nvlist_t **child;
-	uint_t i;
-
-	(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children);
-
-	if (children == 0) {
-		char *path = zpool_vdev_name(g_zfs, zhp, nvroot,
-		    VDEV_NAME_PATH);
-
-		if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 &&
-		    strcmp(path, VDEV_TYPE_HOLE) != 0)
-			fnvlist_add_boolean(res, path);
-
-		free(path);
-		return;
-	}
-
-	for (i = 0; i < children; i++) {
-		zpool_collect_leaves(zhp, child[i], res);
-	}
-}
-
 /*
  * Callback routine that will print out a pool property value.
  */
@@ -794,22 +767,26 @@ zpool_do_initialize(int argc, char **argv)
 	int c;
 	char *poolname;
 	zpool_handle_t *zhp;
-	nvlist_t *vdevs;
 	int err = 0;
 	boolean_t wait = B_FALSE;
+	boolean_t initialize_all = B_FALSE;
 
 	struct option long_options[] = {
 		{"cancel",	no_argument,		NULL, 'c'},
 		{"suspend",	no_argument,		NULL, 's'},
 		{"uninit",	no_argument,		NULL, 'u'},
 		{"wait",	no_argument,		NULL, 'w'},
+		{"all", 	no_argument,		NULL, 'a'},
 		{0, 0, 0, 0}
 	};
 
 	pool_initialize_func_t cmd_type = POOL_INITIALIZE_START;
-	while ((c = getopt_long(argc, argv, "csuw", long_options,
+	while ((c = getopt_long(argc, argv, "acsuw", long_options,
 	    NULL)) != -1) {
 		switch (c) {
+		case 'a':
+			initialize_all = B_TRUE;
+			break;
 		case 'c':
 			if (cmd_type != POOL_INITIALIZE_START &&
 			    cmd_type != POOL_INITIALIZE_CANCEL) {
@@ -856,7 +833,18 @@ zpool_do_initialize(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (argc < 1) {
+	initialize_cbdata_t cbdata = {
+		.wait = wait,
+		.cmd_type = cmd_type
+	};
+
+	if (initialize_all && argc > 0) {
+		(void) fprintf(stderr, gettext("-a cannot be combined with "
+		    "individual pools or vdevs\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 1 && !initialize_all) {
 		(void) fprintf(stderr, gettext("missing pool name argument\n"));
 		usage(B_FALSE);
 		return (-1);
@@ -868,30 +856,35 @@ zpool_do_initialize(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
-	poolname = argv[0];
-	zhp = zpool_open(g_zfs, poolname);
-	if (zhp == NULL)
-		return (-1);
-
-	vdevs = fnvlist_alloc();
-	if (argc == 1) {
-		/* no individual leaf vdevs specified, so add them all */
-		nvlist_t *config = zpool_get_config(zhp, NULL);
-		nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
-		    ZPOOL_CONFIG_VDEV_TREE);
-		zpool_collect_leaves(zhp, nvroot, vdevs);
+	if (argc == 0 && initialize_all) {
+		/* Initialize each pool recursively */
+		err = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL,
+		    B_FALSE, zpool_initialize_one, &cbdata);
+		return (err);
+	} else if (argc == 1) {
+		/* no individual leaf vdevs specified, initialize the pool */
+		poolname = argv[0];
+		zhp = zpool_open(g_zfs, poolname);
+		if (zhp == NULL)
+			return (-1);
+		err = zpool_initialize_one(zhp, &cbdata);
 	} else {
+		/* individual leaf vdevs specified, initialize them */
+		poolname = argv[0];
+		zhp = zpool_open(g_zfs, poolname);
+		if (zhp == NULL)
+			return (-1);
+		nvlist_t *vdevs = fnvlist_alloc();
 		for (int i = 1; i < argc; i++) {
 			fnvlist_add_boolean(vdevs, argv[i]);
 		}
+		if (wait)
+			err = zpool_initialize_wait(zhp, cmd_type, vdevs);
+		else
+			err = zpool_initialize(zhp, cmd_type, vdevs);
+		fnvlist_free(vdevs);
 	}
 
-	if (wait)
-		err = zpool_initialize_wait(zhp, cmd_type, vdevs);
-	else
-		err = zpool_initialize(zhp, cmd_type, vdevs);
-
-	fnvlist_free(vdevs);
 	zpool_close(zhp);
 
 	return (err);
@@ -8452,10 +8445,14 @@ zpool_do_scrub(int argc, char **argv)
 	boolean_t is_pause = B_FALSE;
 	boolean_t is_stop = B_FALSE;
 	boolean_t is_txg_continue = B_FALSE;
+	boolean_t scrub_all = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "spweC")) != -1) {
+	while ((c = getopt(argc, argv, "aspweC")) != -1) {
 		switch (c) {
+		case 'a':
+			scrub_all = B_TRUE;
+			break;
 		case 'e':
 			is_error_scrub = B_TRUE;
 			break;
@@ -8519,7 +8516,7 @@ zpool_do_scrub(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (argc < 1) {
+	if (argc < 1 && !scrub_all) {
 		(void) fprintf(stderr, gettext("missing pool name argument\n"));
 		usage(B_FALSE);
 	}
@@ -8591,6 +8588,7 @@ zpool_do_trim(int argc, char **argv)
 		{"rate",	required_argument,	NULL,	'r'},
 		{"suspend",	no_argument,		NULL,	's'},
 		{"wait",	no_argument,		NULL,	'w'},
+		{"all",		no_argument,		NULL,	'a'},
 		{0, 0, 0, 0}
 	};
 
@@ -8598,11 +8596,16 @@ zpool_do_trim(int argc, char **argv)
 	uint64_t rate = 0;
 	boolean_t secure = B_FALSE;
 	boolean_t wait = B_FALSE;
+	boolean_t trimall = B_FALSE;
+	int error;
 
 	int c;
-	while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL))
+	while ((c = getopt_long(argc, argv, "acdr:sw", long_options, NULL))
 	    != -1) {
 		switch (c) {
+		case 'a':
+			trimall = B_TRUE;
+			break;
 		case 'c':
 			if (cmd_type != POOL_TRIM_START &&
 			    cmd_type != POOL_TRIM_CANCEL) {
@@ -8661,7 +8664,18 @@ zpool_do_trim(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (argc < 1) {
+	trimflags_t trim_flags = {
+		.secure = secure,
+		.rate = rate,
+		.wait = wait,
+	};
+
+	trim_cbdata_t cbdata = {
+		.trim_flags = trim_flags,
+		.cmd_type = cmd_type
+	};
+
+	if (argc < 1 && !trimall) {
 		(void) fprintf(stderr, gettext("missing pool name argument\n"));
 		usage(B_FALSE);
 		return (-1);
@@ -8669,41 +8683,46 @@ zpool_do_trim(int argc, char **argv)
 
 	if (wait && (cmd_type != POOL_TRIM_START)) {
 		(void) fprintf(stderr, gettext("-w cannot be used with -c or "
-		    "-s\n"));
+		    "-s options\n"));
 		usage(B_FALSE);
 	}
 
-	char *poolname = argv[0];
-	zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
-	if (zhp == NULL)
-		return (-1);
+	if (trimall && argc > 0) {
+		(void) fprintf(stderr, gettext("-a cannot be combined with "
+		    "individual zpools or vdevs\n"));
+		usage(B_FALSE);
+	}
 
-	trimflags_t trim_flags = {
-		.secure = secure,
-		.rate = rate,
-		.wait = wait,
-	};
-
-	nvlist_t *vdevs = fnvlist_alloc();
-	if (argc == 1) {
+	if (argc == 0 && trimall) {
+		cbdata.trim_flags.fullpool = B_TRUE;
+		/* Trim each pool recursively */
+		error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL,
+		    B_FALSE, zpool_trim_one, &cbdata);
+	} else if (argc == 1) {
+		char *poolname = argv[0];
+		zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
+		if (zhp == NULL)
+			return (-1);
 		/* no individual leaf vdevs specified, so add them all */
-		nvlist_t *config = zpool_get_config(zhp, NULL);
-		nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
-		    ZPOOL_CONFIG_VDEV_TREE);
-		zpool_collect_leaves(zhp, nvroot, vdevs);
-		trim_flags.fullpool = B_TRUE;
+		error = zpool_trim_one(zhp, &cbdata);
+		zpool_close(zhp);
 	} else {
-		trim_flags.fullpool = B_FALSE;
+		char *poolname = argv[0];
+		zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
+		if (zhp == NULL)
+			return (-1);
+		/* leaf vdevs specified, trim only those */
+		cbdata.trim_flags.fullpool = B_FALSE;
+		nvlist_t *vdevs = fnvlist_alloc();
 		for (int i = 1; i < argc; i++) {
 			fnvlist_add_boolean(vdevs, argv[i]);
 		}
+		error = zpool_trim(zhp, cbdata.cmd_type, vdevs,
+		    &cbdata.trim_flags);
+		fnvlist_free(vdevs);
+		zpool_close(zhp);
 	}
 
-	int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags);
-
-	fnvlist_free(vdevs);
-	zpool_close(zhp);
-
 	return (error);
 }
 
diff --git a/include/libzfs.h b/include/libzfs.h
index 485af793862..187d7b44936 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -30,6 +30,7 @@
  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
  */
 
 #ifndef	_LIBZFS_H
@@ -288,10 +289,20 @@ typedef struct trimflags {
 	uint64_t rate;
 } trimflags_t;
 
+typedef struct trim_cbdata {
+    trimflags_t trim_flags;
+    pool_trim_func_t cmd_type;
+} trim_cbdata_t;
+
+typedef struct initialize_cbdata {
+	boolean_t wait;
+	pool_initialize_func_t cmd_type;
+} initialize_cbdata_t;
 /*
  * Functions to manipulate pool and vdev state
  */
 _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
+_LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *);
 _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
     nvlist_t *);
 _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t,
@@ -304,7 +315,9 @@ _LIBZFS_H int zpool_reguid(zpool_handle_t *);
 _LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
 _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
+_LIBZFS_H void zpool_collect_leaves(zpool_handle_t *, nvlist_t *, nvlist_t *);
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
+_LIBZFS_H int zpool_trim_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
     uint64_t);
diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi
index 744b5312762..6c736c61e4a 100644
--- a/lib/libuutil/libuutil.abi
+++ b/lib/libuutil/libuutil.abi
@@ -1475,6 +1475,11 @@
       <parameter type-id='80f4b756' name='name'/>
       <return type-id='a27af98c'/>
     </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
     <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
       <parameter type-id='a27af98c' name='zt'/>
       <parameter type-id='80f4b756' name='val'/>
@@ -1486,11 +1491,6 @@
       <parameter type-id='b59d7dce' name='valsz'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
-      <parameter type-id='d8d5f4ab' name='cb'/>
-      <parameter type-id='eaa32e2f' name='arg'/>
-      <return type-id='48b5725f'/>
-    </function-decl>
     <function-type size-in-bits='64' id='92f86508'>
       <parameter type-id='a27af98c'/>
       <parameter type-id='eaa32e2f'/>
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 06e74387f4b..0c3e8106ca6 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -483,6 +483,7 @@
     <elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_collect_leaves' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -532,6 +533,7 @@
     <elf-symbol name='zpool_import_status' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_in_use' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_initialize' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_initialize_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_initialize_wait' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_is_draid_spare' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -581,6 +583,7 @@
     <elf-symbol name='zpool_state_to_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_sync_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_trim' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_trim_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_upgrade' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_vdev_attach' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_vdev_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1655,6 +1658,11 @@
       <parameter type-id='80f4b756' name='name'/>
       <return type-id='a27af98c'/>
     </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
     <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
       <parameter type-id='a27af98c' name='zt'/>
       <parameter type-id='80f4b756' name='val'/>
@@ -1666,11 +1674,6 @@
       <parameter type-id='b59d7dce' name='valsz'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
-      <parameter type-id='d8d5f4ab' name='cb'/>
-      <parameter type-id='eaa32e2f' name='arg'/>
-      <return type-id='48b5725f'/>
-    </function-decl>
     <function-type size-in-bits='64' id='92f86508'>
       <parameter type-id='a27af98c'/>
       <parameter type-id='eaa32e2f'/>
@@ -6901,6 +6904,11 @@
       <parameter type-id='95e97e5e' name='flags'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_initialize_one' mangled-name='zpool_initialize_one' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_initialize_one'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='eaa32e2f' name='data'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_initialize' mangled-name='zpool_initialize' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_initialize'>
       <parameter type-id='4c81de99' name='zhp'/>
       <parameter type-id='7063e1ab' name='cmd_type'/>
@@ -6913,6 +6921,17 @@
       <parameter type-id='5ce45b60' name='vds'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_collect_leaves' mangled-name='zpool_collect_leaves' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_collect_leaves'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='5ce45b60' name='nvroot'/>
+      <parameter type-id='5ce45b60' name='res'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
+    <function-decl name='zpool_trim_one' mangled-name='zpool_trim_one' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_trim_one'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='eaa32e2f' name='data'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_trim' mangled-name='zpool_trim' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_trim'>
       <parameter type-id='4c81de99' name='zhp'/>
       <parameter type-id='b1146b8d' name='cmd_type'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index c19e51f0ff5..b6fb153c496 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -31,6 +31,7 @@
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  * Copyright (c) 2021, 2023, Klara Inc.
+ * Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
  */
 
 #include <errno.h>
@@ -2436,6 +2437,30 @@ xlate_init_err(int err)
 	return (err);
 }
 
+int
+zpool_initialize_one(zpool_handle_t *zhp, void *data)
+{
+	int error;
+	libzfs_handle_t *hdl = zpool_get_handle(zhp);
+	const char *pool_name = zpool_get_name(zhp);
+	if (zpool_open_silent(hdl, pool_name, &zhp) != 0)
+		return (-1);
+	initialize_cbdata_t *cb = data;
+	nvlist_t *vdevs = fnvlist_alloc();
+
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+	nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE);
+	zpool_collect_leaves(zhp, nvroot, vdevs);
+	if (cb->wait)
+		error = zpool_initialize_wait(zhp, cb->cmd_type, vdevs);
+	else
+		error = zpool_initialize(zhp, cb->cmd_type, vdevs);
+	fnvlist_free(vdevs);
+
+	return (error);
+}
+
 /*
  * Begin, suspend, cancel, or uninit (clear) the initialization (initializing
  * of all free blocks) for the given vdevs in the given pool.
@@ -2556,6 +2581,58 @@ xlate_trim_err(int err)
 	return (err);
 }
 
+void
+zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
+{
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	uint_t children = 0;
+	nvlist_t **child;
+	uint_t i;
+
+	(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children);
+
+	if (children == 0) {
+		char *path = zpool_vdev_name(hdl, zhp, nvroot,
+		    VDEV_NAME_PATH);
+
+		if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 &&
+		    strcmp(path, VDEV_TYPE_HOLE) != 0)
+			fnvlist_add_boolean(res, path);
+
+		free(path);
+		return;
+	}
+
+	for (i = 0; i < children; i++) {
+		zpool_collect_leaves(zhp, child[i], res);
+	}
+}
+
+int
+zpool_trim_one(zpool_handle_t *zhp, void *data)
+{
+	int error;
+	libzfs_handle_t *hdl = zpool_get_handle(zhp);
+	const char *pool_name = zpool_get_name(zhp);
+	if (zpool_open_silent(hdl, pool_name, &zhp) != 0)
+		return (-1);
+
+	trim_cbdata_t *cb = data;
+	nvlist_t *vdevs = fnvlist_alloc();
+
+	/* no individual leaf vdevs specified, so add them all */
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+	nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE);
+
+	zpool_collect_leaves(zhp, nvroot, vdevs);
+	error = zpool_trim(zhp, cb->cmd_type, vdevs, &cb->trim_flags);
+	fnvlist_free(vdevs);
+
+	return (error);
+}
+
 static int
 zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids)
 {
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 2af20894853..7464b3adb25 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -1426,6 +1426,11 @@
       <parameter type-id='80f4b756' name='name'/>
       <return type-id='a27af98c'/>
     </function-decl>
+    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
+      <parameter type-id='d8d5f4ab' name='cb'/>
+      <parameter type-id='eaa32e2f' name='arg'/>
+      <return type-id='48b5725f'/>
+    </function-decl>
     <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'>
       <parameter type-id='a27af98c' name='zt'/>
       <parameter type-id='80f4b756' name='val'/>
@@ -1437,11 +1442,6 @@
       <parameter type-id='b59d7dce' name='valsz'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'>
-      <parameter type-id='d8d5f4ab' name='cb'/>
-      <parameter type-id='eaa32e2f' name='arg'/>
-      <return type-id='48b5725f'/>
-    </function-decl>
     <function-type size-in-bits='64' id='92f86508'>
       <parameter type-id='a27af98c'/>
       <parameter type-id='eaa32e2f'/>
diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8
index d7c9d22aba9..39579a58010 100644
--- a/man/man8/zpool-initialize.8
+++ b/man/man8/zpool-initialize.8
@@ -26,6 +26,7 @@
 .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
 .\"
 .Dd May 27, 2021
 .Dt ZPOOL-INITIALIZE 8
@@ -39,7 +40,7 @@
 .Cm initialize
 .Op Fl c Ns | Ns Fl s | Ns Fl u
 .Op Fl w
-.Ar pool
+.Fl a Ns | Ns Ar pool
 .Oo Ar device Oc Ns …
 .
 .Sh DESCRIPTION
@@ -48,6 +49,10 @@ devices, or all eligible devices in the pool if no individual devices are
 specified.
 Only leaf data or log devices may be initialized.
 .Bl -tag -width Ds
+.It Fl a , -all
+Begin, cancel, or suspend initializing on
+all
+pools.
 .It Fl c , -cancel
 Cancel initializing on the specified devices, or all eligible devices if none
 are specified.
diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8
index 21bd6735ede..9b4cf132c83 100644
--- a/man/man8/zpool-scrub.8
+++ b/man/man8/zpool-scrub.8
@@ -26,6 +26,7 @@
 .\" Copyright (c) 2018, 2021 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
 .\"
 .Dd November 18, 2024
 .Dt ZPOOL-SCRUB 8
@@ -39,7 +40,7 @@
 .Cm scrub
 .Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns
 .Op Fl w
-.Ar pool Ns …
+.Fl a Ns | Ns Ar pool Ns …
 .
 .Sh DESCRIPTION
 Begins a scrub or resumes a paused scrub.
@@ -89,6 +90,12 @@ During this period, no completion time estimate will be provided.
 .
 .Sh OPTIONS
 .Bl -tag -width "-s"
+.It Fl a , -all
+Begin, pause, or stop a scrub on
+all
+pools.
+Initiating scrubs on multiple pools can put considerable load and memory
+pressure on the system, so this operation should be performed with caution.
 .It Fl s
 Stop scrubbing.
 .It Fl p
diff --git a/man/man8/zpool-trim.8 b/man/man8/zpool-trim.8
index 06cbd5abf7e..18723e1be0d 100644
--- a/man/man8/zpool-trim.8
+++ b/man/man8/zpool-trim.8
@@ -26,6 +26,7 @@
 .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
 .\"
 .Dd May 27, 2021
 .Dt ZPOOL-TRIM 8
@@ -40,7 +41,7 @@
 .Op Fl dw
 .Op Fl r Ar rate
 .Op Fl c Ns | Ns Fl s
-.Ar pool
+.Fl a Ns | Ns Ar pool
 .Oo Ar device Ns Oc Ns …
 .
 .Sh DESCRIPTION
@@ -57,6 +58,10 @@ See the documentation for the
 .Sy autotrim
 property above for the types of vdev devices which can be trimmed.
 .Bl -tag -width Ds
+.It Fl a , -all
+Perform TRIM operation on
+all
+pools.
 .It Fl d , -secure
 Causes a secure TRIM to be initiated.
 When performing a secure TRIM, the
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 3eda5d4d904..7cc7a3cf94f 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -497,6 +497,7 @@ tags = ['functional', 'cli_root', 'zpool_labelclear']
 tests = ['zpool_initialize_attach_detach_add_remove',
     'zpool_initialize_fault_export_import_online',
     'zpool_initialize_import_export',
+    'zpool_initialize_multiple_pools',
     'zpool_initialize_offline_export_import_online',
     'zpool_initialize_online_offline',
     'zpool_initialize_split',
@@ -542,6 +543,7 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
+    'zpool_scrub_multiple_pools',
     'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
     'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
@@ -574,8 +576,8 @@ tags = ['functional', 'cli_root', 'zpool_sync']
 
 [tests/functional/cli_root/zpool_trim]
 tests = ['zpool_trim_attach_detach_add_remove',
-    'zpool_trim_fault_export_import_online',
-    'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg',
+    'zpool_trim_fault_export_import_online', 'zpool_trim_import_export',
+    'zpool_trim_multiple', 'zpool_trim_multiple_pools', 'zpool_trim_neg',
     'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
     'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg',
     'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg',
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 194ae4169e4..388a4160736 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1176,6 +1176,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \
+	functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \
@@ -1239,6 +1240,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
+	functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \
@@ -1291,6 +1293,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \
+	functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh
new file mode 100755
index 00000000000..cc7bca5445d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh
@@ -0,0 +1,131 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+#	Verify 'zpool initialize -a' works correctly with multiple pools
+#
+# STRATEGY:
+#	1. Create multiple pools.
+#	2. Start an initialize operation on all pools using 'zpool initialize -a'.
+#	3. Verify that the initializing is active on all pools.
+#	4. Wait for the initialize operation to complete.
+#	5. Verify that the initialize operation is complete on all pools.
+#	6. Start an initialize operation on all pools using 'zpool initialize -w -a'.
+#	7. Verify that the initialize operation is complete on all pools.
+#	8. Now test the -u, -c and -s options on multiple pools with -a.
+#	9. Verify that the initialize status is correctly updated on all pools.
+#
+
+verify_runnable "global"
+
+cleanup() {
+    for pool in {1..4}; do
+        zpool destroy $TESTPOOL${pool}
+        rm -rf $TESTDIR${pool}
+    done
+    rm -f $DISK1 $DISK2 $DISK3 $DISK4
+}
+
+log_onexit cleanup
+
+log_assert "Verify if 'zpool initialize -a' works correctly with multiple pools."
+
+DEVSIZE='5G'
+TESTDIR="$TEST_BASE_DIR/zpool_initialize_multiple_pools"
+DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
+DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
+DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"
+
+truncate -s $DEVSIZE $DISK1
+truncate -s $DEVSIZE $DISK2
+truncate -s $DEVSIZE $DISK3
+truncate -s $DEVSIZE $DISK4
+
+for pool in {1..4}; do
+    DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat"
+    truncate -s $DEVSIZE ${DISK[$pool]}
+    log_must zpool create $TESTPOOL${pool} ${DISK[$pool]}
+done
+sync_all_pools
+
+# Start an initialize operation on all pools using 'zpool initialize -a'.
+log_must zpool initialize -a
+
+# Verify that the initializing is active on all pools.
+for pool in {1..4}; do
+    if [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]]; then
+        log_fail "Initializing did not start on pool $TESTPOOL${pool}"
+    fi
+done
+
+# Wait for the initialize operation to complete on all pools.
+for pool in {1..4}; do
+    log_must zpool wait -t initialize $TESTPOOL${pool}
+done
+
+# Verify that the initialize operation is complete on all pools.
+complete_count=$(zpool status -i | grep -c "completed")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have initialize status 'completed', but found ${complete_count}."
+fi
+
+# Start an initialize operation on all pools using 'zpool initialize -w -a'.
+log_must zpool initialize -w -a
+
+# Verify that the initialize operation is complete on all pools.
+complete_count=$(zpool status -i | grep -c "completed")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have initialize status 'completed', but found ${complete_count}."
+fi
+
+# Now test the -u, -c and -s options on multiple pools with -a.
+log_must zpool initialize -u -a
+complete_count=$(zpool status -i | grep -c "uninitialized")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have initialize status 'uninitialized', but found ${complete_count}."
+fi
+
+log_must zpool initialize -a
+
+for pool in {1..4}; do
+    if [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]]; then
+        log_fail "Initializing did not start on pool $TESTPOOL${pool}"
+    fi
+done
+
+log_must zpool initialize -a -s
+complete_count=$(zpool status -i | grep -c "suspended")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have initialize status 'suspended', but found ${complete_count}."
+fi
+
+log_must zpool initialize -a -c
+for pool in {1..4}; do
+    [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] || \
+    log_fail "Initialize did not stop on pool $TESTPOOL${pool}"
+done
+
+log_pass "Initialize '-a' works on multiple pools correctly."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh
index 43156805347..5ffba803342 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh
@@ -28,6 +28,7 @@
 
 #
 # Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -46,7 +47,7 @@
 verify_runnable "global"
 
 set -A args "" "-?" "blah blah" "-%" "--?" "-*" "-=" \
-    "-a" "-b" "-c" "-d" "-e" "-f" "-g" "-h" "-i" "-j" "-k" "-l" \
+    "-b" "-c" "-d" "-e" "-f" "-g" "-h" "-i" "-j" "-k" "-l" \
     "-m" "-n" "-o" "-p" "-q" "-r" "-s" "-t" "-u" "-v" "-w" "-x" "-y" "-z" \
     "-A" "-B" "-C" "-D" "-E" "-F" "-G" "-H" "-I" "-J" "-K" "-L" \
     "-M" "-N" "-O" "-P" "-Q" "-R" "-S" "-T" "-U" "-V" "-W" "-X" "-W" "-Z"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh
new file mode 100755
index 00000000000..b8647e20864
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh
@@ -0,0 +1,128 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#	Verify 'zpool scrub -a' works correctly with multiple pools
+#
+# STRATEGY:
+#	1. Create multiple pools.
+#	2. Start a scrub on all pools using 'zpool scrub -a'.
+#	3. Verify that the scrub is running on all pools.
+#	4. Wait for the scrub to complete.
+#	5. Verify that the scrub status is complete on all pools.
+#	6. Start a scrub on all pools using 'zpool scrub -w -a'.
+#	7. Verify that the scrub status is complete on all pools.
+#	8. Now test the -p and -s options on multiple pools with -a.
+#	9. Verify that the scrub status is correct for each option.
+#
+
+verify_runnable "global"
+
+cleanup() {
+    log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+    for pool in {1..4}; do
+        zpool destroy $TESTPOOL${pool}
+        rm -rf $TESTDIR${pool}
+    done
+    rm -f $DISK1 $DISK2 $DISK3 $DISK4
+    # Import the testpool
+    zpool import -a
+}
+
+log_onexit cleanup
+
+log_assert "Verify if scrubbing multiple pools works correctly."
+
+# Export the testpool created by setup and Import them later.
+log_must zpool export -a
+
+DEVSIZE='128m'
+FILESIZE='50m'
+TESTDIR="$TEST_BASE_DIR/zpool_scrub_multiple_pools"
+DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
+DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
+DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"
+
+truncate -s $DEVSIZE $DISK1
+truncate -s $DEVSIZE $DISK2
+truncate -s $DEVSIZE $DISK3
+truncate -s $DEVSIZE $DISK4
+
+for pool in {1..4}; do
+    DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat"
+    truncate -s $DEVSIZE ${DISK[$pool]}
+    log_must zpool create -O mountpoint=$TESTDIR${pool} $TESTPOOL${pool} ${DISK[$pool]}
+    log_must zfs create -o compression=off $TESTPOOL${pool}/testfs${pool}
+    typeset mntpnt=$(get_prop mountpoint $TESTPOOL${pool}/testfs${pool})
+    # Fill some data into the filesystem.
+    log_must mkfile $FILESIZE $mntpnt/file${pool}.dat
+done
+sync_all_pools
+
+# Start a scrub on all pools using 'zpool scrub -a'.
+log_must zpool scrub -a
+# Wait for the scrub to complete on all pools.
+for pool in {1..4}; do
+    log_must zpool wait -t scrub $TESTPOOL${pool}
+done
+
+# Verify that the scrub status is complete on all pools.
+complete_count=$(zpool status -v | grep -c "scrub repaired")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have scrub status 'scrub repaired', but found $complete_count."
+fi
+
+# Start a scrub on all pools using 'zpool scrub -w -a'.
+log_must zpool scrub -w -a
+
+# Verify that the scrub status is complete on all pools.
+complete_count=$(zpool status -v | grep -c "scrub repaired")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have scrub status 'scrub repaired', but found $complete_count."
+fi
+
+# Now test the -p and -s options on multiple pools with -a.
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool scrub -a
+complete_count=$(zpool status -v | grep -c "scrub in progress since")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have scrub status 'scrub in progress since', but found $complete_count."
+fi
+
+log_must zpool scrub -a -p
+complete_count=$(zpool status -v | grep -c "scrub paused since")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have scrub status 'scrub paused since', but found $complete_count."
+fi
+
+log_must zpool scrub -a -s
+complete_count=$(zpool status -v | grep -c "scrub canceled")
+if [[ $complete_count -ne 4 ]]; then
+    log_fail "Expected 4 pools to have scrub status 'scrub canceled', but found $complete_count."
+fi
+
+log_pass "Scrubbing multiple pools works correctly."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh
new file mode 100755
index 00000000000..4348eecc698
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh
@@ -0,0 +1,123 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#	Verify 'zpool trim -a' works correctly with multiple pools
+#
+# STRATEGY:
+#	1. Create multiple pools.
+#	2. Start a trim on all pools using 'zpool trim -a'.
+#	3. Verify that the trim is started on all pools.
+#	4. Wait for the trim to complete.
+#	5. Verify that the trim is complete on all pools.
+#	6. Start a trim on all pools using 'zpool trim -w -a'.
+#	7. Verify that the trim is complete on all pools.
+#	8. Now test the -c and -s options on multiple pools with -a.
+#	9. Verify that the trim status is correct for each option.
+#
+
+verify_runnable "global"
+
+cleanup() {
+    for pool in {1..4}; do
+        zpool destroy $TESTPOOL${pool}
+        rm -rf $TESTDIR${pool}
+    done
+    rm -f $DISK1 $DISK2 $DISK3 $DISK4
+}
+
+log_onexit cleanup
+
+log_assert "Verify if trim '-a' works on multiple pools correctly."
+
+DEVSIZE='5G'
+TESTDIR="$TEST_BASE_DIR/zpool_trim_multiple_pools"
+DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
+DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
+DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"
+
+truncate -s $DEVSIZE $DISK1
+truncate -s $DEVSIZE $DISK2
+truncate -s $DEVSIZE $DISK3
+truncate -s $DEVSIZE $DISK4
+
+for pool in {1..4}; do
+    DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat"
+    truncate -s $DEVSIZE ${DISK[$pool]}
+    log_must zpool create $TESTPOOL${pool} ${DISK[$pool]}
+done
+sync_all_pools
+
+# Start a trim on all pools using 'zpool trim -a'.
+log_must zpool trim -a
+
+# Verify that the trim is started on all pools.
+for pool in {1..4}; do
+    [[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] && \
+    log_fail "Trim did not start on pool $TESTPOOL${pool}"
+done
+
+# Wait for the trim to complete on all pools.
+for pool in {1..4}; do
+	log_must zpool wait -t trim $TESTPOOL${pool}
+done
+
+# Verify that the trim status is complete on all pools.
+complete_count=$(zpool status -t | grep -c "completed")
+if [[ $complete_count -ne 4 ]]; then
+	log_fail "Expected 4 pools to have trim status 'completed', but found ${complete_count}."
+fi
+
+# Start a trim on all pools using 'zpool trim -w -a'
+log_must zpool trim -w -a
+
+# Verify that the trim status is complete on all pools.
+complete_count=$(zpool status -t | grep -c "completed")
+if [[ $complete_count -ne 4 ]]; then
+	log_fail "Expected 4 pools to have trim status 'completed', but found ${complete_count}."
+fi
+
+# Now test the -s and -c options on multiple pools with -a.
+log_must zpool trim -r 1 -a
+
+for pool in {1..4}; do
+	[[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] && \
+        log_fail "Trim did not start"
+done
+
+log_must zpool trim -a -s
+complete_count=$(zpool status -t | grep -c "suspended")
+if [[ $complete_count -ne 4 ]]; then
+	log_fail "Expected 4 pools to have trim status 'suspended', but found $complete_count."
+fi
+
+log_must zpool trim -a -c
+for pool in {1..4}; do
+    [[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] || \
+    log_fail "TRIM did not stop on pool $TESTPOOL${pool}"
+done
+
+log_pass "Trim '-a' works on multiple pools correctly."

From 10a78e26479325a40ace80198249a36906b4d446 Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Wed, 30 Jul 2025 02:09:48 +0200
Subject: [PATCH 51/72] Faster checksum benchmark on system boot

While booting, only the needed 256KiB benchmarks are done now.

The delay for checking all checksums occurs when requested via:
- Linux: cat /proc/spl/kstat/zfs/chksum_bench
- FreeBSD: sysctl kstat.zfs.misc.chksum_bench

Reported by: Lahiru Gunathilake <gunathilakebllg@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Co-authored-by: Colin Percival <cperciva@tarsnap.com>
Closes #17563
Closes #17560
---
 module/zfs/zfs_chksum.c | 69 ++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c
index 5c92be21c0c..21852bf3d86 100644
--- a/module/zfs/zfs_chksum.c
+++ b/module/zfs/zfs_chksum.c
@@ -32,9 +32,6 @@
 #include <sys/blake3.h>
 #include <sys/sha2.h>
 
-/* limit benchmarking to max 256KiB, when EdonR is slower then this: */
-#define	LIMIT_PERF_MBS	300
-
 typedef struct {
 	const char *name;
 	const char *impl;
@@ -52,9 +49,15 @@ typedef struct {
 	zio_checksum_tmpl_free_t *(free);
 } chksum_stat_t;
 
+#define	AT_STARTUP	0
+#define	AT_BENCHMARK	1
+#define	AT_DONE		2
+
 static chksum_stat_t *chksum_stat_data = 0;
-static int chksum_stat_cnt = 0;
 static kstat_t *chksum_kstat = NULL;
+static int chksum_stat_limit = AT_STARTUP;
+static int chksum_stat_cnt = 0;
+static void chksum_benchmark(void);
 
 /*
  * Sample output on i3-1005G1 System:
@@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data)
 static void *
 chksum_kstat_addr(kstat_t *ksp, loff_t n)
 {
+	/* full benchmark */
+	chksum_benchmark();
+
 	if (n < chksum_stat_cnt)
 		ksp->ks_private = (void *)(chksum_stat_data + n);
 	else
@@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
 	kpreempt_enable();
 
 	run_bw = size * run_count * NANOSEC;
-	run_bw /= run_time_ns;	/* B/s */
+	run_bw /= run_time_ns; /* B/s */
 	*result = run_bw/1024/1024; /* MiB/s */
 }
 
-#define	LIMIT_INIT	0
-#define	LIMIT_NEEDED	1
-#define	LIMIT_NOLIMIT	2
-
 static void
 chksum_benchit(chksum_stat_t *cs)
 {
 	abd_t *abd;
 	void *ctx = 0;
 	void *salt = &cs->salt.zcs_bytes;
-	static int chksum_stat_limit = LIMIT_INIT;
 
 	memset(salt, 0, sizeof (cs->salt.zcs_bytes));
 	if (cs->init)
 		ctx = cs->init(&cs->salt);
 
+	/* benchmarks in startup mode */
+	if (chksum_stat_limit == AT_STARTUP) {
+		abd = abd_alloc_linear(1<<18, B_FALSE);
+		chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+		goto done;
+	}
+
 	/* allocate test memory via abd linear interface */
 	abd = abd_alloc_linear(1<<20, B_FALSE);
+
+	/* benchmarks when requested */
 	chksum_run(cs, abd, ctx, 1, &cs->bs1k);
 	chksum_run(cs, abd, ctx, 2, &cs->bs4k);
 	chksum_run(cs, abd, ctx, 3, &cs->bs16k);
 	chksum_run(cs, abd, ctx, 4, &cs->bs64k);
-	chksum_run(cs, abd, ctx, 5, &cs->bs256k);
-
-	/* check if we ran on a slow cpu */
-	if (chksum_stat_limit == LIMIT_INIT) {
-		if (cs->bs1k < LIMIT_PERF_MBS) {
-			chksum_stat_limit = LIMIT_NEEDED;
-		} else {
-			chksum_stat_limit = LIMIT_NOLIMIT;
-		}
-	}
-
-	/* skip benchmarks >= 1MiB when the CPU is to slow */
-	if (chksum_stat_limit == LIMIT_NEEDED)
-		goto abort;
-
 	chksum_run(cs, abd, ctx, 6, &cs->bs1m);
 	abd_free(abd);
 
@@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs)
 	chksum_run(cs, abd, ctx, 7, &cs->bs4m);
 	chksum_run(cs, abd, ctx, 8, &cs->bs16m);
 
-abort:
+done:
 	abd_free(abd);
 
 	/* free up temp memory */
@@ -243,7 +238,6 @@ chksum_benchmark(void)
 	/* we need the benchmark only for the kernel module */
 	return;
 #endif
-
 	chksum_stat_t *cs;
 	uint64_t max;
 	uint32_t id, cbid = 0, id_save;
@@ -251,8 +245,14 @@ chksum_benchmark(void)
 	const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
 	const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
 
+	/* benchmarks are done */
+	if (chksum_stat_limit == AT_DONE)
+		return;
+
+
 	/* count implementations */
-	chksum_stat_cnt = 2;
+	chksum_stat_cnt = 1;  /* edonr */
+	chksum_stat_cnt += 1; /* skein */
 	chksum_stat_cnt += sha256->getcnt();
 	chksum_stat_cnt += sha512->getcnt();
 	chksum_stat_cnt += blake3->getcnt();
@@ -332,6 +332,17 @@ chksum_benchmark(void)
 		}
 	}
 	blake3->setid(id_save);
+
+	switch (chksum_stat_limit) {
+	case AT_STARTUP:
+		/* next time we want a full benchmark */
+		chksum_stat_limit = AT_BENCHMARK;
+		break;
+	case AT_BENCHMARK:
+		/* no further benchmarks */
+		chksum_stat_limit = AT_DONE;
+		break;
+	}
 }
 
 void
@@ -341,7 +352,7 @@ chksum_init(void)
 	blake3_per_cpu_ctx_init();
 #endif
 
-	/* Benchmark supported implementations */
+	/* 256KiB benchmark */
 	chksum_benchmark();
 
 	/* Install kstats for all implementations */

From f70c85086bbd67a195d4ad540aa8b2f252c0aae0 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 30 Jul 2025 12:42:47 -0400
Subject: [PATCH 52/72] BRT: Fix ZAP entry endianness

During original block cloning implementation a mistake was made,
making BRT ZAP entries an array of 8 1-byte entries instead of 1
entry of 8 bytes. This makes the pools non-endian-safe.

This commit introduces a new read-compatible pool feature
"com.truenas:block_cloning_endian", fixing the endianness issue
for new pools while maintaining compatibility with existing ones.

The feature is automatically activated when creating the first BRT
ZAP (ensuring we don't activate it on pools that already have BRT
entries in the old format).  When active, BRT entries are stored
as single 8-byte values.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17572
---
 include/zfeature_common.h                     |  1 +
 lib/libzfs/libzfs.abi                         | 11 +--
 man/man7/zpool-features.7                     | 11 +++
 module/zcommon/zfeature_common.c              |  6 ++
 module/zfs/brt.c                              | 70 +++++++++++++++----
 .../cli_root/zpool_get/zpool_get.cfg          |  1 +
 6 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 53e1ecae379..4877df4b114 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -88,6 +88,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_LONGNAME,
 	SPA_FEATURE_LARGE_MICROZAP,
 	SPA_FEATURE_DYNAMIC_GANG_HEADER,
+	SPA_FEATURE_BLOCK_CLONING_ENDIAN,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 0c3e8106ca6..bd2ab646802 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -638,7 +638,7 @@
     <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2520' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2576' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6397,7 +6397,8 @@
       <enumerator name='SPA_FEATURE_LONGNAME' value='42'/>
       <enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/>
       <enumerator name='SPA_FEATURE_DYNAMIC_GANG_HEADER' value='44'/>
-      <enumerator name='SPA_FEATURES' value='45'/>
+      <enumerator name='SPA_FEATURE_BLOCK_CLONING_ENDIAN' value='45'/>
+      <enumerator name='SPA_FEATURES' value='46'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@@ -9604,8 +9605,8 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='20160' id='b948da70'>
-      <subrange length='45' type-id='7359adad' id='cb8ddca0'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='20608' id='b9408bab'>
+      <subrange length='46' type-id='7359adad' id='8b86bc1b'/>
     </array-type-def>
     <enum-decl name='zfeature_flags' id='6db816a4'>
       <underlying-type type-id='9cac1fee'/>
@@ -9683,7 +9684,7 @@
     <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
     <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
     <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
-    <var-decl name='spa_feature_table' type-id='b948da70' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
+    <var-decl name='spa_feature_table' type-id='b9408bab' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
     <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
     <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 7ec27116440..66aa100b714 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -401,6 +401,17 @@ This feature becomes
 .Sy active
 when first block is cloned.
 When the last cloned block is freed, it goes back to the enabled state.
+.feature com.truenas block_cloning_endian yes
+This feature corrects ZAP entry endianness issues in the Block Reference
+Table (BRT) used by block cloning.
+During the original block cloning implementation, BRT ZAP entries were
+mistakenly stored as arrays of 8 single-byte entries instead of single
+8-byte entries, making pools non-endian-safe.
+.Pp
+This feature is activated when the first BRT ZAP is created (thereby
+ensuring compatibility with existing pools).
+When active, new BRT entries are stored in the correct endian-safe format.
+The feature becomes inactive when all BRT ZAPs are destroyed.
 .feature com.delphix bookmarks yes extensible_dataset
 This feature enables use of the
 .Nm zfs Cm bookmark
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 8ac1c7cabd6..0b37530b0e1 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -732,6 +732,12 @@ zpool_feature_init(void)
 	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
 	    sfeatures);
 
+	zfeature_register(SPA_FEATURE_BLOCK_CLONING_ENDIAN,
+	    "com.truenas:block_cloning_endian", "block_cloning_endian",
+	    "Fixes BRT ZAP endianness on new pools.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
+	    sfeatures);
+
 	zfeature_register(SPA_FEATURE_AVZ_V2,
 	    "com.klarasystems:vdev_zaps_v2", "vdev_zaps_v2",
 	    "Support for root vdev ZAP.",
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 27d9ed7ea2b..40664354aa7 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -478,6 +478,18 @@ brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
 	BRT_DEBUG("Pool directory object created, object=%s", name);
 
+	/*
+	 * If enabled, activate the endian-fixed feature with the first BRT ZAP
+	 * (BLOCK_CLONING not yet active); once active, count each new BRT ZAP.
+	 */
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
+	    !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
+		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+	} else if (spa_feature_is_active(spa,
+	    SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
+		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+	}
+
 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
 
@@ -658,6 +670,8 @@ brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 	rw_exit(&brtvd->bv_lock);
 
 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
+	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
+		spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
 }
 
 static void
@@ -855,16 +869,29 @@ brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
 }
 
+static boolean_t
+brt_has_endian_fixed(spa_t *spa)
+{
+	return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
+}
+
 static int
-brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
+brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
 {
 	uint64_t off = BRE_OFFSET(bre);
 
 	if (brtvd->bv_mos_entries == 0)
 		return (SET_ERROR(ENOENT));
 
-	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
-	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
+	if (brt_has_endian_fixed(spa)) {
+		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+		    &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+		    &bre->bre_count));
+	} else {
+		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+		    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+		    &bre->bre_count));
+	}
 }
 
 /*
@@ -1056,7 +1083,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
 	}
 	rw_exit(&brtvd->bv_lock);
 
-	error = brt_entry_lookup(brtvd, &bre_search);
+	error = brt_entry_lookup(spa, brtvd, &bre_search);
 	/* bre_search now contains correct bre_count */
 	if (error == ENOENT) {
 		BRTSTAT_BUMP(brt_decref_no_entry);
@@ -1118,7 +1145,7 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre == NULL) {
 		rw_exit(&brtvd->bv_lock);
-		error = brt_entry_lookup(brtvd, &bre_search);
+		error = brt_entry_lookup(spa, brtvd, &bre_search);
 		if (error == ENOENT) {
 			refcnt = 0;
 		} else {
@@ -1270,10 +1297,18 @@ brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
 		uint64_t off = BRE_OFFSET(bre);
 		if (brtvd->bv_mos_entries != 0 &&
 		    brt_vdev_lookup(spa, brtvd, off)) {
-			int error = zap_lookup_uint64_by_dnode(
-			    brtvd->bv_mos_entries_dnode, &off,
-			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
-			    &bre->bre_count);
+			int error;
+			if (brt_has_endian_fixed(spa)) {
+				error = zap_lookup_uint64_by_dnode(
+				    brtvd->bv_mos_entries_dnode, &off,
+				    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+				    &bre->bre_count);
+			} else {
+				error = zap_lookup_uint64_by_dnode(
+				    brtvd->bv_mos_entries_dnode, &off,
+				    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+				    &bre->bre_count);
+			}
 			if (error == 0) {
 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
 			} else {
@@ -1326,7 +1361,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
 }
 
 static void
-brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
+brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 {
 	uint64_t off = BRE_OFFSET(bre);
 
@@ -1337,9 +1372,15 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 		    BRT_KEY_WORDS, tx);
 		VERIFY(error == 0 || error == ENOENT);
 	} else {
-		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
-		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
-		    &bre->bre_count, tx));
+		if (brt_has_endian_fixed(spa)) {
+			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+			    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+			    &bre->bre_count, tx));
+		} else {
+			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+			    &bre->bre_count, tx));
+		}
 	}
 }
 
@@ -1368,7 +1409,8 @@ brt_sync_table(spa_t *spa, dmu_tx_t *tx)
 
 		void *c = NULL;
 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
-			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
+			brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
+			    tx);
 			kmem_cache_free(brt_entry_cache, bre);
 		}
 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 6de0869765a..3389dcf72f8 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -115,5 +115,6 @@ if is_linux || is_freebsd; then
 	    "feature@fast_dedup"
 	    "feature@longname"
 	    "feature@large_microzap"
+	    "feature@block_cloning_endian"
 	)
 fi

From ce9c3b4b94be9ea70200b326a42ec8b1324015c6 Mon Sep 17 00:00:00 2001
From: Richard Yao <richard@ryao.dev>
Date: Wed, 30 Jul 2025 12:45:28 -0400
Subject: [PATCH 53/72] Add CodeQL mismatched dsl_dataset_hold/_rele pairs
 check

This check is currently limited to detecting mismatches that occur in the
same stack frame. It does not detect mismatches across stack frames.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Richard Yao <richard@ryao.dev>
Closes #17352
---
 .github/codeql-cpp.yml                        |  1 +
 .../cpp/dslDatasetHoldReleMismatch.ql         | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 .github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql

diff --git a/.github/codeql-cpp.yml b/.github/codeql-cpp.yml
index 88b8c608602..d99cdb55924 100644
--- a/.github/codeql-cpp.yml
+++ b/.github/codeql-cpp.yml
@@ -2,3 +2,4 @@ name: "Custom CodeQL Analysis"
 
 queries:
   - uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql
+  - uses: ./.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql
diff --git a/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql b/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql
new file mode 100644
index 00000000000..fb5dae35092
--- /dev/null
+++ b/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql
@@ -0,0 +1,34 @@
+/**
+ * @name Detect mismatched dsl_dataset_hold/_rele pairs
+ * @description Flags instances of issue #12014 where
+ *   - a dataset held with dsl_dataset_hold_obj() ends up in dsl_dataset_rele_flags(), or
+ *   - a dataset held with dsl_dataset_hold_obj_flags() ends up in dsl_dataset_rele().
+ * @kind problem
+ * @severity error
+ * @tags correctness
+ * @id cpp/dslDatasetHoldReleMismatch
+ */
+
+import cpp
+
+from Variable ds, Call holdCall, Call releCall, string message
+where
+    ds.getType().toString() = "dsl_dataset_t *" and
+    holdCall.getASuccessor*() = releCall and
+    (
+        (holdCall.getTarget().getName() = "dsl_dataset_hold_obj_flags" and
+         holdCall.getArgument(4).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
+         releCall.getTarget().getName() = "dsl_dataset_rele" and
+         releCall.getArgument(0).(VariableAccess).getTarget() = ds and
+         message = "Held with dsl_dataset_hold_obj_flags but released with dsl_dataset_rele")
+        or
+        (holdCall.getTarget().getName() = "dsl_dataset_hold_obj" and
+         holdCall.getArgument(3).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
+         releCall.getTarget().getName() = "dsl_dataset_rele_flags" and
+         releCall.getArgument(0).(VariableAccess).getTarget() = ds and
+         message = "Held with dsl_dataset_hold_obj but released with dsl_dataset_rele_flags")
+    )
+select releCall,
+       "Mismatched release: held with $@ but released with " + releCall.getTarget().getName() + " for dataset $@",
+       holdCall, holdCall.getTarget().getName(),
+       ds, ds.toString()

From dea0fc969ba96309990c00821b629afd576b4108 Mon Sep 17 00:00:00 2001
From: Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com>
Date: Wed, 30 Jul 2025 19:46:34 +0300
Subject: [PATCH 54/72] ZVOL: Return early, if volmode is ZFS_VOLMODE_NONE on
 FreeBSD side

Return from the zvol_os_create_minor() function immediately after the
dsl_prop_get_integer() call if the volmode property value is set to
'none', as is done on the Linux side.

Sponsored-by: vStack, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Fedor Uporov <fuporov.vstack@gmail.com>
Closes #17405
---
 module/os/freebsd/zfs/zvol_os.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index 212ef560db0..5ec35a9de10 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -1402,9 +1402,13 @@ zvol_os_create_minor(const char *name)
 
 	error = dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
-	if (error || volmode == ZFS_VOLMODE_DEFAULT)
+	if (error)
+		goto out_dmu_objset_disown;
+
+	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
-	error = 0;
+	if (volmode == ZFS_VOLMODE_NONE)
+		goto out_dmu_objset_disown;
 
 	/*
 	 * zvol_alloc equivalent ...
@@ -1496,7 +1500,7 @@ zvol_os_create_minor(const char *name)
 	}
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
-	if (error == 0) {
+	if (error == 0 && volmode != ZFS_VOLMODE_NONE) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		zvol_minors++;

From 2957eabbefa382f712db26430b768a4d4b49e094 Mon Sep 17 00:00:00 2001
From: rmacklem <64620010+rmacklem@users.noreply.github.com>
Date: Wed, 30 Jul 2025 09:49:43 -0700
Subject: [PATCH 55/72] Add support for FreeBSD's Solaris style extended
 attribute interface

FreeBSD commit 2ec2ba7e232d added the Solaris style syscall interface
for extended attributes.  This patch wires this interface into the
FreeBSD ZFS port, since this style of extended attributes is supported
by OpenZFS internally when the "xattr" property is set to "dir".

Some specific changes:
LOOKUP_NAMED_ATTR is defined to indicate the need to set V_NAMEDATTR
for calls to zfs_zaccess().
V_NAMEDATTR indicates that the access checking does need to be done
for FreeBSD.

The access checking code for extended attributes was copy/pasted from
the Linux port into zfs_zaccess() in the FreeBSD port.

Most of the changes are in zfs_freebsd_lookup() and
zfs_freebsd_create().
The semantics of these functions should remain unchanged unless named
attributes are being manipulated.

All the code changes are enabled for __FreeBSD_version 1500040 and
newer.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rick Macklem <rmacklem@uoguelph.ca>
Closes #17540
---
 include/os/freebsd/spl/sys/vnode_impl.h |   1 +
 include/sys/xvattr.h                    |   1 +
 module/os/freebsd/zfs/zfs_acl.c         |  32 +++
 module/os/freebsd/zfs/zfs_vfsops.c      |  32 ++-
 module/os/freebsd/zfs/zfs_vnops_os.c    | 295 +++++++++++++++++++++++-
 5 files changed, 348 insertions(+), 13 deletions(-)

diff --git a/include/os/freebsd/spl/sys/vnode_impl.h b/include/os/freebsd/spl/sys/vnode_impl.h
index 0df3378c23e..b18836aa563 100644
--- a/include/os/freebsd/spl/sys/vnode_impl.h
+++ b/include/os/freebsd/spl/sys/vnode_impl.h
@@ -227,6 +227,7 @@ struct taskq;
 #define	LOOKUP_XATTR		0x02	/* lookup up extended attr dir */
 #define	CREATE_XATTR_DIR	0x04	/* Create extended attr dir */
 #define	LOOKUP_HAVE_SYSATTR_DIR	0x08	/* Already created virtual GFS dir */
+#define	LOOKUP_NAMED_ATTR	0x10	/* Lookup a named attribute */
 
 /*
  * Public vnode manipulation functions.
diff --git a/include/sys/xvattr.h b/include/sys/xvattr.h
index 447842d269b..5dadbdb4c61 100644
--- a/include/sys/xvattr.h
+++ b/include/sys/xvattr.h
@@ -311,6 +311,7 @@ xva_getxoptattr(xvattr_t *xvap)
  */
 #define	V_ACE_MASK	0x1	/* mask represents  NFSv4 ACE permissions */
 #define	V_APPEND	0x2	/* want to do append only check */
+#define	V_NAMEDATTR	0x4	/* is a named attribute check */
 
 /*
  * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c
index 334264f6da2..5c5adc6cc12 100644
--- a/module/os/freebsd/zfs/zfs_acl.c
+++ b/module/os/freebsd/zfs/zfs_acl.c
@@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
 	 * In FreeBSD, we don't care about permissions of individual ADS.
 	 * Note that not checking them is not just an optimization - without
 	 * this shortcut, EA operations may bogusly fail with EACCES.
+	 *
+	 * If this is a named attribute lookup, do the checks.
 	 */
+#if __FreeBSD_version >= 1500040
+	if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0)
+#else
 	if (zp->z_pflags & ZFS_XATTR)
+#endif
 		return (0);
 
+	/*
+	 * If a named attribute directory then validate against base file
+	 */
+	if (is_attr) {
+		if ((error = zfs_zget(ZTOZSB(zp),
+		    zp->z_xattr_parent, &xzp)) != 0) {
+			return (error);
+		}
+
+		check_zp = xzp;
+
+		/*
+		 * fixup mode to map to xattr perms
+		 */
+
+		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+			mode |= ACE_WRITE_NAMED_ATTRS;
+		}
+
+		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+			mode |= ACE_READ_NAMED_ATTRS;
+		}
+	}
+
 	owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
 
 	/*
diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index 493ac9f69ad..0456552ed07 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -1209,6 +1209,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
+extern int zfs_xattr_compat;
+
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
@@ -1289,6 +1291,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
 			goto out;
 	}
 
+#if __FreeBSD_version >= 1500040
+	/*
+	 * Named attributes can only work if the xattr property is set to
+	 * on/dir and not sa.  Also, zfs_xattr_compat must be set.
+	 */
+	if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
+	    zfs_xattr_compat)
+		vfsp->mnt_flag |= MNT_NAMEDATTR;
+#endif
+
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
@@ -1812,6 +1824,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 		err = vn_lock(*vpp, flags);
 		if (err != 0)
 			vrele(*vpp);
+#if __FreeBSD_version >= 1500040
+		else if ((zp->z_pflags & ZFS_XATTR) != 0) {
+			if ((*vpp)->v_type == VDIR)
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+			else
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+		}
+#endif
 	}
 	if (err != 0)
 		*vpp = NULL;
@@ -1964,9 +1984,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 	*vpp = ZTOV(zp);
 	zfs_exit(zfsvfs, FTAG);
 	err = vn_lock(*vpp, flags);
-	if (err == 0)
+	if (err == 0) {
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
-	else
+#if __FreeBSD_version >= 1500040
+		if ((zp->z_pflags & ZFS_XATTR) != 0) {
+			if ((*vpp)->v_type == VDIR)
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+			else
+				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
+		}
+#endif
+	} else
 		*vpp = NULL;
 	return (err);
 }
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index da6a1cc85b6..da05e931d66 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -115,6 +115,8 @@ typedef uint64_t cookie_t;
 typedef ulong_t cookie_t;
 #endif
 
+static int zfs_check_attrname(const char *name);
+
 /*
  * Programming rules.
  *
@@ -814,7 +816,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
-		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
+		if (flags & LOOKUP_NAMED_ATTR)
+			error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
+			    B_FALSE, cr, NULL);
+		else
+			error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
+			    NULL);
 		if (error) {
 			vrele(ZTOV(zp));
 		}
@@ -4766,8 +4773,16 @@ zfs_freebsd_access(struct vop_access_args *ap)
 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
-	if (accmode != 0)
-		error = zfs_access(zp, accmode, 0, ap->a_cred);
+	if (accmode != 0) {
+#if __FreeBSD_version >= 1500040
+		/* For named attributes, do the checks. */
+		if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
+			error = zfs_access(zp, accmode, V_NAMEDATTR,
+			    ap->a_cred);
+		else
+#endif
+			error = zfs_access(zp, accmode, 0, ap->a_cred);
+	}
 
 	/*
 	 * VADMIN has to be handled by vaccess().
@@ -4800,6 +4815,190 @@ struct vop_lookup_args {
 };
 #endif
 
+#if __FreeBSD_version >= 1500040
+static int
+zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
+    struct vnode **vpp)
+{
+	struct vnode *xvp;
+	int error, flags;
+
+	*vpp = NULL;
+	flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
+	if ((cnp->cn_flags & CREATENAMED) != 0)
+		flags |= CREATE_XATTR_DIR;
+	error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
+	    B_FALSE);
+	if (error == 0) {
+		if ((cnp->cn_flags & LOCKLEAF) != 0)
+			error = vn_lock(xvp, cnp->cn_lkflags);
+		if (error == 0) {
+			vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
+			*vpp = xvp;
+		} else {
+			vrele(xvp);
+		}
+	}
+	return (error);
+}
+
+static ssize_t
+zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
+    int *eofflagp, struct ucred *cred, struct thread *td)
+{
+	struct uio io;
+	struct iovec iv;
+	zfs_uio_t uio;
+	int error;
+
+	io.uio_offset = *offp;
+	io.uio_segflg = UIO_SYSSPACE;
+	io.uio_rw = UIO_READ;
+	io.uio_td = td;
+	iv.iov_base = buf;
+	iv.iov_len = blen;
+	io.uio_iov = &iv;
+	io.uio_iovcnt = 1;
+	io.uio_resid = blen;
+	zfs_uio_init(&uio, &io);
+	error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
+	if (error != 0)
+		return (-1);
+	*offp = io.uio_offset;
+	return (blen - io.uio_resid);
+}
+
+static bool
+zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
+{
+	struct componentname cn;
+	struct vnode *xvp;
+	struct dirent *dp;
+	off_t offs;
+	ssize_t rsize;
+	char *buf, *cp, *endcp;
+	int eofflag, error;
+	bool ret;
+
+	MNT_ILOCK(vp->v_mount);
+	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
+		MNT_IUNLOCK(vp->v_mount);
+		return (false);
+	}
+	MNT_IUNLOCK(vp->v_mount);
+
+	/* Now see if a named attribute directory exists. */
+	cn.cn_flags = LOCKLEAF;
+	cn.cn_lkflags = LK_SHARED;
+	cn.cn_cred = cred;
+	error = zfs_lookup_nameddir(vp, &cn, &xvp);
+	if (error != 0)
+		return (false);
+
+	/* It exists, so see if there is any entry other than "." and "..". */
+	buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
+	ret = false;
+	offs = 0;
+	do {
+		rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
+		    cred, curthread);
+		if (rsize <= 0)
+			break;
+		cp = buf;
+		endcp = &buf[rsize];
+		while (cp < endcp) {
+			dp = (struct dirent *)cp;
+			if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
+			    dp->d_type == DT_UNKNOWN) &&
+			    !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
+			    ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
+			    (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
+			    dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
+				ret = true;
+				break;
+			}
+			cp += dp->d_reclen;
+		}
+	} while (!ret && rsize > 0 && eofflag == 0);
+	vput(xvp);
+	free(buf, M_TEMP);
+	return (ret);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+	struct componentname *cnp = ap->a_cnp;
+	char nm[NAME_MAX + 1];
+	int error;
+	struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
+	bool is_nameddir, needs_nameddir, opennamed = false;
+
+	/*
+	 * These variables are used to handle the named attribute cases:
+	 * opennamed - Is true when this is a call from open with O_NAMEDATTR
+	 *    specified and it is the last component.
+	 * is_nameddir - Is true when the directory is a named attribute dir.
+	 * needs_nameddir - Is set when the lookup needs to look for/create
+	 *    a named attribute directory.  It is only set when is_nameddir
+	 *    is false and opennamed is true.
+	 * xvp - Is the directory that the lookup needs to be done in.
+	 *    Usually dvp, unless needs_nameddir is true where it is the
+	 *    result of the first non-named directory lookup.
+	 * Note that name caching must be disabled for named attribute
+	 * handling.
+	 */
+	needs_nameddir = false;
+	xvp = dvp;
+	opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
+	    (OPENNAMED | ISLASTCN);
+	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+	if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
+		return (ENOATTR);
+	if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
+		return (ENOATTR);
+	if (opennamed || is_nameddir)
+		cnp->cn_flags &= ~MAKEENTRY;
+	if (opennamed && !is_nameddir)
+		needs_nameddir = true;
+	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+	error = 0;
+	*vpp = NULL;
+	if (needs_nameddir) {
+		if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
+			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+		error = zfs_lookup_nameddir(dvp, cnp, &xvp);
+		if (error == 0)
+			is_nameddir = true;
+	}
+	if (error == 0) {
+		if (!needs_nameddir || cnp->cn_namelen != 1 ||
+		    *cnp->cn_nameptr != '.') {
+			strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
+			    sizeof (nm)));
+			error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
+			    cnp->cn_cred, 0, cached);
+			if (is_nameddir && error == 0 &&
+			    (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
+			    (cnp->cn_flags & ISDOTDOT) == 0) {
+				if ((*vpp)->v_type == VDIR)
+					vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
+				else
+					vn_irflag_set_cond(*vpp,
+					    VIRF_NAMEDATTR);
+			}
+			if (needs_nameddir && xvp != *vpp)
+				vput(xvp);
+		} else {
+			/*
+			 * Lookup of "." when a named attribute dir is needed.
+			 */
+			*vpp = xvp;
+		}
+	}
+	return (error);
+}
+#else
 static int
 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 {
@@ -4812,6 +5011,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, 0, cached));
 }
+#endif
 
 static int
 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
@@ -4834,7 +5034,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap)
 	zfsvfs_t *zfsvfs;
 
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+#if __FreeBSD_version >= 1500040
+	if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
+#else
 	if (zfsvfs->z_use_namecache)
+#endif
 		return (vfs_cache_lookup(ap));
 	else
 		return (zfs_freebsd_lookup(ap, B_FALSE));
@@ -4857,6 +5061,11 @@ zfs_freebsd_create(struct vop_create_args *ap)
 	vattr_t *vap = ap->a_vap;
 	znode_t *zp = NULL;
 	int rc, mode;
+	struct vnode *dvp = ap->a_dvp;
+#if __FreeBSD_version >= 1500040
+	struct vnode *xvp;
+	bool is_nameddir;
+#endif
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
@@ -4867,10 +5076,36 @@ zfs_freebsd_create(struct vop_create_args *ap)
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 	*ap->a_vpp = NULL;
 
-	rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
-	    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+	rc = 0;
+#if __FreeBSD_version >= 1500040
+	xvp = NULL;
+	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
+	if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
+		/* Needs a named attribute directory. */
+		rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
+		if (rc == 0) {
+			dvp = xvp;
+			is_nameddir = true;
+		}
+	}
+	if (is_nameddir && rc == 0)
+		rc = zfs_check_attrname(cnp->cn_nameptr);
+#endif
+
 	if (rc == 0)
+		rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
+		    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
+#if __FreeBSD_version >= 1500040
+	if (xvp != NULL)
+		vput(xvp);
+#endif
+	if (rc == 0) {
 		*ap->a_vpp = ZTOV(zp);
+#if __FreeBSD_version >= 1500040
+		if (is_nameddir)
+			vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
+#endif
+	}
 	if (zfsvfs->z_use_namecache &&
 	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
@@ -4889,13 +5124,21 @@ struct vop_remove_args {
 static int
 zfs_freebsd_remove(struct vop_remove_args *ap)
 {
+	int error = 0;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 #endif
 
-	return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
-	    ap->a_cnp->cn_cred));
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
+		error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
+#endif
+
+	if (error == 0)
+		error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+		    ap->a_cnp->cn_cred);
+	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -5053,6 +5296,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap)
 #undef	FLAG_CHECK
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
+
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
+		vap->va_bsdflags |= SFBSD_NAMEDATTR;
+#endif
 	return (0);
 }
 
@@ -5195,15 +5443,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap)
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
-	int error;
+	int error = 0;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 #endif
 
-	error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
-	    ap->a_tcnp, ap->a_fcnp->cn_cred);
+#if __FreeBSD_version >= 1500040
+	if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
+		error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
+		if (error == 0)
+			error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
+	}
+#endif
+
+	if (error == 0)
+		error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+		    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
 	vrele(fdvp);
 	vrele(fvp);
@@ -5457,6 +5714,22 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 			return (0);
 		}
 		return (EINVAL);
+#if __FreeBSD_version >= 1500040
+	case _PC_NAMEDATTR_ENABLED:
+		MNT_ILOCK(ap->a_vp->v_mount);
+		if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
+			*ap->a_retval = 1;
+		else
+			*ap->a_retval = 0;
+		MNT_IUNLOCK(ap->a_vp->v_mount);
+		return (0);
+	case _PC_HAS_NAMEDATTR:
+		if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
+			*ap->a_retval = 1;
+		else
+			*ap->a_retval = 0;
+		return (0);
+#endif
 #ifdef _PC_HAS_HIDDENSYSTEM
 	case _PC_HAS_HIDDENSYSTEM:
 		*ap->a_retval = 1;
@@ -5467,7 +5740,7 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 	}
 }
 
-static int zfs_xattr_compat = 1;
+int zfs_xattr_compat = 1;
 
 static int
 zfs_check_attrname(const char *name)

From cb5e7e097d34ab6dec2998725916f314371090c9 Mon Sep 17 00:00:00 2001
From: Igor Ostapenko <igor.ostapenko@klarasystems.com>
Date: Thu, 31 Jul 2025 17:44:42 +0300
Subject: [PATCH 56/72] range_tree: Provide more debug details upon unexpected
 add/remove

Sponsored-by: Klara, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Igor Ostapenko <igor.ostapenko@klarasystems.com>
Closes #17581
---
 cmd/zdb/zdb.c                | 15 +++---
 include/sys/metaslab_impl.h  |  2 +
 include/sys/range_tree.h     |  9 ++++
 include/sys/vdev_impl.h      |  1 +
 module/zfs/dnode.c           | 19 ++++++-
 module/zfs/metaslab.c        | 64 ++++++++++++++++-------
 module/zfs/range_tree.c      | 98 +++++++++++++++++++++++++-----------
 module/zfs/vdev.c            | 36 ++++++++++---
 module/zfs/vdev_initialize.c |  5 +-
 module/zfs/vdev_raidz.c      |  6 ++-
 module/zfs/vdev_rebuild.c    |  5 +-
 module/zfs/vdev_removal.c    | 30 ++++++-----
 module/zfs/vdev_trim.c       | 24 ++++++---
 13 files changed, 227 insertions(+), 87 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 06b28670462..6439b1bc96c 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -619,8 +619,9 @@ livelist_metaslab_validate(spa_t *spa)
 			    metaslab_calculate_range_tree_type(vd, m,
 			    &start, &shift);
 			metaslab_verify_t mv;
-			mv.mv_allocated = zfs_range_tree_create(NULL,
-			    type, NULL, start, shift);
+			mv.mv_allocated = zfs_range_tree_create_flags(
+			    NULL, type, NULL, start, shift,
+			    0, "livelist_metaslab_validate:mv_allocated");
 			mv.mv_vdid = vd->vdev_id;
 			mv.mv_msid = m->ms_id;
 			mv.mv_start = m->ms_start;
@@ -6322,8 +6323,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 
 	ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
 
-	zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *allocs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    0, "zdb_claim_removing:allocs");
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
@@ -8471,8 +8473,9 @@ dump_zpool(spa_t *spa)
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		spa_feature_t f;
-		mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		mos_refd_objs = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    0, "dump_zpool:mos_refd_objs");
 		dump_objset(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 83fbe620fe3..6ce995d0a08 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -539,6 +539,8 @@ typedef struct metaslab_unflushed_phys {
 	uint64_t	msp_unflushed_txg;
 } metaslab_unflushed_phys_t;
 
+char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 23e80f64284..0f688468245 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type {
 	ZFS_RANGE_SEG_NUM_TYPES,
 } zfs_range_seg_type_t;
 
+#define	ZFS_RT_NAME(rt)		(((rt)->rt_name != NULL) ? (rt)->rt_name : "")
+#define	ZFS_RT_F_DYN_NAME	(1ULL << 0) /* if rt_name must be freed */
+
 /*
  * Note: the range_tree may not be accessed concurrently; consumers
  * must provide external locking if required.
@@ -68,6 +71,9 @@ typedef struct zfs_range_tree {
 	void		*rt_arg;
 	uint64_t	rt_gap;		/* allowable inter-segment gap */
 
+	uint64_t	rt_flags;
+	const char	*rt_name;	/* details for debugging */
+
 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
 	 * rt_histogram[i], contains the number of ranges whose size is:
@@ -281,6 +287,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
     uint64_t gap);
 zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
+zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t flags, const char *name);
 void zfs_range_tree_destroy(zfs_range_tree_t *rt);
 boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c925eb490cd..fa22fa2bac3 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -649,6 +649,7 @@ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
 int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
 #endif
 int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
+char *vdev_rt_name(vdev_t *vd, const char *name);
 
 /*
  * Vdev ashift optimization tunables
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 904a039edf9..451e1533efa 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static char *
+rt_name(dnode_t *dn, const char *name)
+{
+	struct objset *os = dn->dn_objset;
+
+	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
+	    spa_name(os->os_spa),
+	    (u_longlong_t)(os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
+	    (u_longlong_t)dn->dn_object,
+	    name));
+}
+
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
@@ -2436,8 +2449,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	{
 		int txgoff = tx->tx_txg & TXG_MASK;
 		if (dn->dn_free_ranges[txgoff] == NULL) {
-			dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
-			    ZFS_RANGE_SEG64, NULL, 0, 0);
+			dn->dn_free_ranges[txgoff] =
+			    zfs_range_tree_create_flags(
+			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+			    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
 		}
 		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 		zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 69484d404ee..102a43e1166 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -375,6 +375,16 @@ static metaslab_stats_t metaslab_stats = {
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
 
+char *
+metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
+	    spa_name(mg->mg_vd->vdev_spa),
+	    (u_longlong_t)mg->mg_vd->vdev_guid,
+	    (u_longlong_t)ms->ms_id,
+	    name));
+}
+
 
 static kstat_t *metaslab_ksp;
 
@@ -2900,30 +2910,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
-	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
-	    shift);
+	ms->ms_allocatable = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
 	for (int t = 0; t < TXG_SIZE; t++) {
-		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
-		    NULL, start, shift);
+		ms->ms_allocating[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME,
+		    metaslab_rt_name(mg, ms, "ms_allocating"));
 	}
-	ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_freeing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
+	ms->ms_freed = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
-		    start, shift);
+		ms->ms_defer[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
 	}
-	ms->ms_checkpointing =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_unflushed_allocs =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_checkpointing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
+	ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));
 
 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
-	ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
-	    type, mrap, start, shift);
+	ms->ms_unflushed_frees = zfs_range_tree_create_flags(
+	    &metaslab_rt_ops, type, mrap, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));
 
-	ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_trim = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
@@ -3897,7 +3920,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);
 
-	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	condense_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "condense_tree"));
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
@@ -3954,8 +3980,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
-	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
-	    start, shift);
+	zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
 	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 373636c6925..fc2b17606bd 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
     zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
 
-zfs_range_tree_t *
-zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+static zfs_range_tree_t *
+zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
-    uint64_t gap)
+    uint64_t gap, uint64_t flags, const char *name)
 {
 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
 
@@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
 
 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
+	rt->rt_flags = flags;
+	rt->rt_name = name;
 	rt->rt_arg = arg;
 	rt->rt_type = type;
 	rt->rt_start = start;
@@ -247,11 +249,30 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
 	return (rt);
 }
 
+zfs_range_tree_t *
+zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t gap)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
+	    0, NULL));
+}
+
 zfs_range_tree_t *
 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
 {
-	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    0, NULL));
+}
+
+zfs_range_tree_t *
+zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t flags, const char *name)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    flags, name));
 }
 
 void
@@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt)
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
+	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
+		kmem_strfree((char *)(uintptr_t)rt->rt_name);
+
 	zfs_btree_destroy(&rt->rt_root);
 	kmem_free(rt, sizeof (*rt));
 }
@@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     int64_t delta)
 {
 	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to decrease fill to or "
-		    "below 0; probable double remove in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
+		    "or below 0; probable double remove in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
 	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
 	    zfs_rs_get_start(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to increase fill beyond "
-		    "max; probable double add in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
+		    "beyond max; probable double add in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
@@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
-		if (gap == 0) {
-			zfs_panic_recover("zfs: adding existent segment to "
-			    "range tree (offset=%llx size=%llx)",
-			    (longlong_t)start, (longlong_t)size);
-			return;
-		}
 		uint64_t rstart = zfs_rs_get_start(rs, rt);
 		uint64_t rend = zfs_rs_get_end(rs, rt);
+		if (gap == 0) {
+			zfs_panic_recover("zfs: rt=%s: adding segment "
+			    "(offset=%llx size=%llx) overlapping with existing "
+			    "one (offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
+			return;
+		}
 		if (rstart <= start && rend >= end) {
 			zfs_range_tree_adjust_fill(rt, rs, fill);
 			return;
@@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
+	uint64_t rstart, rend;
 	boolean_t left_over, right_over;
 
 	VERIFY3U(size, !=, 0);
@@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 
 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
-		zfs_panic_recover("zfs: removing nonexistent segment from "
-		    "range tree (offset=%llx size=%llx)",
-		    (longlong_t)start, (longlong_t)size);
+		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
+		    "from range tree (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
 		return;
 	}
 
+	rstart = zfs_rs_get_start(rs, rt);
+	rend = zfs_rs_get_end(rs, rt);
+
 	/*
 	 * Range trees with gap support must only remove complete segments
 	 * from the tree. This allows us to maintain accurate fill accounting
@@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
 			if (zfs_rs_get_fill(rs, rt) == size) {
-				start = zfs_rs_get_start(rs, rt);
-				end = zfs_rs_get_end(rs, rt);
+				start = rstart;
+				end = rend;
 				size = end - start;
 			} else {
 				zfs_range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
-		} else if (zfs_rs_get_start(rs, rt) != start ||
-		    zfs_rs_get_end(rs, rt) != end) {
-			zfs_panic_recover("zfs: freeing partial segment of "
-			    "gap tree (offset=%llx size=%llx) of "
+		} else if (rstart != start || rend != end) {
+			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
+			    "of gap tree (offset=%llx size=%llx) of "
 			    "(offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
 			    (longlong_t)start, (longlong_t)size,
-			    (longlong_t)zfs_rs_get_start(rs, rt),
-			    (longlong_t)zfs_rs_get_end(rs, rt) -
-			    zfs_rs_get_start(rs, rt));
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
 			return;
 		}
 	}
 
-	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
-	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
+	if (!(rstart <= start && rend >= end)) {
+		panic("zfs: rt=%s: removing segment "
+		    "(offset=%llx size=%llx) not completely overlapped by "
+		    "existing one (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt),
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rstart, (longlong_t)(rend - rstart));
+		return;
+	}
 
-	left_over = (zfs_rs_get_start(rs, rt) != start);
-	right_over = (zfs_rs_get_end(rs, rt) != end);
+	left_over = (rstart != start);
+	right_over = (rend != end);
 
 	zfs_range_tree_stat_decr(rt, rs);
 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index aa4038a7526..70b14fb9b2c 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
+char *
+vdev_rt_name(vdev_t *vd, const char *name)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
+	    spa_name(vd->vdev_spa),
+	    (u_longlong_t)vd->vdev_guid,
+	    name));
+}
+
+static char *
+vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
+	    spa_name(vd->vdev_spa),
+	    (u_longlong_t)vd->vdev_guid,
+	    name,
+	    dtl_type));
+}
+
 /*
  * Virtual device management.
  */
@@ -695,8 +714,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
-	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
-	    ZFS_RANGE_SEG64, NULL, 0, 0);
+	vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));
 
 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
@@ -750,8 +770,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int t = 0; t < DTL_TYPES; t++) {
-		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		vd->vdev_dtl[t] = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
 	}
 
 	txg_list_create(&vd->vdev_ms_list, spa,
@@ -3458,7 +3479,9 @@ vdev_dtl_load(vdev_t *vd)
 			return (error);
 		ASSERT(vd->vdev_dtl_sm != NULL);
 
-		rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+		rt = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt"));
 		error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
 		if (error == 0) {
 			mutex_enter(&vd->vdev_dtl_lock);
@@ -3606,7 +3629,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
-	rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync"));
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 4274728578a..9243c76e810 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg)
 
 	abd_t *deadbeef = vdev_initialize_block_alloc();
 
-	vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	vd->vdev_initialize_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree"));
 
 	for (uint64_t i = 0; !vd->vdev_detached &&
 	    i < vd->vdev_top->vdev_ms_count; i++) {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index ecb6c7f50b4..a5fa9a60493 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -4589,8 +4589,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		uint64_t shift, start;
 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
 		    raidvd, msp, &start, &shift);
-		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
-		    start, shift);
+		zfs_range_tree_t *rt = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
+		    metaslab_rt_name(msp->ms_group, msp,
+		    "spa_raidz_expand_thread:rt"));
 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
 		    rt);
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 0e296606d03..cf259788ccf 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg)
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	vr->vr_top_vdev = vd;
 	vr->vr_scan_msp = NULL;
-	vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
-	    0, 0);
+	vr->vr_scan_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree"));
 	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
 
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index db79ded6dce..3887be4bd54 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd)
 	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
 	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
-	svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	svr->svr_allocd_segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs"));
 	svr->svr_vdev_id = vd->vdev_id;
 
 	for (int i = 0; i < TXG_SIZE; i++) {
-		svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		svr->svr_frees[i] = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees"));
 		list_create(&svr->svr_new_segments[i],
 		    sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
@@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
 	 * relative to the start of the range to be copied (i.e. relative to the
 	 * local variable "start").
 	 */
-	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
-	    ZFS_RANGE_SEG64, NULL, 0, 0);
+	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs"));
 
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
@@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 	 * allocated segments that we are copying.  We may also be copying
 	 * free segments (of up to vdev_removal_max_span bytes).
 	 */
-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs"));
 	for (;;) {
 		zfs_range_tree_t *rt = svr->svr_allocd_segs;
 		zfs_range_seg_t *rs = zfs_range_tree_first(rt);
@@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg)
 	vca.vca_read_error_bytes = 0;
 	vca.vca_write_error_bytes = 0;
 
-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs"));
 
 	mutex_enter(&svr->svr_lock);
 
@@ -1895,8 +1900,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 		    vdev_indirect_mapping_max_offset(vim));
 	}
 
-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME,
+	    vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs"));
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 842bb3e690d..fc8d5b8e9a8 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -902,7 +902,9 @@ vdev_trim_thread(void *arg)
 	ta.trim_vdev = vd;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_flags = 0;
 
@@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg)
 			 * Allocate an empty range tree which is swapped in
 			 * for the existing ms_trim tree while it is processed.
 			 */
-			trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-			    NULL, 0, 0);
+			trim_tree = zfs_range_tree_create_flags(
+			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+			    ZFS_RT_F_DYN_NAME,
+			    vdev_rt_name(vd, "autotrim_tree"));
 			zfs_range_tree_swap(&msp->ms_trim, &trim_tree);
 			ASSERT(zfs_range_tree_is_empty(msp->ms_trim));
 
@@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg)
 				if (!cvd->vdev_ops->vdev_op_leaf)
 					continue;
 
-				ta->trim_tree = zfs_range_tree_create(NULL,
-				    ZFS_RANGE_SEG64, NULL, 0, 0);
+				ta->trim_tree = zfs_range_tree_create_flags(
+				    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+				    ZFS_RT_F_DYN_NAME,
+				    vdev_rt_name(vd, "autotrim_tree"));
 				zfs_range_tree_walk(trim_tree,
 				    vdev_trim_range_add, ta);
 			}
@@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg)
 	vd->vdev_trim_secure = 0;
 
 	ta.trim_vdev = vd;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
@@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
 	ASSERT(!vd->vdev_top->vdev_rz_expanding);
 
 	ta.trim_vdev = vd;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_SIMPLE;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;

From 92da9e0e938e92fc69490f590db14f0b9c177613 Mon Sep 17 00:00:00 2001
From: Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com>
Date: Thu, 31 Jul 2025 18:02:09 +0300
Subject: [PATCH 57/72] ZVOL: Implement zvol_alloc() function on FreeBSD side

Implement the zvol_alloc() function on the FreeBSD side to increase code
base compatibility with Linux. Also, fix an issue with returning late
when volmode=none.

Sponsored-by: vStack, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Fedor Uporov <fuporov.vstack@gmail.com>
Closes #17482
---
 module/os/freebsd/zfs/zvol_os.c | 141 ++++++++++++++++++--------------
 1 file changed, 80 insertions(+), 61 deletions(-)

diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index 5ec35a9de10..7b6f84178ec 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -1314,6 +1314,79 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 	dataset_kstats_rename(&zv->zv_kstat, newname);
 }
 
+/*
+ * Allocate a zvol_state_t and set up its GEOM provider or cdev per volmode.
+ * Returns NULL on error or volmode=none; GEOM mode keeps topology locked.
+ */
+static zvol_state_t *
+zvol_alloc(const char *name, uint64_t volblocksize)
+{
+	zvol_state_t *zv;
+	uint64_t volmode;
+
+	if (dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL) != 0)
+		return (NULL);
+
+	if (volmode == ZFS_VOLMODE_DEFAULT)
+		volmode = zvol_volmode;
+
+	if (volmode == ZFS_VOLMODE_NONE)
+		return (NULL);
+
+	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+	zv->zv_hash = zvol_name_hash(name);
+	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
+	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+	zv->zv_volmode = volmode;
+	zv->zv_volblocksize = volblocksize;
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp;
+		struct g_geom *gp;
+
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_bio_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = 0;
+		pp->private = zv;
+
+		zsg->zsg_provider = pp;
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) != 0) {
+			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+			kmem_free(zv, sizeof (zvol_state_t));
+			return (NULL);
+		}
+
+		dev->si_iosize_max = maxphys;
+		zsd->zsd_cdev = dev;
+		knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
+	}
+	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+	return (zv);
+}
+
 /*
  * Remove minor node for the specified volume.
  */
@@ -1368,7 +1441,7 @@ zvol_os_create_minor(const char *name)
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
-	uint64_t volmode, hash, len;
+	uint64_t hash, len;
 	int error;
 	bool replayed_zil = B_FALSE;
 
@@ -1400,70 +1473,15 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	error = dsl_prop_get_integer(name,
-	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
-	if (error)
+	zv = zvol_alloc(name, doi->doi_data_block_size);
+	if (zv == NULL) {
+		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
-
-	if (volmode == ZFS_VOLMODE_DEFAULT)
-		volmode = zvol_volmode;
-	if (volmode == ZFS_VOLMODE_NONE)
-		goto out_dmu_objset_disown;
-
-	/*
-	 * zvol_alloc equivalent ...
-	 */
-	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
-	zv->zv_hash = hash;
-	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
-	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
-	zv->zv_volmode = volmode;
-	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
-		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
-		struct g_provider *pp;
-		struct g_geom *gp;
-
-		g_topology_lock();
-		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
-		gp->start = zvol_geom_bio_start;
-		gp->access = zvol_geom_access;
-		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
-		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
-		pp->sectorsize = DEV_BSIZE;
-		pp->mediasize = 0;
-		pp->private = zv;
-
-		zsg->zsg_provider = pp;
-	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
-		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
-		struct cdev *dev;
-		struct make_dev_args args;
-
-		make_dev_args_init(&args);
-		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
-		args.mda_devsw = &zvol_cdevsw;
-		args.mda_cr = NULL;
-		args.mda_uid = UID_ROOT;
-		args.mda_gid = GID_OPERATOR;
-		args.mda_mode = 0640;
-		args.mda_si_drv2 = zv;
-		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
-		    == 0) {
-			dev->si_iosize_max = maxphys;
-			zsd->zsd_cdev = dev;
-			knlist_init_sx(&zsd->zsd_selinfo.si_note,
-			    &zv->zv_state_lock);
-		}
 	}
-	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
-	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
-	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 
 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volblocksize = doi->doi_data_block_size;
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
@@ -1494,13 +1512,14 @@ zvol_os_create_minor(const char *name)
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
-	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+	if (error == 0 && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
+		/* GEOM topology lock was taken inside zvol_alloc(). */
 		g_topology_unlock();
 	}
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
-	if (error == 0 && volmode != ZFS_VOLMODE_NONE) {
+	if (error == 0 && zv->zv_volmode != ZFS_VOLMODE_NONE) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		zvol_minors++;

From 1aec627c60fe9efc3313e553cef389adda08e7b4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 1 Aug 2025 08:51:47 +1000
Subject: [PATCH 58/72] linux/atomic: fill out API for atomic pointer ops

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17580
---
 include/os/linux/spl/sys/atomic.h | 32 +++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/include/os/linux/spl/sys/atomic.h b/include/os/linux/spl/sys/atomic.h
index b2a39d7d6cb..f4bcd58bd28 100644
--- a/include/os/linux/spl/sys/atomic.h
+++ b/include/os/linux/spl/sys/atomic.h
@@ -71,6 +71,22 @@ atomic_cas_ptr(volatile void *target,  void *cmp, void *newval)
 	return ((void *)atomic_cas_64((volatile uint64_t *)target,
 	    (uint64_t)cmp, (uint64_t)newval));
 }
+static __inline__ void *
+atomic_swap_ptr(volatile void *target, void *newval)
+{
+	return ((void *)atomic_swap_64((volatile uint64_t *)target,
+	    (uint64_t)newval));
+}
+static __inline__ void *
+atomic_load_ptr(volatile void *target)
+{
+	return ((void *)atomic_load_64((volatile uint64_t *)target));
+}
+static __inline__ void
+atomic_store_ptr(volatile void *target, void *newval)
+{
+	atomic_store_64((volatile uint64_t *)target, (uint64_t)newval);
+}
 #else /* _LP64 */
 static __inline__ void *
 atomic_cas_ptr(volatile void *target,  void *cmp, void *newval)
@@ -78,6 +94,22 @@ atomic_cas_ptr(volatile void *target,  void *cmp, void *newval)
 	return ((void *)atomic_cas_32((volatile uint32_t *)target,
 	    (uint32_t)cmp, (uint32_t)newval));
 }
+static __inline__ void *
+atomic_swap_ptr(volatile void *target, void *newval)
+{
+	return ((void *)atomic_swap_32((volatile uint32_t *)target,
+	    (uint32_t)newval));
+}
+static __inline__ void *
+atomic_load_ptr(volatile void *target)
+{
+	return ((void *)atomic_load_32((volatile uint32_t *)target));
+}
+static __inline__ void
+atomic_store_ptr(volatile void *target, void *newval)
+{
+	atomic_store_32((volatile uint32_t *)target, (uint32_t)newval);
+}
 #endif /* _LP64 */
 
 #endif  /* _SPL_ATOMIC_H */

From 0f8a1105eea2d4d4bf7e05979cdf858be3f4c3b0 Mon Sep 17 00:00:00 2001
From: khoang98 <43098119+khoang98@users.noreply.github.com>
Date: Fri, 1 Aug 2025 19:47:41 -0400
Subject: [PATCH 59/72] Skip dbuf_evict_one() from dbuf_evict_notify() for
 reclaim thread

Avoid calling dbuf_evict_one() from memory reclaim contexts (e.g. Linux
kswapd, FreeBSD pagedaemon). This prevents deadlock caused by reclaim
threads waiting for the dbuf hash lock in the call sequence:
dbuf_evict_one -> dbuf_destroy -> arc_buf_destroy

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Kaitlin Hoang <kthoang@amazon.com>
Closes #17561
---
 include/os/freebsd/spl/sys/misc.h |  5 +++++
 include/os/linux/spl/sys/misc.h   |  6 ++++++
 include/sys/zfs_context.h         |  5 +++++
 module/os/freebsd/spl/spl_misc.c  |  9 +++++++++
 module/os/linux/spl/spl-thread.c  | 12 ++++++++++++
 module/zfs/dbuf.c                 | 10 +++++++++-
 6 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h
index 091ebe77281..acce8734b2c 100644
--- a/include/os/freebsd/spl/sys/misc.h
+++ b/include/os/freebsd/spl/sys/misc.h
@@ -56,4 +56,9 @@ struct opensolaris_utsname {
 #define	task_io_account_read(n)
 #define	task_io_account_write(n)
 
+/*
+ * Check if the current thread is a memory reclaim thread.
+ */
+extern int current_is_reclaim_thread(void);
+
 #endif	/* _OPENSOLARIS_SYS_MISC_H_ */
diff --git a/include/os/linux/spl/sys/misc.h b/include/os/linux/spl/sys/misc.h
index 0b44786f8a6..fbaaf229bd1 100644
--- a/include/os/linux/spl/sys/misc.h
+++ b/include/os/linux/spl/sys/misc.h
@@ -24,7 +24,13 @@
 #define	_OS_LINUX_SPL_MISC_H
 
 #include <linux/kobject.h>
+#include <linux/swap.h>
 
 extern void spl_signal_kobj_evt(struct block_device *bdev);
 
+/*
+ * Check if the current thread is a memory reclaim thread.
+ */
+extern int current_is_reclaim_thread(void);
+
 #endif
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 0f76c7adcf8..7112d3ef5c9 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -224,6 +224,11 @@ typedef pthread_t	kthread_t;
 #define	thread_join(t)	pthread_join((pthread_t)(t), NULL)
 
 #define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Always returns false in userspace (no memory reclaim thread).
+ */
+#define	current_is_reclaim_thread()	(0)
 
 /* in libzpool, p0 exists only to have its address taken */
 typedef struct proc {
diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c
index f9125a067cd..3f360d167b1 100644
--- a/module/os/freebsd/spl/spl_misc.c
+++ b/module/os/freebsd/spl/spl_misc.c
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
 	va_end(ap);
 }
 
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (curproc == pageproc);
+}
 
 SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
     opensolaris_utsname_init, NULL);
diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c
index 1398483a3ac..f42f455222d 100644
--- a/module/os/linux/spl/spl-thread.c
+++ b/module/os/linux/spl/spl-thread.c
@@ -28,6 +28,7 @@
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/string.h>
+#include <sys/misc.h>
 
 /*
  * Thread interfaces
@@ -197,3 +198,14 @@ issig(void)
 }
 
 EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index f1b5a17f337..a4cc79c35c2 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
 	 * and grabbing the lock results in massive lock contention.
 	 */
 	if (size > dbuf_cache_target_bytes()) {
-		if (size > dbuf_cache_hiwater_bytes())
+		/*
+		 * Avoid calling dbuf_evict_one() from memory reclaim context
+		 * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
+		 * Memory reclaim threads can get stuck waiting for the dbuf
+		 * hash lock.
+		 */
+		if (size > dbuf_cache_hiwater_bytes() &&
+		    !current_is_reclaim_thread()) {
 			dbuf_evict_one();
+		}
 		cv_signal(&dbuf_evict_cv);
 	}
 }

From 0b6fd024a7873a8a83abd22b2a71d59a6034f833 Mon Sep 17 00:00:00 2001
From: Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com>
Date: Wed, 6 Aug 2025 17:10:52 +0300
Subject: [PATCH 60/72] ZVOL: Unify zvol minors operations and improve error
 handling

Zvol minors creation is now dispatched through spa_zvol_taskq, as is
already done for the remove/rename zvol minors functions. The zvol
minors creation functions are refactored accordingly:
- zvol_create_minor() and zvol_create_minors_recursive() are removed.
- A single zvol_create_minors() is added instead.

It also becomes possible to collect the status of zvol minors subtasks,
to detect whether any zvol minor subtask in the chain has failed.
An appropriate message is reported to the zfs_dbgmsg buffer in this case.

Sponsored-by: vStack, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Fedor Uporov <fuporov.vstack@gmail.com>
Closes #17575
---
 include/sys/zvol.h              |   3 +-
 include/sys/zvol_impl.h         |   3 +-
 lib/libzpool/kernel.c           |   8 +-
 module/os/freebsd/zfs/zvol_os.c |  50 +++--
 module/os/linux/zfs/zvol_os.c   |  36 ++--
 module/zfs/dmu_objset.c         |   2 +-
 module/zfs/dmu_recv.c           |   4 +-
 module/zfs/dsl_crypt.c          |   2 +-
 module/zfs/dsl_dataset.c        |   4 +-
 module/zfs/spa.c                |   4 +-
 module/zfs/zcp.c                |   2 +-
 module/zfs/zvol.c               | 327 ++++++++++++++++++++------------
 12 files changed, 265 insertions(+), 180 deletions(-)

diff --git a/include/sys/zvol.h b/include/sys/zvol.h
index 32e70365093..cdc9dba2a28 100644
--- a/include/sys/zvol.h
+++ b/include/sys/zvol.h
@@ -36,8 +36,7 @@
 #define	SPEC_MAXOFFSET_T	((1LL << ((NBBY * sizeof (daddr32_t)) + \
 				DEV_BSHIFT - 1)) - 1)
 
-extern void zvol_create_minor(const char *);
-extern void zvol_create_minors_recursive(const char *);
+extern void zvol_create_minors(const char *);
 extern void zvol_remove_minors(spa_t *, const char *, boolean_t);
 extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t);
 
diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h
index 038d4cb48f9..f3dd9f26f23 100644
--- a/include/sys/zvol_impl.h
+++ b/include/sys/zvol_impl.h
@@ -108,7 +108,6 @@ zvol_state_t *zvol_find_by_name_hash(const char *name,
     uint64_t hash, int mode);
 int zvol_first_open(zvol_state_t *zv, boolean_t readonly);
 uint64_t zvol_name_hash(const char *name);
-void zvol_remove_minors_impl(const char *name);
 void zvol_last_close(zvol_state_t *zv);
 void zvol_insert(zvol_state_t *zv);
 void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
@@ -132,7 +131,7 @@ void zv_request_task_free(zv_request_task_t *task);
  * platform dependent functions exported to platform independent code
  */
 void zvol_os_free(zvol_state_t *zv);
-void zvol_os_rename_minor(zvol_state_t *zv, const char *newname);
+int zvol_os_rename_minor(zvol_state_t *zv, const char *newname);
 int zvol_os_create_minor(const char *name);
 int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize);
 boolean_t zvol_os_is_zvol(const char *path);
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 48f6b0ca4e1..2e8bf160465 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1031,13 +1031,7 @@ kmem_cache_reap_active(void)
 }
 
 void
-zvol_create_minor(const char *name)
-{
-	(void) name;
-}
-
-void
-zvol_create_minors_recursive(const char *name)
+zvol_create_minors(const char *name)
 {
 	(void) name;
 }
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index 7b6f84178ec..72a7c4ea082 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -1248,9 +1248,11 @@ zvol_os_is_zvol(const char *device)
 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
+	int error = 0;
+
 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
@@ -1304,42 +1306,47 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 		args.mda_gid = GID_OPERATOR;
 		args.mda_mode = 0640;
 		args.mda_si_drv2 = zv;
-		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
-		    == 0) {
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
+		if (error == 0) {
 			dev->si_iosize_max = maxphys;
 			zsd->zsd_cdev = dev;
 		}
 	}
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 	dataset_kstats_rename(&zv->zv_kstat, newname);
+
+	return (error);
 }
 
 /*
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
-static zvol_state_t *
-zvol_alloc(const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
+    zvol_state_t **zvp)
 {
 	zvol_state_t *zv;
 	uint64_t volmode;
+	int error;
 
-	if (dsl_prop_get_integer(name,
-	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL) != 0)
-		return (NULL);
+	error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+	    &volmode, NULL);
+	if (error)
+		return (error);
 
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
-		return (NULL);
+		return (0);
 
 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
-	zv->zv_hash = zvol_name_hash(name);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_volmode = volmode;
+	zv->zv_volsize = volsize;
 	zv->zv_volblocksize = volblocksize;
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
@@ -1370,10 +1377,11 @@ zvol_alloc(const char *name, uint64_t volblocksize)
 		args.mda_gid = GID_OPERATOR;
 		args.mda_mode = 0640;
 		args.mda_si_drv2 = zv;
-		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) != 0) {
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+		if (error) {
 			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 			kmem_free(zv, sizeof (zvol_state_t));
-			return (NULL);
+			return (error);
 		}
 
 		dev->si_iosize_max = maxphys;
@@ -1384,7 +1392,8 @@ zvol_alloc(const char *name, uint64_t volblocksize)
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 
-	return (zv);
+	*zvp = zv;
+	return (error);
 }
 
 /*
@@ -1437,7 +1446,7 @@ zvol_os_free(zvol_state_t *zv)
 int
 zvol_os_create_minor(const char *name)
 {
-	zvol_state_t *zv;
+	zvol_state_t *zv = NULL;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
@@ -1473,16 +1482,15 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	zv = zvol_alloc(name, doi->doi_data_block_size);
-	if (zv == NULL) {
-		error = SET_ERROR(EAGAIN);
+	error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
+	if (error || zv == NULL)
 		goto out_dmu_objset_disown;
-	}
+
+	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
@@ -1512,14 +1520,14 @@ zvol_os_create_minor(const char *name)
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
-	if (error == 0 && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+	if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
 		/* geom was locked inside zvol_alloc() function */
 		g_topology_unlock();
 	}
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
-	if (error == 0 && zv->zv_volmode != ZFS_VOLMODE_NONE) {
+	if (error == 0 && zv) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		zvol_minors++;
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 57a9711e902..a7431cc4da9 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1302,27 +1302,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
+static int
+zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
+    zvol_state_t **zvp)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
 	uint64_t volmode;
 	int ret;
 
-	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
-		return (NULL);
+	ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
+	if (ret)
+		return (ret);
 
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
-		return (NULL);
+		return (0);
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
+	zv->zv_volsize = volsize;
 	zv->zv_volblocksize = volblocksize;
 
 	list_link_init(&zv->zv_next);
@@ -1396,12 +1399,13 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
 
-	return (zv);
+	*zvp = zv;
+	return (ret);
 
 out_kmem:
 	kmem_free(zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
-	return (NULL);
+	return (ret);
 }
 
 /*
@@ -1562,7 +1566,7 @@ zvol_os_add_disk(struct gendisk *disk)
 int
 zvol_os_create_minor(const char *name)
 {
-	zvol_state_t *zv;
+	zvol_state_t *zv = NULL;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
@@ -1611,18 +1615,16 @@ zvol_os_create_minor(const char *name)
 	if (error)
 		goto out_dmu_objset_disown;
 
-	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
-	    doi->doi_data_block_size);
-	if (zv == NULL) {
-		error = SET_ERROR(EAGAIN);
+	error = zvol_alloc(MKDEV(zvol_major, minor), name,
+	    volsize, doi->doi_data_block_size, &zv);
+	if (error || zv == NULL)
 		goto out_dmu_objset_disown;
-	}
+
 	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	/* Default */
@@ -1689,7 +1691,7 @@ zvol_os_create_minor(const char *name)
 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
 	 * directly as well.
 	 */
-	if (error == 0) {
+	if (error == 0 && zv) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		rw_exit(&zvol_state_lock);
@@ -1701,7 +1703,7 @@ zvol_os_create_minor(const char *name)
 	return (error);
 }
 
-void
+int
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
@@ -1728,6 +1730,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
 
 	dataset_kstats_rename(&zv->zv_kstat, newname);
+
+	return (0);
 }
 
 void
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index c1101088be1..c135f620800 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1370,7 +1370,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 	    6, ZFS_SPACE_CHECK_NORMAL);
 
 	if (rv == 0)
-		zvol_create_minor(name);
+		zvol_create_minors(name);
 
 	crfree(cr);
 
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 3a4bd7a1cea..afc9823decc 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -3831,11 +3831,11 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 		nvlist_free(drc->drc_keynvl);
 	} else if (!drc->drc_heal) {
 		if (drc->drc_newfs) {
-			zvol_create_minor(drc->drc_tofs);
+			zvol_create_minors(drc->drc_tofs);
 		}
 		char *snapname = kmem_asprintf("%s@%s",
 		    drc->drc_tofs, drc->drc_tosnap);
-		zvol_create_minor(snapname);
+		zvol_create_minors(snapname);
 		kmem_strfree(snapname);
 	}
 
diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c
index db568f42d24..6b6bb8d45b6 100644
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@@ -866,7 +866,7 @@ spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
 	dsl_pool_rele(dp, FTAG);
 
 	/* create any zvols under this ds */
-	zvol_create_minors_recursive(dsname);
+	zvol_create_minors(dsname);
 
 	return (0);
 
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index c0a7872c40a..21c8a682199 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2005,7 +2005,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 	if (error == 0) {
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
-			zvol_create_minor(nvpair_name(pair));
+			zvol_create_minors(nvpair_name(pair));
 		}
 	}
 
@@ -3413,7 +3413,7 @@ dsl_dataset_clone(const char *clone, const char *origin)
 	    6, ZFS_SPACE_CHECK_NORMAL);
 
 	if (rv == 0)
-		zvol_create_minor(clone);
+		zvol_create_minors(clone);
 
 	crfree(cr);
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c0876c93540..31f152a8059 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -5905,7 +5905,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 	}
 
 	if (firstopen)
-		zvol_create_minors_recursive(spa_name(spa));
+		zvol_create_minors(spa_name(spa));
 
 	*spapp = spa;
 
@@ -6883,7 +6883,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 
 	mutex_exit(&spa_namespace_lock);
 
-	zvol_create_minors_recursive(pool);
+	zvol_create_minors(pool);
 
 	spa_import_os(spa);
 
diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c
index 6960ea360b1..9aecf67fd25 100644
--- a/module/zfs/zcp.c
+++ b/module/zfs/zcp.c
@@ -1175,7 +1175,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
 	for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL);
 	    pair != NULL;
 	    pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) {
-		zvol_create_minor(nvpair_name(pair));
+		zvol_create_minors(nvpair_name(pair));
 	}
 	fnvlist_free(runinfo.zri_new_zvols);
 
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 4116e16133b..7e264f308cf 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -102,6 +102,7 @@ extern int zfs_bclone_wait_dirty;
 zv_taskq_t zvol_taskqs;
 
 typedef enum {
+	ZVOL_ASYNC_CREATE_MINORS,
 	ZVOL_ASYNC_REMOVE_MINORS,
 	ZVOL_ASYNC_RENAME_MINORS,
 	ZVOL_ASYNC_SET_SNAPDEV,
@@ -110,10 +111,14 @@ typedef enum {
 } zvol_async_op_t;
 
 typedef struct {
-	zvol_async_op_t op;
-	char name1[MAXNAMELEN];
-	char name2[MAXNAMELEN];
-	uint64_t value;
+	zvol_async_op_t zt_op;		/* operation to perform */
+	char zt_name1[MAXNAMELEN];	/* dataset name (old name on rename) */
+	char zt_name2[MAXNAMELEN];	/* new name (rename only) */
+	uint64_t zt_value;		/* snapdev/volmode property value */
+	uint32_t zt_total;		/* subtasks accounted for */
+	uint32_t zt_done;		/* subtasks counted as done */
+	int32_t zt_status;		/* 0, or -1 once total != done */
+	int zt_error;			/* last error reported on failure */
 } zvol_task_t;
 
 zv_request_task_t *
@@ -1421,6 +1426,57 @@ zvol_create_minors_cb(const char *dsname, void *arg)
 	return (0);
 }
 
+static void
+zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done,
+    int error)
+{
+	/* Fold in the counts; any shortfall marks the task as failed. */
+	task->zt_total += total;
+	task->zt_done += done;
+	if (task->zt_total != task->zt_done) {
+		task->zt_status = -1;
+		if (error)
+			task->zt_error = error;
+	}
+}
+
+static const char *
+zvol_task_op_msg(zvol_async_op_t op)
+{
+	switch (op) {
+	case ZVOL_ASYNC_CREATE_MINORS:
+		return ("create");
+	case ZVOL_ASYNC_REMOVE_MINORS:
+		return ("remove");
+	case ZVOL_ASYNC_RENAME_MINORS:
+		return ("rename");
+	case ZVOL_ASYNC_SET_SNAPDEV:
+	case ZVOL_ASYNC_SET_VOLMODE:
+		return ("set property");
+	default:
+		return ("unknown");
+	}
+	/* Not reached: every arm of the switch above returns. */
+	__builtin_unreachable();
+	return (NULL);
+}
+
+static void
+zvol_task_report_status(zvol_task_t *task)
+{
+	/* A zero zt_status means no failure was recorded: stay silent. */
+	if (task->zt_status == 0)
+		return;
+
+	if (task->zt_error) {
+		dprintf("The %s minors zvol task was not ok, last error %d\n",
+		    zvol_task_op_msg(task->zt_op), task->zt_error);
+	} else {
+		dprintf("The %s minors zvol task was not ok\n",
+		    zvol_task_op_msg(task->zt_op));
+	}
+}
+
 /*
  * Create minors for the specified dataset, including children and snapshots.
  * Pay attention to the 'snapdev' property and iterate over the snapshots
@@ -1438,14 +1494,27 @@ zvol_create_minors_cb(const char *dsname, void *arg)
  * 'visible' (which also verifies that the parent is a zvol), and if so,
  * a minor node for that snapshot is created.
  */
-void
-zvol_create_minors_recursive(const char *name)
+static void
+zvol_create_minors_impl(zvol_task_t *task)
 {
+	const char *name = task->zt_name1;
 	list_t minors_list;
 	minors_job_t *job;
+	uint64_t snapdev;
+	int total = 0, done = 0, last_error = 0, error;
 
-	if (zvol_inhibit_dev)
+	/*
+	 * Note: the dsl_pool_config_lock must not be held.
+	 * Minor node creation needs to obtain the zvol_state_lock.
+	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
+	 * config lock.  Therefore, we can't have the config lock now if
+	 * we are going to wait for the zvol_state_lock, because it
+	 * would be a lock order inversion which could lead to deadlock.
+	 */
+
+	if (zvol_inhibit_dev) {
 		return;
+	}
 
 	/*
 	 * This is the list for prefetch jobs. Whenever we found a match
@@ -1461,13 +1530,16 @@ zvol_create_minors_recursive(const char *name)
 
 
 	if (strchr(name, '@') != NULL) {
-		uint64_t snapdev;
-
-		int error = dsl_prop_get_integer(name, "snapdev",
-		    &snapdev, NULL);
-
-		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
-			(void) zvol_os_create_minor(name);
+		error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL);
+		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) {
+			error = zvol_os_create_minor(name);
+			if (error == 0) {
+				done++;
+			} else {
+				last_error = error;
+			}
+			total++;
+		}
 	} else {
 		fstrans_cookie_t cookie = spl_fstrans_mark();
 		(void) dmu_objset_find(name, zvol_create_minors_cb,
@@ -1482,41 +1554,30 @@ zvol_create_minors_recursive(const char *name)
 	 * sequentially.
 	 */
 	while ((job = list_remove_head(&minors_list)) != NULL) {
-		if (!job->error)
-			(void) zvol_os_create_minor(job->name);
+		if (!job->error) {
+			error = zvol_os_create_minor(job->name);
+			if (error == 0) {
+				done++;
+			} else {
+				last_error = error;
+			}
+		} else if (job->error == EINVAL) {
+			/*
+			 * An objset with the name requested by this job
+			 * exists, but its type is not zvol.  Count it as
+			 * done rather than as a failure.
+			 */
+			done++;
+		} else {
+			last_error = job->error;
+		}
+		total++;
 		kmem_strfree(job->name);
 		kmem_free(job, sizeof (minors_job_t));
 	}
 
 	list_destroy(&minors_list);
-}
-
-void
-zvol_create_minor(const char *name)
-{
-	/*
-	 * Note: the dsl_pool_config_lock must not be held.
-	 * Minor node creation needs to obtain the zvol_state_lock.
-	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
-	 * config lock.  Therefore, we can't have the config lock now if
-	 * we are going to wait for the zvol_state_lock, because it
-	 * would be a lock order inversion which could lead to deadlock.
-	 */
-
-	if (zvol_inhibit_dev)
-		return;
-
-	if (strchr(name, '@') != NULL) {
-		uint64_t snapdev;
-
-		int error = dsl_prop_get_integer(name,
-		    "snapdev", &snapdev, NULL);
-
-		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
-			(void) zvol_os_create_minor(name);
-	} else {
-		(void) zvol_os_create_minor(name);
-	}
+	zvol_task_update_status(task, total, done, last_error);
 }
 
 /*
@@ -1564,10 +1625,11 @@ zvol_free_task(void *arg)
 	zvol_os_free(arg);
 }
 
-void
-zvol_remove_minors_impl(const char *name)
+static void
+zvol_remove_minors_impl(zvol_task_t *task)
 {
 	zvol_state_t *zv, *zv_next;
+	const char *name = task ? task->zt_name1 : NULL;
 	int namelen = ((name) ? strlen(name) : 0);
 	taskqid_t t;
 	list_t delay_list, free_list;
@@ -1649,13 +1711,13 @@ zvol_remove_minors_impl(const char *name)
 }
 
 /* Remove minor for this specific volume only */
-static void
+static int
 zvol_remove_minor_impl(const char *name)
 {
 	zvol_state_t *zv = NULL, *zv_next;
 
 	if (zvol_inhibit_dev)
-		return;
+		return (0);
 
 	rw_enter(&zvol_state_lock, RW_WRITER);
 
@@ -1671,7 +1733,7 @@ zvol_remove_minor_impl(const char *name)
 
 	if (zv == NULL) {
 		rw_exit(&zvol_state_lock);
-		return;
+		return (ENOENT);
 	}
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1685,7 +1747,7 @@ zvol_remove_minor_impl(const char *name)
 		mutex_exit(&zv->zv_state_lock);
 		rw_exit(&zvol_state_lock);
 		zvol_remove_minor_task(zv);
-		return;
+		return (0);
 	}
 
 	zvol_remove(zv);
@@ -1695,16 +1757,20 @@ zvol_remove_minor_impl(const char *name)
 	rw_exit(&zvol_state_lock);
 
 	zvol_os_free(zv);
+
+	return (0);
 }
 
 /*
  * Rename minors for specified dataset including children and snapshots.
  */
 static void
-zvol_rename_minors_impl(const char *oldname, const char *newname)
+zvol_rename_minors_impl(zvol_task_t *task)
 {
 	zvol_state_t *zv, *zv_next;
-	int oldnamelen;
+	const char *oldname = task->zt_name1;
+	const char *newname = task->zt_name2;
+	int total = 0, done = 0, last_error = 0, error = 0, oldnamelen;
 
 	if (zvol_inhibit_dev)
 		return;
@@ -1719,24 +1785,31 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
 		mutex_enter(&zv->zv_state_lock);
 
 		if (strcmp(zv->zv_name, oldname) == 0) {
-			zvol_os_rename_minor(zv, newname);
+			error = zvol_os_rename_minor(zv, newname);
 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
 		    (zv->zv_name[oldnamelen] == '/' ||
 		    zv->zv_name[oldnamelen] == '@')) {
 			char *name = kmem_asprintf("%s%c%s", newname,
 			    zv->zv_name[oldnamelen],
 			    zv->zv_name + oldnamelen + 1);
-			zvol_os_rename_minor(zv, name);
+			error = zvol_os_rename_minor(zv, name);
 			kmem_strfree(name);
 		}
-
+		if (error) {
+			last_error = error;
+		} else {
+			done++;
+		}
+		total++;
 		mutex_exit(&zv->zv_state_lock);
 	}
 
 	rw_exit(&zvol_state_lock);
+	zvol_task_update_status(task, total, done, last_error);
 }
 
 typedef struct zvol_snapdev_cb_arg {
+	zvol_task_t *task;
 	uint64_t snapdev;
 } zvol_snapdev_cb_arg_t;
 
@@ -1744,26 +1817,31 @@ static int
 zvol_set_snapdev_cb(const char *dsname, void *param)
 {
 	zvol_snapdev_cb_arg_t *arg = param;
+	int error = 0;
 
 	if (strchr(dsname, '@') == NULL)
 		return (0);
 
 	switch (arg->snapdev) {
 		case ZFS_SNAPDEV_VISIBLE:
-			(void) zvol_os_create_minor(dsname);
+			error = zvol_os_create_minor(dsname);
 			break;
 		case ZFS_SNAPDEV_HIDDEN:
-			(void) zvol_remove_minor_impl(dsname);
+			error = zvol_remove_minor_impl(dsname);
 			break;
 	}
 
+	zvol_task_update_status(arg->task, 1, error == 0, error);
 	return (0);
 }
 
 static void
-zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+zvol_set_snapdev_impl(zvol_task_t *task)
 {
-	zvol_snapdev_cb_arg_t arg = {snapdev};
+	const char *name = task->zt_name1;
+	uint64_t snapdev = task->zt_value;
+
+	zvol_snapdev_cb_arg_t arg = {task, snapdev};
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	/*
 	 * The zvol_set_snapdev_sync() sets snapdev appropriately
@@ -1774,11 +1852,14 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev)
 }
 
 static void
-zvol_set_volmode_impl(char *name, uint64_t volmode)
+zvol_set_volmode_impl(zvol_task_t *task)
 {
+	const char *name = task->zt_name1;
+	uint64_t volmode = task->zt_value;
 	fstrans_cookie_t cookie;
 	uint64_t old_volmode;
 	zvol_state_t *zv;
+	int error = 0;
 
 	if (strchr(name, '@') != NULL)
 		return;
@@ -1791,7 +1872,7 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
 	 */
 	zv = zvol_find_by_name(name, RW_NONE);
 	if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
-			return;
+		return;
 	if (zv != NULL) {
 		old_volmode = zv->zv_volmode;
 		mutex_exit(&zv->zv_state_lock);
@@ -1802,51 +1883,34 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
 	cookie = spl_fstrans_mark();
 	switch (volmode) {
 		case ZFS_VOLMODE_NONE:
-			(void) zvol_remove_minor_impl(name);
+			error = zvol_remove_minor_impl(name);
 			break;
 		case ZFS_VOLMODE_GEOM:
 		case ZFS_VOLMODE_DEV:
-			(void) zvol_remove_minor_impl(name);
-			(void) zvol_os_create_minor(name);
+			error = zvol_remove_minor_impl(name);
+			/*
+			 * The removal above may find nothing to remove when
+			 * volmode is being switched away from 'none'; ENOENT
+			 * is therefore not treated as an error here.
+			 */
+			if (error == ENOENT)
+				error = 0;
+			else if (error)
+				break;
+			error = zvol_os_create_minor(name);
 			break;
 		case ZFS_VOLMODE_DEFAULT:
-			(void) zvol_remove_minor_impl(name);
+			error = zvol_remove_minor_impl(name);
 			if (zvol_volmode == ZFS_VOLMODE_NONE)
 				break;
 			else /* if zvol_volmode is invalid defaults to "geom" */
-				(void) zvol_os_create_minor(name);
+				error = zvol_os_create_minor(name);
 			break;
 	}
+	zvol_task_update_status(task, 1, error == 0, error);
 	spl_fstrans_unmark(cookie);
 }
 
-static zvol_task_t *
-zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
-    uint64_t value)
-{
-	zvol_task_t *task;
-
-	/* Never allow tasks on hidden names. */
-	if (name1[0] == '$')
-		return (NULL);
-
-	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
-	task->op = op;
-	task->value = value;
-
-	strlcpy(task->name1, name1, sizeof (task->name1));
-	if (name2 != NULL)
-		strlcpy(task->name2, name2, sizeof (task->name2));
-
-	return (task);
-}
-
-static void
-zvol_task_free(zvol_task_t *task)
-{
-	kmem_free(task, sizeof (zvol_task_t));
-}
-
 /*
  * The worker thread function performed asynchronously.
  */
@@ -1855,25 +1919,29 @@ zvol_task_cb(void *arg)
 {
 	zvol_task_t *task = arg;
 
-	switch (task->op) {
+	switch (task->zt_op) {
+	case ZVOL_ASYNC_CREATE_MINORS:
+		zvol_create_minors_impl(task);
+		break;
 	case ZVOL_ASYNC_REMOVE_MINORS:
-		zvol_remove_minors_impl(task->name1);
+		zvol_remove_minors_impl(task);
 		break;
 	case ZVOL_ASYNC_RENAME_MINORS:
-		zvol_rename_minors_impl(task->name1, task->name2);
+		zvol_rename_minors_impl(task);
 		break;
 	case ZVOL_ASYNC_SET_SNAPDEV:
-		zvol_set_snapdev_impl(task->name1, task->value);
+		zvol_set_snapdev_impl(task);
 		break;
 	case ZVOL_ASYNC_SET_VOLMODE:
-		zvol_set_volmode_impl(task->name1, task->value);
+		zvol_set_volmode_impl(task);
 		break;
 	default:
 		VERIFY(0);
 		break;
 	}
 
-	zvol_task_free(task);
+	zvol_task_report_status(task);
+	kmem_free(task, sizeof (zvol_task_t));
 }
 
 typedef struct zvol_set_prop_int_arg {
@@ -1918,23 +1986,17 @@ zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 	if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
 		return (0);
 
-	switch (zsda->zsda_prop) {
-		case ZFS_PROP_VOLMODE:
-			task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname,
-			    NULL, prop);
-			break;
-		case ZFS_PROP_SNAPDEV:
-			task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
-			    NULL, prop);
-			break;
-		default:
-			task = NULL;
-			break;
-	}
-
-	if (task == NULL)
+	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+	if (zsda->zsda_prop == ZFS_PROP_VOLMODE) {
+		task->zt_op = ZVOL_ASYNC_SET_VOLMODE;
+	} else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) {
+		task->zt_op = ZVOL_ASYNC_SET_SNAPDEV;
+	} else {
+		kmem_free(task, sizeof (zvol_task_t));
 		return (0);
-
+	}
+	task->zt_value = prop;
+	strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1));
 	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 	    task, TQ_SLEEP);
 	return (0);
@@ -1987,16 +2049,35 @@ zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
 	    zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 }
 
+void
+zvol_create_minors(const char *name)
+{
+	spa_t *spa;
+	zvol_task_t *task;
+	taskqid_t id;
+
+	if (spa_open(name, &spa, FTAG) != 0)
+		return;
+
+	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+	task->zt_op = ZVOL_ASYNC_CREATE_MINORS;
+	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+	if (id != TASKQID_INVALID)
+		taskq_wait_id(spa->spa_zvol_taskq, id);
+
+	spa_close(spa, FTAG);
+}
+
 void
 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
-	task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
-	if (task == NULL)
-		return;
-
+	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+	task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
+	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2009,10 +2090,10 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
 	zvol_task_t *task;
 	taskqid_t id;
 
-	task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
-	if (task == NULL)
-		return;
-
+	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+	task->zt_op = ZVOL_ASYNC_RENAME_MINORS;
+	strlcpy(task->zt_name1, name1, sizeof (task->zt_name1));
+	strlcpy(task->zt_name2, name2, sizeof (task->zt_name2));
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);

From 48c9b2e79da4421429430aeeb76e1e556ff2953e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 5 Feb 2025 00:47:50 +1100
Subject: [PATCH 61/72] ZTS: include microsecond timestamps on all output

When reviewing test output after a failure, it's often quite difficult
to work out the order and timing of events, and to correlate test suite
output with kernel logs.

This adds timestamps to ZTS output to help with this, in three places:

- all of the standard log_XXX functions ultimately end up in _printline,
  which now prefixes output with a timestamp. An escape hatch
  environment variable is provided for user_cmd, which often calls the
  logging functions while also depending on the captured output.

- the test runner logging function log() also now prefixes its output
  with a timestamp.

- on failure, when capturing the kernel log in zfs_dmesg.ksh, the "iso"
  time format is requested.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17045
---
 tests/test-runner/bin/test-runner.py.in | 5 ++++-
 tests/test-runner/bin/zts-report.py.in  | 4 +++-
 tests/test-runner/include/logapi.shlib  | 7 ++++++-
 tests/zfs-tests/callbacks/zfs_dmesg.ksh | 7 ++++++-
 tests/zfs-tests/include/libtest.shlib   | 4 +++-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in
index 5bf13f5c08a..2158208be6e 100755
--- a/tests/test-runner/bin/test-runner.py.in
+++ b/tests/test-runner/bin/test-runner.py.in
@@ -15,6 +15,7 @@
 #
 # Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 # Copyright (c) 2019 Datto Inc.
+# Copyright (c) 2025, Klara, Inc.
 #
 # This script must remain compatible with Python 3.6+.
 #
@@ -372,6 +373,8 @@ User: %s
         stdout/stderr/merged in its own file.
         """
 
+        timeprefix = datetime.now().strftime('[%FT%T.%f] ')
+
         logname = getpwuid(os.getuid()).pw_name
         rer = ''
         if self.reran is True:
@@ -383,7 +386,7 @@ User: %s
             msga = 'Test: %s%s ' % (self.pathname, user)
         msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer)
         pad = ' ' * (80 - (len(msga) + len(msgb)))
-        result_line = msga + pad + msgb
+        result_line = timeprefix + msga + pad + msgb
 
         # The result line is always written to the log file. If -q was
         # specified only failures are written to the console, otherwise
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 40f5083d129..00197012014 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -15,6 +15,7 @@
 #
 # Copyright (c) 2017 by Delphix. All rights reserved.
 # Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Copyright (c) 2025, Klara, Inc.
 #
 # This script must remain compatible with Python 3.6+.
 #
@@ -381,7 +382,8 @@ def process_results(pathname):
 
     prefix = '/zfs-tests/tests/(?:functional|perf/regression)/'
     pattern = \
-        r'^Test(?:\s+\(\S+\))?:' + \
+        r'^(?:\[[0-9\-T:\.]+\]\s+)?' + \
+        r'Test(?:\s+\(\S+\))?:' + \
         rf'\s*\S*{prefix}(\S+)' + \
         r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]'
     pattern_log = r'^\s*Log directory:\s*(\S*)'
diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib
index 670ecfefb98..29e0c7f1c9c 100644
--- a/tests/test-runner/include/logapi.shlib
+++ b/tests/test-runner/include/logapi.shlib
@@ -25,6 +25,7 @@
 # Use is subject to license terms.
 #
 # Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+# Copyright (c) 2025, Klara, Inc.
 #
 
 STF_PASS=0
@@ -465,7 +466,11 @@ function _endlog
 
 function _printline
 {
-	echo "$@"
+	if [[ -z "$ZTS_LOG_SUPPRESS_TIMESTAMP" ]] ; then
+		printf '[%(%FT%T.%6N)T] %s\n' now "$*"
+	else
+		echo "$@"
+	fi
 }
 
 # Output an error message
diff --git a/tests/zfs-tests/callbacks/zfs_dmesg.ksh b/tests/zfs-tests/callbacks/zfs_dmesg.ksh
index 73c65412531..de31765a52e 100755
--- a/tests/zfs-tests/callbacks/zfs_dmesg.ksh
+++ b/tests/zfs-tests/callbacks/zfs_dmesg.ksh
@@ -15,6 +15,7 @@
 #
 # Copyright (c) 2016 by Delphix. All rights reserved.
 # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
+# Copyright (c) 2025, Klara, Inc.
 #
 
 # $1: number of lines to output (default: 200)
@@ -25,7 +26,11 @@ echo " Tailing last $lines lines of dmesg log"
 echo "================================================================="
 
 # report and reset afterwards
-sudo dmesg -c | tail -n $lines
+dmesg_args="-c"
+if [[ $(uname) = "Linux" ]] ; then
+	dmesg_args="$dmesg_args --time-format=iso"
+fi
+sudo dmesg $dmesg_args | tail -n $lines
 
 echo "================================================================="
 echo " End of dmesg log"
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 4b8db189310..23e89599cae 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -2884,7 +2884,9 @@ function user_run
 	typeset out=$TEST_BASE_DIR/out
 	typeset err=$TEST_BASE_DIR/err
 
-	sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err
+	sudo -Eu $user \
+	    env PATH="$PATH" ZTS_LOG_SUPPRESS_TIMESTAMP=1 \
+	    ksh <<<"$*" >$out 2>$err
 	typeset res=$?
 	log_note "out: $(<$out)"
 	log_note "err: $(<$err)"

From 2c8beeece094d748bac08e2f282a40b02e171ac7 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 5 Aug 2025 11:18:06 +1000
Subject: [PATCH 62/72] CI: match and trim out internal timestamp for test
 prefix

Adjust the regexes to match the test line with timestamps, then remove
them for the summary. The internal timestamp is still in the full logs.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17045
---
 .github/workflows/scripts/qemu-6-tests.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh
index e8e6adecd62..5ab822f4f07 100755
--- a/.github/workflows/scripts/qemu-6-tests.sh
+++ b/.github/workflows/scripts/qemu-6-tests.sh
@@ -21,11 +21,13 @@ function prefix() {
   S=$((DIFF-(M*60)))
 
   CTR=$(cat /tmp/ctr)
-  echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr
+  echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr
 
   BASE="$HOME/work/zfs/zfs"
   COLOR="$BASE/scripts/zfs-tests-color.sh"
-  CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \
+  CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \
+    | sed -e 's|^\[.*] Test|Test|g' \
+    | sed -e 's|/usr/local|/usr|g' \
     | sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR)
   if [ -z "$CLINE" ]; then
     printf "vm${ID}: %s\n" "$LINE"

From 8ecf044d620314a28d32beb87eccd72e4907bc8e Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Date: Thu, 31 Jul 2025 15:23:25 -0700
Subject: [PATCH 63/72] Improve and fix gang blocks dyn header test

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #17587
---
 .../functional/gang_blocks/gang_blocks.kshlib | 17 +++++
 .../gang_blocks_dyn_header_pos.ksh            | 70 +++++++++++--------
 2 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
index 553533377aa..1fd9b321cd9 100644
--- a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
@@ -72,6 +72,23 @@ function get_num_dvas
 	sed 's/.*L[0-9] \(.*\) [a-f0-9]*L.*/\1/' | awk '{print NF}'
 }
 
+function check_gang_bp
+{
+	typeset gang="$(echo -n $1 | grep -c 'gang')"
+	[[ "$gang" == "1" ]] || return 1
+	return 0
+}
+
+function check_is_gang_bp
+{
+	check_gang_bp $1 || log_fail "Not a gang DVA: \"$1\""
+}
+
+function check_not_gang_bp
+{
+	check_gang_bp $1 && log_fail "Gang DVA: \"$1\""
+}
+
 function check_gang_dva
 {
 	typeset last_byte="$(echo -n $1 | tail -c 1)"
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
index e6d6629e9e1..2941325fdc3 100755
--- a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
@@ -33,41 +33,49 @@ log_assert "Verify that we don't use large gang headers on small-ashift pools".
 log_onexit cleanup
 preamble
 
-log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
-log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
-mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
-set_tunable64 METASLAB_FORCE_GANGING 200000
-set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+for vdevtype in "" "mirror" "raidz" "raidz2" "draid"; do
+	log_must zpool create -f -o ashift=12 $TESTPOOL $vdevtype $DISKS
+	log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+	mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+	set_tunable64 METASLAB_FORCE_GANGING 200000
+	set_tunable32 METASLAB_FORCE_GANGING_PCT 100
 
-status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
-[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers not enabled"
-path="${mountpoint}/file"
-log_must dd if=/dev/urandom of=$path bs=1M count=1
-log_must zpool sync $TESTPOOL
-first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
-leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
-first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
-check_not_gang_dva $first_dva
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "enabled" ]] || \
+		log_fail "Dynamic gang headers not enabled"
+	path="${mountpoint}/file"
+	log_must dd if=/dev/urandom of=$path bs=1M count=1
+	log_must zpool sync $TESTPOOL
+	first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+	leaves=$(read_gang_header $TESTPOOL $first_block 1000 | \
+		grep -v HOLE | grep -v "^Found")
+	first_child=$(echo "$leaves" | head -n 1)
+	check_gang_bp $first_child
 
-num_leaves=$(echo "$leaves" | wc -l)
-[[ "$num_leaves" -gt 3 ]] && log_fail "used a larger gang header too soon: \"$leaves\""
-log_must verify_pool $TESTPOOL
-status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
-[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+	num_leaves=$(echo "$leaves" | wc -l)
+	[[ "$num_leaves" -gt 3 ]] && \
+		log_fail "used a larger gang header too soon: \"$leaves\""
+	log_must verify_pool $TESTPOOL
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
 
-path="${mountpoint}/file2"
-log_must dd if=/dev/urandom of=$path bs=1M count=1
-log_must zpool sync $TESTPOOL
-first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
-leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
-first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
-check_not_gang_dva $first_dva
+	path="${mountpoint}/file2"
+	log_must dd if=/dev/urandom of=$path bs=1M count=1
+	log_must zpool sync $TESTPOOL
+	first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
+	leaves=$(read_gang_header $TESTPOOL $first_block 1000 | \
+		grep -v HOLE | grep -v "^Found")
+	first_child=$(echo "$leaves" | head -n 1)
+	check_not_gang_bp $first_child
 
-num_leaves=$(echo "$leaves" | wc -l)
-[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\""
+	num_leaves=$(echo "$leaves" | wc -l)
+	[[ "$num_leaves" -gt 3 ]] || \
+		log_fail "didn't use a larger gang header: \"$leaves\""
 
 
-log_must verify_pool $TESTPOOL
-status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
-[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+	log_must verify_pool $TESTPOOL
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+	log_must zpool destroy $TESTPOOL
+done
 log_pass "We don't use large gang headers on small-ashift pools".

From 31c4fa93bb161e0e6afe6bb3d46f8fdcf43f5c84 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Date: Fri, 1 Aug 2025 09:58:53 -0700
Subject: [PATCH 64/72] Fix dynamic gang block headers on raidz and mirror
 devices

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #17587
---
 include/sys/vdev.h        | 14 ++++++++++++++
 include/sys/vdev_impl.h   |  1 -
 module/zfs/zio.c          |  4 ++--
 module/zfs/zio_checksum.c |  6 +++++-
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7f5a9aaef1b..510474d6c08 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -139,6 +139,7 @@ extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
 extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
     uint64_t txg);
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+extern uint64_t vdev_get_min_alloc(vdev_t *vd);
 
 /*
  * Return the amount of space allocated for a gang block header.  Note that
@@ -151,6 +152,19 @@ vdev_gang_header_asize(vdev_t *vd)
 	return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0));
 }
 
+/*
+ * Return the amount of data that can be stored in a gang header. Because we
+ * need to ensure gang headers can always be allocated (as long as there is
+ * space available), this is the minimum allocatable size on the vdev. Note that
+ * since the physical birth txg is not provided, this must be constant for
+ * a given vdev.  (e.g. raidz expansion can't change this)
+ */
+static inline uint64_t
+vdev_gang_header_psize(vdev_t *vd)
+{
+	return (vdev_get_min_alloc(vd));
+}
+
 extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index fa22fa2bac3..4ab472bd674 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -621,7 +621,6 @@ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
 extern uint64_t vdev_default_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
 extern void vdev_set_min_asize(vdev_t *vd);
-extern uint64_t vdev_get_min_alloc(vdev_t *vd);
 extern uint64_t vdev_get_nparity(vdev_t *vd);
 extern uint64_t vdev_get_ndisks(vdev_t *vd);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 7e4caaa83ee..0fde2d6f7c1 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2956,8 +2956,8 @@ zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 		for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
 			vdev_t *vd = vdev_lookup_top(gio->io_spa,
 			    DVA_GET_VDEV(&bp->blk_dva[dva]));
-			uint64_t asize = vdev_gang_header_asize(vd);
-			gangblocksize = MIN(gangblocksize, asize);
+			uint64_t psize = vdev_gang_header_psize(vd);
+			gangblocksize = MIN(gangblocksize, psize);
 		}
 		spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
 	} else {
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 8cec3a6f562..4cb9da0db0b 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -569,7 +569,11 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 		    SPA_OLD_GANGBLOCKSIZE, offset, info);
 		if (error == 0) {
 			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
-			zio_t *pio = zio_unique_parent(zio);
+			zio_t *pio;
+			for (pio = zio_unique_parent(zio);
+			    pio->io_child_type != ZIO_CHILD_GANG;
+			    pio = zio_unique_parent(pio))
+				;
 			zio_gang_node_t *gn = pio->io_private;
 			gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
 		}

From 7ac5440ecf5340a0153674ff2083008cdcee6fa1 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 1 Aug 2025 13:51:46 +1000
Subject: [PATCH 65/72] ZTS: mmap_ftruncate test to confirm async writeback
 behaviour

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17584
---
 tests/runfiles/common.run                     |  2 +-
 tests/zfs-tests/cmd/.gitignore                |  1 +
 tests/zfs-tests/cmd/Makefile.am               |  4 +-
 tests/zfs-tests/cmd/mmap_ftruncate.c          | 85 +++++++++++++++++++
 tests/zfs-tests/include/commands.cfg          |  1 +
 tests/zfs-tests/tests/Makefile.am             |  1 +
 .../tests/functional/mmap/mmap_ftruncate.ksh  | 80 +++++++++++++++++
 7 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 tests/zfs-tests/cmd/mmap_ftruncate.c
 create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 7cc7a3cf94f..16869d397c7 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -801,7 +801,7 @@ tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
-    'mmap_sync_001_pos', 'mmap_write_001_pos']
+    'mmap_sync_001_pos', 'mmap_write_001_pos', 'mmap_ftruncate']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore
index e9a6f8f0ac1..1cd90024e94 100644
--- a/tests/zfs-tests/cmd/.gitignore
+++ b/tests/zfs-tests/cmd/.gitignore
@@ -23,6 +23,7 @@
 /mkfiles
 /mktree
 /mmap_exec
+/mmap_ftruncate
 /mmap_libaio
 /mmap_seek
 /mmap_sync
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index 909a72c43d8..d5448055a1e 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -72,7 +72,9 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree
 %C%_mkfile_LDADD = $(LTLIBINTL)
 
 
-scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_exec %D%/mmap_seek %D%/mmap_sync %D%/mmapwrite %D%/readmmap
+scripts_zfs_tests_bin_PROGRAMS += \
+	%D%/mmap_exec %D%/mmap_ftruncate %D%/mmap_seek \
+	%D%/mmap_sync %D%/mmapwrite %D%/readmmap
 %C%_mmapwrite_LDADD = -lpthread
 
 if WANT_MMAP_LIBAIO
diff --git a/tests/zfs-tests/cmd/mmap_ftruncate.c b/tests/zfs-tests/cmd/mmap_ftruncate.c
new file mode 100644
index 00000000000..91cdfe3715e
--- /dev/null
+++ b/tests/zfs-tests/cmd/mmap_ftruncate.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2025, Klara, Inc.
+ */
+
+/*
+ * Tests async writeback behaviour. Creates a file, maps it into memory, and
+ * dirties every page within it. Then, calls ftruncate() to collapse the file
+ * back down to 0. This causes the kernel to begin writeback on the dirty
+ * pages so they can be freed, before it can complete the ftruncate() call.
+ * None of these are sync operations, so they should avoid the various "force
+ * flush" codepaths.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define	_pdfail(f, l, s)	\
+	do { perror("[" f "#" #l "] " s); exit(2); } while (0)
+#define	pdfail(str) _pdfail(__FILE__, __LINE__, str)
+
+int
+main(int argc, char **argv) {
+	if (argc != 3) {
+		printf("usage: mmap_ftruncate <file> <size>\n");
+		exit(2);
+	}
+
+	const char *file = argv[1];
+
+	char *end;
+	off_t sz = strtoull(argv[2], &end, 0);
+	if (end == argv[2] || *end != '\0' || sz == 0) {
+		fprintf(stderr, "E: invalid size");
+		exit(2);
+	}
+
+	int fd = open(file, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
+	if (fd < 0)
+		pdfail("open");
+
+	if (ftruncate(fd, sz) < 0)
+		pdfail("ftruncate");
+
+	char *p = mmap(NULL, sz, PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p == MAP_FAILED)
+		pdfail("mmap");
+
+	for (off_t off = 0; off < sz; off += 4096)
+		p[off] = 1;
+
+	if (ftruncate(fd, 0) < 0)
+		pdfail("ftruncate");
+
+	if (munmap(p, sz) < 0)
+		pdfail("munmap");
+
+	close(fd);
+	return (0);
+}
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index 1c7e42a06e0..bbaa8665ecc 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -205,6 +205,7 @@ export ZFSTEST_FILES='badsend
     mkfiles
     mktree
     mmap_exec
+    mmap_ftruncate
     mmap_libaio
     mmap_seek
     mmap_sync
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 388a4160736..505fe3daf82 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1675,6 +1675,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/mmap/mmap_seek_001_pos.ksh \
 	functional/mmap/mmap_sync_001_pos.ksh \
 	functional/mmap/mmap_write_001_pos.ksh \
+	functional/mmap/mmap_ftruncate.ksh \
 	functional/mmap/setup.ksh \
 	functional/mmp/cleanup.ksh \
 	functional/mmp/mmp_active_import.ksh \
diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh
new file mode 100755
index 00000000000..63ebf95de7f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh
@@ -0,0 +1,80 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# This verifies that async writeback of dirty mmap()'d pages completes quickly.
+# ftruncate() is an operation that will trigger async writeback, but is not
+# itself a syncing operation, making it a useful proxy for any way the kernel
+# might trigger async writeback.
+#
+# The guts of this test is in the mmap_ftruncate program. This driver sets a
+# larger zfs_txg_timeout. Test failure occurs when ftruncate() blocks waiting
+# for the writeback until the txg timeout is reached and the changes are
+# forcibly written out. Success means the DMU has accepted the changes and
+# cleared the page dirty flags.
+#
+
+TIMEOUT=180
+TESTFILE=/$TESTPOOL/truncfile
+TESTSIZE=$((2*1024*1024*1024)) # 2G
+
+verify_runnable "global"
+
+typeset claim="async writeback of dirty mmap()'d pages completes quickly"
+
+log_assert $claim
+
+log_must save_tunable TXG_TIMEOUT
+
+function cleanup
+{
+	log_must restore_tunable TXG_TIMEOUT
+	rm -f $TESTFILE
+}
+log_onexit cleanup
+
+log_must set_tunable32 TXG_TIMEOUT $TIMEOUT
+log_must zpool sync -f
+
+# run mmap_ftruncate and record the run time
+typeset -i start=$(date +%s)
+log_must mmap_ftruncate $TESTFILE $TESTSIZE
+typeset -i end=$(date +%s)
+typeset -i delta=$((end - start))
+
+# in practice, mmap_ftruncate needs a few seconds to dirty all the pages, and
+# when this test passes, the ftruncate() call itself should be near-instant.
+# when it fails, then it's only the txg sync that allows ftruncate() to
+# complete; in that case, the run time will be extremely close to the timeout,
+# so to avoid any confusion at the edges, we require that it completes within
+# half the transaction time.  for any timeout higher than ~30s that should be a
+# very bright line down the middle.
+log_must test $delta -lt $((TIMEOUT / 2))
+
+log_pass $claim

From a18c9edda62165feaf020f29e6b4602e787b5378 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 28 Jul 2025 10:33:40 +1000
Subject: [PATCH 66/72] Linux: sync: remove async/sync accounting

All this machinery is there to try to understand when there is an async
writeback waiting to complete because the intent log callbacks are still
outstanding, and force them with a timely zil_commit(). The next commit
fixes this properly, so there's no need for all this extra housekeeping.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17584
---
 include/os/linux/zfs/sys/trace_acl.h |  7 +----
 include/sys/zfs_znode.h              |  2 --
 module/os/freebsd/zfs/zfs_znode_os.c |  7 -----
 module/os/linux/zfs/zfs_ctldir.c     |  2 --
 module/os/linux/zfs/zfs_vnops_os.c   | 25 -----------------
 module/os/linux/zfs/zfs_znode_os.c   |  7 -----
 module/os/linux/zfs/zpl_file.c       | 41 ----------------------------
 module/zfs/zfs_vnops.c               |  2 --
 8 files changed, 1 insertion(+), 92 deletions(-)

diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h
index 8923657daf0..d88b4937ef0 100644
--- a/include/os/linux/zfs/sys/trace_acl.h
+++ b/include/os/linux/zfs/sys/trace_acl.h
@@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __field(uint64_t,		z_size)
 	    __field(uint64_t,		z_pflags)
 	    __field(uint32_t,		z_sync_cnt)
-	    __field(uint32_t,		z_sync_writes_cnt)
-	    __field(uint32_t,		z_async_writes_cnt)
 	    __field(mode_t,		z_mode)
 	    __field(boolean_t,		z_is_sa)
 	    __field(boolean_t,		z_is_ctldir)
@@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __entry->z_size		= zn->z_size;
 	    __entry->z_pflags		= zn->z_pflags;
 	    __entry->z_sync_cnt		= zn->z_sync_cnt;
-	    __entry->z_sync_writes_cnt	= zn->z_sync_writes_cnt;
-	    __entry->z_async_writes_cnt	= zn->z_async_writes_cnt;
 	    __entry->z_mode		= zn->z_mode;
 	    __entry->z_is_sa		= zn->z_is_sa;
 	    __entry->z_is_ctldir	= zn->z_is_ctldir;
@@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	TP_printk("zn { id %llu unlinked %u atime_dirty %u "
 	    "zn_prefetch %u blksz %u seq %u "
 	    "mapcnt %llu size %llu pflags %llu "
-	    "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
+	    "sync_cnt %u "
 	    "mode 0x%x is_sa %d is_ctldir %d "
 	    "inode { uid %u gid %u ino %lu nlink %u size %lli "
 	    "blkbits %u bytes %u mode 0x%x generation %x } } "
@@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __entry->z_zn_prefetch, __entry->z_blksz,
 	    __entry->z_seq, __entry->z_mapcnt, __entry->z_size,
 	    __entry->z_pflags, __entry->z_sync_cnt,
-	    __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
 	    __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir,
 	    __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink,
 	    __entry->i_size, __entry->i_blkbits,
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index b3a267e16f3..fa3c7b5b39c 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -201,8 +201,6 @@ typedef struct znode {
 	uint64_t	z_size;		/* file size (cached) */
 	uint64_t	z_pflags;	/* pflags (cached) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
-	uint32_t	z_sync_writes_cnt; /* synchronous write count */
-	uint32_t	z_async_writes_cnt; /* asynchronous write count */
 	mode_t		z_mode;		/* mode (cached) */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	zfs_acl_t	*z_acl_cached;	/* cached acl */
diff --git a/module/os/freebsd/zfs/zfs_znode_os.c b/module/os/freebsd/zfs/zfs_znode_os.c
index 9bad1e13d7c..775f54a65f7 100644
--- a/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/module/os/freebsd/zfs/zfs_znode_os.c
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_vnode = NULL;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	return (0);
 }
@@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 
@@ -456,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	atomic_store_ptr(&zp->z_cached_symlink, NULL);
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
index 84b25cb2c5a..6552a933ce0 100644
--- a/module/os/linux/zfs/zfs_ctldir.c
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	zp->z_pflags = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	ip->i_generation = 0;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRWXUGO);
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index ed9721dade7..493f069a135 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3703,11 +3703,9 @@ static void
 zfs_putpage_async_commit_cb(void *arg)
 {
 	struct page *pp = arg;
-	znode_t *zp = ITOZ(pp->mapping->host);
 
 	ClearPageError(pp);
 	end_page_writeback(pp);
-	atomic_dec_32(&zp->z_async_writes_cnt);
 }
 
 /*
@@ -3827,15 +3825,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 		zfs_rangelock_exit(lr);
 
 		if (wbc->sync_mode != WB_SYNC_NONE) {
-			/*
-			 * Speed up any non-sync page writebacks since
-			 * they may take several seconds to complete.
-			 * Refer to the comment in zpl_fsync() for details.
-			 */
-			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-				zil_commit(zfsvfs->z_log, zp->z_id);
-			}
-
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3861,8 +3850,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
-	if (!for_sync)
-		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);
 
@@ -3881,8 +3868,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 #endif
 		ClearPageError(pp);
 		end_page_writeback(pp);
-		if (!for_sync)
-			atomic_dec_32(&zp->z_async_writes_cnt);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (err);
@@ -3916,16 +3901,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 		 * performance reasons.
 		 */
 		commit = B_TRUE;
-	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
-		/*
-		 * If the caller does not intend to wait synchronously
-		 * for this page writeback to complete and there are active
-		 * synchronous calls on this file, do a commit so that
-		 * the latter don't accidentally end up waiting for
-		 * our writeback to complete. Refer to the comment in
-		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
-		 */
-		commit = B_TRUE;
 	}
 
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c
index 5692868c9dc..7683eeb3cf9 100644
--- a/module/os/linux/zfs/zfs_znode_os.c
+++ b/module/os/linux/zfs/zfs_znode_os.c
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	return (0);
 }
@@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	ASSERT3P(zp->z_dirlocks, ==, NULL);
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 static int
@@ -548,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 1a82c13e152..ef7bd735208 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -111,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	znode_t *zp = ITOZ(inode);
-	zfsvfs_t *zfsvfs = ITOZSB(inode);
 	cred_t *cr = CRED();
 	int error;
 	fstrans_cookie_t cookie;
 
-	/*
-	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
-	 * tandem so that sync writes can detect if there are any non-sync
-	 * writes going on and vice-versa. The "vice-versa" part to this logic
-	 * is located in zfs_putpage() where non-sync writes check if there are
-	 * any ongoing sync writes. If any sync and non-sync writes overlap,
-	 * we do a commit to complete the non-sync writes since the latter can
-	 * potentially take several seconds to complete and thus block sync
-	 * writes in the upcoming call to filemap_write_and_wait_range().
-	 */
-	atomic_inc_32(&zp->z_sync_writes_cnt);
-	/*
-	 * If the following check does not detect an overlapping non-sync write
-	 * (say because it's just about to start), then it is guaranteed that
-	 * the non-sync write will detect this sync write. This is because we
-	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
-	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
-	 * zfs_putpage() respectively.
-	 */
-	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
-			atomic_dec_32(&zp->z_sync_writes_cnt);
-			return (error);
-		}
-		zil_commit(zfsvfs->z_log, zp->z_id);
-		zpl_exit(zfsvfs, FTAG);
-	}
-
 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
-	/*
-	 * The sync write is not complete yet but we decrement
-	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
-	 * it internally. If a non-sync write starts just after the decrement
-	 * operation but before we call zfs_fsync(), it may not detect this
-	 * overlapping sync write but it does not matter since we have already
-	 * gone past filemap_write_and_wait_range() and we won't block due to
-	 * the non-sync write.
-	 */
-	atomic_dec_32(&zp->z_sync_writes_cnt);
-
 	if (error)
 		return (error);
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index dfffcc4a404..8ad992f5b62 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -115,9 +115,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (error);
-		atomic_inc_32(&zp->z_sync_writes_cnt);
 		zil_commit(zfsvfs->z_log, zp->z_id);
-		atomic_dec_32(&zp->z_sync_writes_cnt);
 		zfs_exit(zfsvfs, FTAG);
 	}
 	return (error);

From fb7a8503bcfbbfe7b79d6c934062eee3c692b48b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 28 Jul 2025 10:51:00 +1000
Subject: [PATCH 67/72] Linux: zfs_putpage: complete async page writeback
 immediately

For async page writeback, we do not need to wait for the page to be on
disk before returning to the caller; it's enough that the data from the
dirty page be on the DMU and in the in-memory ZIL, just like any other
write.

So, if this is not a syncing write, don't add a callback to the itx, and
instead just unlock the page immediately.

(This is effectively the same concept used for FreeBSD in d323fbf49c).

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17584
Closes #14290
---
 module/os/linux/zfs/zfs_vnops_os.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 493f069a135..e555a15476e 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3691,16 +3691,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
 }
 
 static void
-zfs_putpage_sync_commit_cb(void *arg)
-{
-	struct page *pp = arg;
-
-	ClearPageError(pp);
-	end_page_writeback(pp);
-}
-
-static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg)
 {
 	struct page *pp = arg;
 
@@ -3904,8 +3895,12 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	}
 
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
-	    B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
-	    zfs_putpage_async_commit_cb, pp);
+	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+	if (!for_sync) {
+		ClearPageError(pp);
+		end_page_writeback(pp);
+	}
 
 	dmu_tx_commit(tx);
 

From c3496b5cc6f842dd80c6729884145ff6c0643954 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 1 Aug 2025 16:05:07 +1000
Subject: [PATCH 68/72] Linux: zfs_putpage: document (and fix!) confusing
 sync/commit modes

The structure of zfs_putpage() and its callers is tricky to follow.
There's a lot more we could do to improve it, but at least now we have
some description of one of the trickier bits.

Writing this exposed a very subtle bug: most async pages pushed out
through zpl_putpages() would go to the ZIL with commit=false, which can
yield a less-efficient write policy. So this commit updates that too.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17584
---
 module/os/linux/zfs/zfs_vnops_os.c | 55 ++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index e555a15476e..6a2fc5ad793 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -3884,17 +3885,49 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
-	boolean_t commit = B_FALSE;
-	if (wbc->sync_mode != WB_SYNC_NONE) {
-		/*
-		 * Note that this is rarely called under writepages(), because
-		 * writepages() normally handles the entire commit for
-		 * performance reasons.
-		 */
-		commit = B_TRUE;
-	}
+	/*
+	 * A note about for_sync vs wbc->sync_mode.
+	 *
+	 * for_sync indicates that this is a syncing writeback, that is, kernel
+	 * caller expects the data to be durably stored before being notified.
+	 * Often, but not always, the call was triggered by a userspace syncing
+	 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+	 * means that the page should remain "locked" (in the writeback state)
+	 * until it is definitely on disk (ie zil_commit() or spa_sync()).
+	 * Otherwise, we can unlock and return as soon as it is on the
+	 * in-memory ZIL.
+	 *
+	 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+	 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+	 * indicates this is a regular async writeback (eg a cache eviction) and
+	 * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+	 * a syncing op that must be waited on (by convention, we test for
+	 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+	 * performance should there ever be a new mode that we have not yet
+	 * added support for).
+	 *
+	 * So, why a separate for_sync field? This is because zpl_writepages()
+	 * calls zfs_putpage() multiple times for a single "logical" operation.
+	 * It wants all the individual pages to be for_sync==TRUE ie only
+	 * unlocked once durably stored, but it only wants one call to
+	 * zil_commit() at the very end, once all the pages are synced. So,
+	 * it repurposes sync_mode slightly to indicate who will issue and wait
+	 * for the IO: for NONE, the caller to zfs_putpage() will do it, while
+	 * for ALL, zfs_putpage() should do it.
+	 *
+	 * Summary:
+	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
+	 *   sync_mode: NONE=caller will commit; ALL=we will commit
+	 */
+	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
 
-	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
+	/*
+	 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+	 * because it is a policy flag that indicates "someone will call
+	 * zil_commit() soon". for_sync=TRUE means exactly that; the only
+	 * question is whether it will be us, or zpl_writepages().
+	 */
+	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
 	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
 
 	if (!for_sync) {
@@ -3906,7 +3939,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 
 	zfs_rangelock_exit(lr);
 
-	if (commit)
+	if (need_commit)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);

From 894edd084e3992a64339c195e93aad41e9323269 Mon Sep 17 00:00:00 2001
From: Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
Date: Wed, 6 Aug 2025 19:31:21 +0200
Subject: [PATCH 69/72] Add TXG timestamp database

This feature enables tracking of when TXGs are committed to disk,
providing an estimated timestamp for each TXG.

With this information, it becomes possible to perform scrubs based
on specific date ranges, improving the granularity of data
management and recovery operations.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Signed-off-by: Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #16853
---
 cmd/zpool/zpool_main.c                        |  63 ++++-
 include/Makefile.am                           |   1 +
 include/libzfs.h                              |   2 +
 include/sys/dmu.h                             |   3 +
 include/sys/spa_impl.h                        |   8 +
 include/zfs_crrd.h                            |  75 ++++++
 lib/libzfs/libzfs.abi                         |   9 +
 lib/libzfs/libzfs_pool.c                      |  10 +-
 lib/libzpool/Makefile.am                      |   1 +
 man/man4/zfs.4                                |  15 ++
 man/man8/zpool-scrub.8                        |  42 +++-
 module/Kbuild.in                              |   1 +
 module/Makefile.bsd                           |   1 +
 module/zfs/spa.c                              | 160 ++++++++++++
 module/zfs/spa_misc.c                         |   2 +
 module/zfs/zfs_crrd.c                         | 227 ++++++++++++++++++
 module/zfs/zfs_ioctl.c                        |  27 ++-
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/include/tunables.cfg          |   1 +
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../zpool_scrub_date_range_001.ksh            |  94 ++++++++
 21 files changed, 736 insertions(+), 10 deletions(-)
 create mode 100644 include/zfs_crrd.h
 create mode 100644 module/zfs/zfs_crrd.c
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 23cc590cc30..d401e087916 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -513,8 +513,8 @@ get_usage(zpool_help_t idx)
 		return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> "
 		    "[<device> ...]>\n"));
 	case HELP_SCRUB:
-		return (gettext("\tscrub [-e | -s | -p | -C] [-w] <-a | "
-		    "<pool> [<pool> ...]>\n"));
+		return (gettext("\tscrub [-e | -s | -p | -C | -E | -S] [-w] "
+		    "<-a | <pool> [<pool> ...]>\n"));
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_TRIM:
@@ -8359,6 +8359,8 @@ zpool_do_reopen(int argc, char **argv)
 typedef struct scrub_cbdata {
 	int	cb_type;
 	pool_scrub_cmd_t cb_scrub_cmd;
+	time_t	cb_date_start;
+	time_t	cb_date_end;
 } scrub_cbdata_t;
 
 static boolean_t
@@ -8402,8 +8404,8 @@ scrub_callback(zpool_handle_t *zhp, void *data)
 		return (1);
 	}
 
-	err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
-
+	err = zpool_scan_range(zhp, cb->cb_type, cb->cb_scrub_cmd,
+	    cb->cb_date_start, cb->cb_date_end);
 	if (err == 0 && zpool_has_checkpoint(zhp) &&
 	    cb->cb_type == POOL_SCAN_SCRUB) {
 		(void) printf(gettext("warning: will not scrub state that "
@@ -8421,10 +8423,34 @@ wait_callback(zpool_handle_t *zhp, void *data)
 	return (zpool_wait(zhp, *act));
 }
 
+static time_t
+date_string_to_sec(const char *timestr, boolean_t rounding)
+{
+	struct tm tm = {0};
+	int adjustment = rounding ? 1 : 0;
+
+	/* Let mktime() determine whether DST is in effect. */
+	tm.tm_isdst = -1;
+
+	if (strptime(timestr, "%Y-%m-%d %H:%M", &tm) == NULL) {
+		if (strptime(timestr, "%Y-%m-%d", &tm) == NULL) {
+			fprintf(stderr, gettext("Failed to parse the date.\n"));
+			usage(B_FALSE);
+		}
+		adjustment *= 24 * 60 * 60;
+	} else {
+		adjustment *= 60;
+	}
+
+	return (mktime(&tm) + adjustment);
+}
+
 /*
- * zpool scrub [-e | -s | -p | -C] [-w] <pool> ...
+ * zpool scrub [-e | -s | -p | -C | -E | -S] [-w] <pool> ...
  *
  *	-e	Only scrub blocks in the error log.
+ *	-E	End date of scrub.
+ *	-S	Start date of scrub.
  *	-s	Stop.  Stops any in-progress scrub.
  *	-p	Pause. Pause in-progress scrub.
  *	-w	Wait.  Blocks until scrub has completed.
@@ -8440,6 +8466,7 @@ zpool_do_scrub(int argc, char **argv)
 
 	cb.cb_type = POOL_SCAN_SCRUB;
 	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+	cb.cb_date_start = cb.cb_date_end = 0;
 
 	boolean_t is_error_scrub = B_FALSE;
 	boolean_t is_pause = B_FALSE;
@@ -8448,7 +8475,7 @@ zpool_do_scrub(int argc, char **argv)
 	boolean_t scrub_all = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "aspweC")) != -1) {
+	while ((c = getopt(argc, argv, "aspweCE:S:")) != -1) {
 		switch (c) {
 		case 'a':
 			scrub_all = B_TRUE;
@@ -8456,9 +8483,19 @@ zpool_do_scrub(int argc, char **argv)
 		case 'e':
 			is_error_scrub = B_TRUE;
 			break;
+		case 'E':
+			/*
+			 * Round the date. It's better to scrub more data than
+			 * less. This also makes the date inclusive.
+			 */
+			cb.cb_date_end = date_string_to_sec(optarg, B_TRUE);
+			break;
 		case 's':
 			is_stop = B_TRUE;
 			break;
+		case 'S':
+			cb.cb_date_start = date_string_to_sec(optarg, B_FALSE);
+			break;
 		case 'p':
 			is_pause = B_TRUE;
 			break;
@@ -8506,6 +8543,19 @@ zpool_do_scrub(int argc, char **argv)
 		}
 	}
 
+	if ((cb.cb_date_start != 0 || cb.cb_date_end != 0) &&
+	    cb.cb_scrub_cmd != POOL_SCRUB_NORMAL) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "start/end date is available only with normal scrub\n"));
+		usage(B_FALSE);
+	}
+	if (cb.cb_date_start != 0 && cb.cb_date_end != 0 &&
+	    cb.cb_date_start > cb.cb_date_end) {
+		(void) fprintf(stderr, gettext("invalid arguments: "
+		    "end date has to be later than start date\n"));
+		usage(B_FALSE);
+	}
+
 	if (wait && (cb.cb_type == POOL_SCAN_NONE ||
 	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
 		(void) fprintf(stderr, gettext("invalid option combination: "
@@ -8546,6 +8596,7 @@ zpool_do_resilver(int argc, char **argv)
 
 	cb.cb_type = POOL_SCAN_RESILVER;
 	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+	cb.cb_date_start = cb.cb_date_end = 0;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "")) != -1) {
diff --git a/include/Makefile.am b/include/Makefile.am
index a0427ae6a47..7588cd0aedc 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -10,6 +10,7 @@ COMMON_H = \
 	cityhash.h \
 	zfeature_common.h \
 	zfs_comutil.h \
+	zfs_crrd.h \
 	zfs_deleg.h \
 	zfs_fletcher.h \
 	zfs_namecheck.h \
diff --git a/include/libzfs.h b/include/libzfs.h
index 187d7b44936..3fcdc176a62 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -302,6 +302,8 @@ typedef struct initialize_cbdata {
  * Functions to manipulate pool and vdev state
  */
 _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
+_LIBZFS_H int zpool_scan_range(zpool_handle_t *, pool_scan_func_t,
+    pool_scrub_cmd_t, time_t, time_t);
 _LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *);
 _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
     nvlist_t *);
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 0b2e443a433..7dc6daaf06e 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -414,6 +414,9 @@ typedef struct dmu_buf {
 #define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint"
 #define	DMU_POOL_LOG_SPACEMAP_ZAP	"com.delphix:log_spacemap_zap"
 #define	DMU_POOL_DELETED_CLONES		"com.delphix:deleted_clones"
+#define	DMU_POOL_TXG_LOG_TIME_MINUTES	"com.klaraystems:txg_log_time:minutes"
+#define	DMU_POOL_TXG_LOG_TIME_DAYS	"com.klaraystems:txg_log_time:days"
+#define	DMU_POOL_TXG_LOG_TIME_MONTHS	"com.klaraystems:txg_log_time:months"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index a596235ce01..07a959db344 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -55,6 +55,8 @@
 #include <sys/dsl_deadlist.h>
 #include <zfeature_common.h>
 
+#include "zfs_crrd.h"
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -344,6 +346,12 @@ struct spa {
 	spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
 	zthr_t		*spa_checkpoint_discard_zthr;
 
+	kmutex_t	spa_txg_log_time_lock;	/* for spa_txg_log_time */
+	dbrrd_t		spa_txg_log_time;
+	uint64_t	spa_last_noted_txg;
+	uint64_t	spa_last_noted_txg_time;
+	uint64_t	spa_last_flush_txg_time;
+
 	space_map_t	*spa_syncing_log_sm;	/* current log space map */
 	avl_tree_t	spa_sm_logs_by_txg;
 	kmutex_t	spa_flushed_ms_lock;	/* for metaslabs_by_flushed */
diff --git a/include/zfs_crrd.h b/include/zfs_crrd.h
new file mode 100644
index 00000000000..ba192a2062e
--- /dev/null
+++ b/include/zfs_crrd.h
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2024 Klara Inc.
+ *
+ * This software was developed by
+ * Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
+ * Fred Weigel <fred.weigel@klarasystems.com>
+ * under sponsorship from Wasabi Technology, Inc. and Klara Inc.
+ */
+
+#ifndef _CRRD_H_
+#define	_CRRD_H_
+
+#define	RRD_MAX_ENTRIES	256
+
+#define	RRD_ENTRY_SIZE	sizeof (uint64_t)
+#define	RRD_STRUCT_ELEM	(sizeof (rrd_t) / RRD_ENTRY_SIZE)
+
+typedef enum {
+	DBRRD_FLOOR,
+	DBRRD_CEILING
+} dbrrd_rounding_t;
+
+typedef struct {
+	uint64_t	rrdd_time;
+	uint64_t	rrdd_txg;
+} rrd_data_t;
+
+typedef struct {
+	uint64_t	rrd_head;	/* head (beginning) */
+	uint64_t	rrd_tail;	/* tail (end) */
+	uint64_t	rrd_length;
+
+	rrd_data_t	rrd_entries[RRD_MAX_ENTRIES];
+} rrd_t;
+
+typedef struct {
+	rrd_t		dbr_minutes;
+	rrd_t		dbr_days;
+	rrd_t		dbr_months;
+} dbrrd_t;
+
+size_t rrd_len(rrd_t *rrd);
+
+const rrd_data_t *rrd_entry(rrd_t *r, size_t i);
+rrd_data_t *rrd_tail_entry(rrd_t *rrd);
+uint64_t rrd_tail(rrd_t *rrd);
+uint64_t rrd_get(rrd_t *rrd, size_t i);
+
+void rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg);
+
+void dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg);
+uint64_t dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rouding);
+
+#endif
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index bd2ab646802..37d22402e77 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -574,6 +574,7 @@
     <elf-symbol name='zpool_reguid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_reopen_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='zpool_scan_range' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6946,6 +6947,14 @@
       <parameter type-id='b51cf3c2' name='cmd'/>
       <return type-id='95e97e5e'/>
     </function-decl>
+    <function-decl name='zpool_scan_range' mangled-name='zpool_scan_range' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_scan_range'>
+      <parameter type-id='4c81de99' name='zhp'/>
+      <parameter type-id='7313fbe2' name='func'/>
+      <parameter type-id='b51cf3c2' name='cmd'/>
+      <parameter type-id='c9d12d66' name='date_start'/>
+      <parameter type-id='c9d12d66' name='date_end'/>
+      <return type-id='95e97e5e'/>
+    </function-decl>
     <function-decl name='zpool_find_vdev_by_physpath' mangled-name='zpool_find_vdev_by_physpath' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_find_vdev_by_physpath'>
       <parameter type-id='4c81de99' name='zhp'/>
       <parameter type-id='80f4b756' name='ppath'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index b6fb153c496..10b42720e96 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2773,7 +2773,13 @@ zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
  * Scan the pool.
  */
 int
-zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) {
+	return (zpool_scan_range(zhp, func, cmd, 0, 0));
+}
+
+int
+zpool_scan_range(zpool_handle_t *zhp, pool_scan_func_t func,
+    pool_scrub_cmd_t cmd, time_t date_start, time_t date_end)
 {
 	char errbuf[ERRBUFLEN];
 	int err;
@@ -2782,6 +2788,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
 	nvlist_t *args = fnvlist_alloc();
 	fnvlist_add_uint64(args, "scan_type", (uint64_t)func);
 	fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd);
+	fnvlist_add_uint64(args, "scan_date_start", (uint64_t)date_start);
+	fnvlist_add_uint64(args, "scan_date_end", (uint64_t)date_end);
 
 	err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL);
 	fnvlist_free(args);
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 5cdb6a3eb24..aeacc595b36 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -177,6 +177,7 @@ nodist_libzpool_la_SOURCES = \
 	module/zfs/zfs_byteswap.c \
 	module/zfs/zfs_chksum.c \
 	module/zfs/zfs_debug_common.c \
+	module/zfs/zfs_crrd.c \
 	module/zfs/zfs_fm.c \
 	module/zfs/zfs_fuid.c \
 	module/zfs/zfs_ratelimit.c \
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index e00b1848b41..fa37c7cdb9e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2246,6 +2246,21 @@ Defer frees starting in this pass.
 Maximum memory used for prefetching a checkpoint's space map on each
 vdev while discarding the checkpoint.
 .
+.It Sy zfs_spa_note_txg_time Ns = Ns Sy 600 Pq uint
+This parameter defines, in seconds, how often the TXG time database will record
+a new TXG if it has changed.
+After the specified time interval has passed, and if the TXG number has changed,
+the new value is recorded in the database.
+These timestamps can later be used for more granular operations, such as
+scrubbing.
+.
+.It Sy zfs_spa_flush_txg_time Ns = Ns Sy 600 Pq uint
+This parameter defines, in seconds, how often ZFS will flush
+the TXG time database to disk.
+It ensures that the data is actually written to persistent storage, which helps
+preserve the database in case of unexpected shutdown.
+The database is also automatically flushed during the export sequence.
+.
 .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint
 Only allow small data blocks to be allocated on the special and dedup vdev
 types when the available free space percentage on these vdevs exceeds this
diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8
index 9b4cf132c83..0ecf8bd3851 100644
--- a/man/man8/zpool-scrub.8
+++ b/man/man8/zpool-scrub.8
@@ -28,7 +28,7 @@
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
 .\"
-.Dd November 18, 2024
+.Dd December 11, 2024
 .Dt ZPOOL-SCRUB 8
 .Os
 .
@@ -40,6 +40,8 @@
 .Cm scrub
 .Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns
 .Op Fl w
+.Op Fl S Ar date
+.Op Fl E Ar date
 .Fl a Ns | Ns Ar pool Ns …
 .
 .Sh DESCRIPTION
@@ -125,6 +127,44 @@ resilvering, nor can it be run when a regular scrub is paused.
 Continue scrub from last saved txg (see zpool
 .Sy last_scrubbed_txg
 property).
+.It Fl S Ar date , Fl E Ar date
+Restricts the scrub to blocks created within the given date range.
+.Bl -bullet -compact -offset indent
+.It
+.Fl S
+Defines a start date.
+If not specified, scrubbing begins from the start of the pool's
+existence.
+.It
+.Fl E
+Defines an end date.
+If not specified, scrubbing continues up to the most recent data.
+.El
+The provided date should be in the format:
+.Dq YYYY-MM-DD HH:MM .
+Where:
+.Bl -bullet -compact -offset indent
+.It
+.Dq YYYY
+is the year.
+.It
+.Dq MM
+is the numeric representation of the month.
+.It
+.Dq DD
+is the day of the month.
+.It
+.Dq HH
+is the hour.
+.It
+.Dq MM
+is the minutes.
+.El
+The hour and minutes parameters can be omitted.
+The time should be provided in the machine's local time zone.
+Specifying dates prior to enabling this feature will result in scrubbing
+starting from the date the pool was created.
+If the time was moved backward manually the data range may become inaccurate.
 .El
 .Sh EXAMPLES
 .Ss Example 1
diff --git a/module/Kbuild.in b/module/Kbuild.in
index ece603fee73..3d6f288fa5d 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -406,6 +406,7 @@ ZFS_OBJS := \
 	zfs_byteswap.o \
 	zfs_chksum.o \
 	zfs_debug_common.o \
+	zfs_crrd.o \
 	zfs_fm.o \
 	zfs_fuid.o \
 	zfs_impl.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 7e7c3db73a4..3ba38c43f25 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -217,6 +217,7 @@ SRCS+=	abd_os.c \
 	vdev_label_os.c \
 	zfs_acl.c \
 	zfs_ctldir.c \
+	zfs_crrd.c \
 	zfs_debug.c \
 	zfs_dir.c \
 	zfs_file_os.c \
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 31f152a8059..bbf474ed631 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -100,6 +100,7 @@
 #include <sys/vmsystm.h>
 #endif	/* _KERNEL */
 
+#include "zfs_crrd.h"
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 #include <cityhash.h>
@@ -310,6 +311,41 @@ static int zfs_livelist_condense_zthr_cancel = 0;
  */
 static int zfs_livelist_condense_new_alloc = 0;
 
+/*
+ * Time variable to decide how often the txg should be added into the
+ * database (in seconds).
+ * The smallest available resolution is in minutes, which means an update occurs
+ * each time we reach `spa_note_txg_time` and the txg has changed. We provide
+ * a 256-slot ring buffer for minute-level resolution. The number is limited by
+ * the size of the structure we use and the maximum amount of bytes we can write
+ * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
+ * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
+ * high-resolution data.
+ *
+ * The user can decrease `spa_note_txg_time` to increase resolution within
+ * a day, at the cost of retaining fewer days of data. Alternatively, increasing
+ * the interval allows storing data over a longer period, but with lower
+ * frequency.
+ *
+ * This parameter does not affect the daily or monthly databases, as those only
+ * store one record per day and per month, respectively.
+ */
+static uint_t spa_note_txg_time = 10 * 60;
+
+/*
+ * How often to flush the txg database to disk (in seconds).
+ * We flush data every time we write to it, making it the most reliable option.
+ * Since this happens every 10 minutes, it shouldn't introduce any noticeable
+ * overhead for the system. In case of failure, we will always have an
+ * up-to-date version of the database.
+ *
+ * The user can adjust the flush interval to a lower value, but it probably
+ * doesn't make sense to flush more often than the database is updated.
+ * The user can also increase the interval if they're concerned about the
+ * performance of writing the entire database to disk.
+ */
+static uint_t spa_flush_txg_time = 10 * 60;
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -2040,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa)
 	}
 }
 
+static void
+spa_sync_time_logger(spa_t *spa, uint64_t txg)
+{
+	uint64_t curtime;
+	dmu_tx_t *tx;
+
+	if (!spa_writeable(spa)) {
+		return;
+	}
+	curtime = gethrestime_sec();
+	if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) {
+		return;
+	}
+
+	if (txg > spa->spa_last_noted_txg) {
+		spa->spa_last_noted_txg_time = curtime;
+		spa->spa_last_noted_txg = txg;
+
+		mutex_enter(&spa->spa_txg_log_time_lock);
+		dbrrd_add(&spa->spa_txg_log_time, curtime, txg);
+		mutex_exit(&spa->spa_txg_log_time_lock);
+	}
+
+	if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) {
+		return;
+	}
+	spa->spa_last_flush_txg_time = curtime;
+
+	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_minutes, tx));
+	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_days, tx));
+	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_months, tx));
+	dmu_tx_commit(tx);
+}
+
+static void
+spa_unload_sync_time_logger(spa_t *spa)
+{
+	uint64_t txg;
+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
+
+	txg = dmu_tx_get_txg(tx);
+	spa->spa_last_noted_txg_time = 0;
+	spa->spa_last_flush_txg_time = 0;
+	spa_sync_time_logger(spa, txg);
+
+	dmu_tx_commit(tx);
+}
+
+static void
+spa_load_txg_log_time(spa_t *spa)
+{
+	int error;
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_minutes);
+	if (error != 0 && error != ENOENT) {
+		spa_load_note(spa, "unable to load a txg time database with "
+		    "minute resolution [error=%d]", error);
+	}
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_days);
+	if (error != 0 && error != ENOENT) {
+		spa_load_note(spa, "unable to load a txg time database with "
+		    "day resolution [error=%d]", error);
+	}
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+	    &spa->spa_txg_log_time.dbr_months);
+	if (error != 0 && error != ENOENT) {
+		spa_load_note(spa, "unable to load a txg time database with "
+		    "month resolution [error=%d]", error);
+	}
+}
+
+static boolean_t
+spa_should_sync_time_logger_on_unload(spa_t *spa)
+{
+
+	if (!spa_writeable(spa))
+		return (B_FALSE);
+
+	if (!spa->spa_sync_on)
+		return (B_FALSE);
+
+	if (spa_state(spa) != POOL_STATE_EXPORTED)
+		return (B_FALSE);
+
+	if (spa->spa_last_noted_txg == 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+
 /*
  * Opposite of spa_load().
  */
@@ -2061,6 +2202,9 @@ spa_unload(spa_t *spa)
 	 * we delay the final TXGs beyond what spa_final_txg is set at.
 	 */
 	if (spa->spa_final_txg == UINT64_MAX) {
+		if (spa_should_sync_time_logger_on_unload(spa))
+			spa_unload_sync_time_logger(spa);
+
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
@@ -4717,6 +4861,9 @@ spa_ld_get_props(spa_t *spa)
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+	/* Load time log */
+	spa_load_txg_log_time(spa);
+
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
@@ -7140,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 
+		if (spa_should_sync_time_logger_on_unload(spa))
+			spa_unload_sync_time_logger(spa);
+
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
@@ -10190,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 */
 	brt_pending_apply(spa, txg);
 
+	spa_sync_time_logger(spa, txg);
+
 	/*
 	 * Lock out configuration changes.
 	 */
@@ -10232,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 
 	spa->spa_sync_starttime = gethrtime();
+
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@@ -11105,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
 	"was being condensed");
 
+ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
+	"How frequently TXG timestamps are stored internally (in seconds)");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
+	"How frequently the TXG timestamps database should be flushed "
+	"to disk (in seconds)");
+
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index d2ba1f954e9..2eba8362a16 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -903,6 +904,7 @@ spa_remove(spa_t *spa)
 	mutex_destroy(&spa->spa_vdev_top_lock);
 	mutex_destroy(&spa->spa_feat_stats_lock);
 	mutex_destroy(&spa->spa_activities_lock);
+	mutex_destroy(&spa->spa_txg_log_time_lock);
 
 	kmem_free(spa, sizeof (spa_t));
 }
diff --git a/module/zfs/zfs_crrd.c b/module/zfs/zfs_crrd.c
new file mode 100644
index 00000000000..f9267ed41d7
--- /dev/null
+++ b/module/zfs/zfs_crrd.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2024 Klara Inc.
+ *
+ * This software was developed by
+ * Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
+ * Fred Weigel <fred.weigel@klarasystems.com>
+ * under sponsorship from Wasabi Technology, Inc. and Klara Inc.
+ */
+/*
+ * This file implements a round-robin database that stores timestamps and txg
+ * numbers. Due to limited space, we use a round-robin approach, where
+ * the oldest records are overwritten when there is no longer enough room.
+ * This is a best-effort mechanism, and the database should be treated as
+ * an approximation. Consider this before consuming it.
+ *
+ * The database is linear, meaning we assume each new entry is newer than the
+ * ones already stored. Because of this, if time is manipulated, the database
+ * will only accept records that are newer than the existing ones.
+ * (For example, jumping 10 years into the future and then back can lead to
+ * a situation where nothing is written to the database for 10 years.)
+ *
+ * All times stored in the database use UTC, which makes it easy to convert to
+ * and from local time.
+ *
+ * Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro).
+ * This limit comes from the maximum size of a ZAP object, where we store the
+ * binary blob.
+ *
+ * We've split the database into three smaller ones.
+ * The `minute database` provides high resolution (default: every 10 minutes),
+ * but only covers approximately 1.5 days. This gives a detailed view of recent
+ * activity, useful, for example, when performing a scrub of the last hour.
+ * The `daily database` records one txg per day. With 256 entries, it retains
+ * roughly 8 months of data. This allows users to scrub or analyze txgs across
+ * a range of days.
+ * The `monthly database` stores one record per month, giving approximately
+ * 21 years of history.
+ * All these calculations assume the worst-case scenario: the pool is always
+ * online and actively written to.
+ *
+ * A potential source of confusion is that the database does not store data
+ * while the pool is offline, leading to potential gaps in timeline. Also,
+ * the database contains no records from before this feature was enabled.
+ * Both, upon reflection, are expected.
+ */
+#include <sys/zfs_context.h>
+
+#include "zfs_crrd.h"
+
+/* Return the most recently added entry, or NULL if the rrd is empty. */
+rrd_data_t *
+rrd_tail_entry(rrd_t *rrd)
+{
+	size_t last;
+
+	if (rrd_len(rrd) == 0)
+		return (NULL);
+
+	/* rrd_tail is the next slot to write; step back with wrap-around. */
+	last = (rrd->rrd_tail == 0) ?
+	    RRD_MAX_ENTRIES - 1 : rrd->rrd_tail - 1;
+
+	return (&rrd->rrd_entries[last]);
+}
+
+/* Return the timestamp of the newest entry, or 0 if the rrd is empty. */
+uint64_t
+rrd_tail(rrd_t *rrd)
+{
+	if (rrd_len(rrd) == 0)
+		return (0);
+
+	return (rrd_tail_entry(rrd)->rrdd_time);
+}
+
+/*
+ * Return the number of entries currently stored in the rrd.
+ * Valid indices for rrd_entry() and rrd_get() are 0..rrd_len()-1.
+ */
+size_t
+rrd_len(rrd_t *rrd)
+{
+
+	return (rrd->rrd_length);
+}
+
+/* Return entry i counting from the oldest (i = 0), or NULL if out of range. */
+const rrd_data_t *
+rrd_entry(rrd_t *rrd, size_t i)
+{
+	size_t n;
+
+	if (i >= rrd_len(rrd))
+		return (NULL);
+
+	n = (rrd->rrd_head + i) % RRD_MAX_ENTRIES;
+	return (&rrd->rrd_entries[n]);
+}
+
+/* Return the txg of entry i (0 = oldest), or 0 if i is out of range. */
+uint64_t
+rrd_get(rrd_t *rrd, size_t i)
+{
+	const rrd_data_t *entry = rrd_entry(rrd, i);
+	return (entry != NULL ? entry->rrdd_txg : 0);
+}
+
+/* Append (time, txg); a repeated timestamp only raises the tail's txg. */
+void
+rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg)
+{
+	rrd_data_t *tail;
+
+	tail = rrd_tail_entry(rrd);
+	if (tail != NULL && tail->rrdd_time == time) {
+		/* Update in place; appending would duplicate the timestamp. */
+		if (tail->rrdd_txg < txg) {
+			tail->rrdd_txg = txg;
+		}
+		return;
+	}
+
+	rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time;
+	rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg;
+
+	rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES;
+	/* When full, the write above replaced the oldest entry: advance head. */
+	if (rrd->rrd_length < RRD_MAX_ENTRIES) {
+		rrd->rrd_length++;
+	} else {
+		rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES;
+	}
+}
+
+/* Record (time, txg) in at most one database, trying the coarsest first. */
+void
+dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg)
+{
+	hrtime_t since_minute = time - rrd_tail(&db->dbr_minutes);
+	hrtime_t since_day = time - rrd_tail(&db->dbr_days);
+	hrtime_t since_month = time - rrd_tail(&db->dbr_months);
+
+	/* NOTE(review): thresholds are in ns; confirm `time` is in ns too. */
+	if (since_month >= 0 && since_month >= SEC2NSEC(30 * 24 * 60 * 60))
+		rrd_add(&db->dbr_months, time, txg);
+	else if (since_day >= 0 && since_day >= SEC2NSEC(24 * 60 * 60))
+		rrd_add(&db->dbr_days, time, txg);
+	else if (since_minute >= 0)
+		rrd_add(&db->dbr_minutes, time, txg);
+}
+
+/*
+ * Find the entry bounding tv by a linear scan from the oldest entry.
+ * We could do a binary search here, but the routine isn't frequently
+ * called and the data is small.  Returns NULL when no entry qualifies.
+ */
+static const rrd_data_t *
+rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding)
+{
+	const rrd_data_t *match = NULL;
+
+	for (size_t idx = 0; idx < rrd_len(rrd); idx++) {
+		const rrd_data_t *entry = rrd_entry(rrd, idx);
+
+		if (rounding != DBRRD_FLOOR) {
+			/* DBRRD_CEILING: stop at the first entry >= tv. */
+			if (entry->rrdd_time >= tv)
+				return (entry);
+			continue;
+		}
+
+		/* DBRRD_FLOOR: remember entries until one is past tv. */
+		if (entry->rrdd_time > tv)
+			break;
+		match = entry;
+	}
+
+	return (match);
+}
+
+/* Pick the entry nearer to tv; r2 wins ties, a NULL entry never wins. */
+static const rrd_data_t *
+dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2)
+{
+	if (r1 == NULL || r2 == NULL)
+		return (r1 == NULL ? r2 : r1);
+
+	if (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time))
+		return (r1);
+	return (r2);
+}
+
+/* Return the txg of the entry closest to tv across all databases, or 0. */
+uint64_t
+dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding)
+{
+	const rrd_data_t *minutes = rrd_query(&r->dbr_minutes, tv, rounding);
+	const rrd_data_t *days = rrd_query(&r->dbr_days, tv, rounding);
+	const rrd_data_t *months = rrd_query(&r->dbr_months, tv, rounding);
+	const rrd_data_t *best;
+
+	/* dbrrd_closest() returns its second entry when distances are equal. */
+	best = dbrrd_closest(tv, dbrrd_closest(tv, days, minutes), months);
+
+	return (best == NULL ? 0 : best->rrdd_txg);
+}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 3a413f4a7bd..dcb71229f96 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1704,6 +1704,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
 static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
 	{"scan_type",		DATA_TYPE_UINT64,	0},
 	{"scan_command",	DATA_TYPE_UINT64,	0},
+	{"scan_date_start",	DATA_TYPE_UINT64,	ZK_OPTIONAL},
+	{"scan_date_end",	DATA_TYPE_UINT64,	ZK_OPTIONAL},
 };
 
 static int
@@ -1712,6 +1714,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	spa_t *spa;
 	int error;
 	uint64_t scan_type, scan_cmd;
+	uint64_t date_start, date_end;
 
 	if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0)
 		return (SET_ERROR(EINVAL));
@@ -1721,6 +1724,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	if (scan_cmd >= POOL_SCRUB_FLAGS_END)
 		return (SET_ERROR(EINVAL));
 
+	if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0)
+		date_start = 0;
+	if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0)
+		date_end = 0;
+
 	if ((error = spa_open(poolname, &spa, FTAG)) != 0)
 		return (error);
 
@@ -1732,7 +1740,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 		error = spa_scan_range(spa, scan_type,
 		    spa_get_last_scrubbed_txg(spa), 0);
 	} else {
-		error = spa_scan(spa, scan_type);
+		uint64_t txg_start, txg_end;
+
+		txg_start = txg_end = 0;
+		if (date_start != 0 || date_end != 0) {
+			mutex_enter(&spa->spa_txg_log_time_lock);
+			if (date_start != 0) {
+				txg_start = dbrrd_query(&spa->spa_txg_log_time,
+				    date_start, DBRRD_FLOOR);
+			}
+
+			if (date_end != 0) {
+				txg_end = dbrrd_query(&spa->spa_txg_log_time,
+				    date_end, DBRRD_CEILING);
+			}
+			mutex_exit(&spa->spa_txg_log_time_lock);
+		}
+
+		error = spa_scan_range(spa, scan_type, txg_start, txg_end);
 	}
 
 	spa_close(spa, FTAG);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 16869d397c7..deca3c05b07 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -545,7 +545,8 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
     'zpool_scrub_multiple_pools',
     'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
-    'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
+    'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos',
+    'zpool_scrub_date_range_001']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 98e14ad9771..e273c9f85c2 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -87,6 +87,7 @@ SPA_ASIZE_INFLATION		spa.asize_inflation		spa_asize_inflation
 SPA_DISCARD_MEMORY_LIMIT	spa.discard_memory_limit	zfs_spa_discard_memory_limit
 SPA_LOAD_VERIFY_DATA		spa.load_verify_data		spa_load_verify_data
 SPA_LOAD_VERIFY_METADATA	spa.load_verify_metadata	spa_load_verify_metadata
+SPA_NOTE_TXG_TIME		spa.note_txg_time		spa_note_txg_time
 TRIM_EXTENT_BYTES_MIN		trim.extent_bytes_min		zfs_trim_extent_bytes_min
 TRIM_METASLAB_SKIP		trim.metaslab_skip		zfs_trim_metaslab_skip
 TRIM_TXG_BATCH			trim.txg_batch			zfs_trim_txg_batch
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 505fe3daf82..5ab28b2d6c3 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1244,6 +1244,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \
+	functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh
new file mode 100755
index 00000000000..7f5f8052c8e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh
@@ -0,0 +1,94 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2025 Klara, Inc.
+# Copyright 2025 Mariusz Zaborski <oshogbo@FreeBSD.org>
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#       Verify that the date range scrub only scrubs the files that were
+#       created/modified within a given time slot.
+#
+# STRATEGY:
+#     1. Write a file.
+#     2. Force a sync of everything via export/import.
+#     3. Wait for one minute.
+#     4. Repeat steps 1, 2, and 3 two more times.
+#     5. Inject checksum errors into all 3 files.
+#     6. Scrub the date range for the first file.
+#     7. Verify that the first file is scrubbed.
+#     8. Verify that newer files are not scrubbed.
+#     9. Repeat steps 6–8 for each of the remaining 2 files.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must zinject -c all
+	rm -f $TESTDIR/*_file
+	log_must restore_tunable SPA_NOTE_TXG_TIME
+}
+
+log_onexit cleanup
+
+log_assert "Verifiy scrub, -E, and -S show expected status."
+
+log_must save_tunable SPA_NOTE_TXG_TIME
+log_must set_tunable64 SPA_NOTE_TXG_TIME 30
+
+typeset -a date_list
+for i in `seq 0 2`; do
+	log_must sleep 60
+	log_must zpool export $TESTPOOL
+	log_must zpool import $TESTPOOL
+	date_list+=("$(date '+%Y-%m-%d %H:%M')")
+
+	log_must file_write -o create -f"$TESTDIR/${i}_file" \
+	    -b 512 -c 2048 -dR
+
+	log_must sleep 60
+	log_must zpool export $TESTPOOL
+	log_must zpool import $TESTPOOL
+	date_list+=("$(date '+%Y-%m-%d %H:%M')")
+done
+
+for i in `seq 0 2`; do
+	log_must zinject -t data -e checksum -f 100 $TESTDIR/${i}_file
+done
+
+for i in `seq 0 2`; do
+	log_must zpool scrub -w -S "${date_list[$((i * 2))]}" -E "${date_list[$((i * 2 + 1))]}" $TESTPOOL
+	log_must eval "zpool status -v $TESTPOOL | grep '${i}_file'"
+	for j in `seq 0 2`; do
+		if [ $i == $j ]; then
+			continue
+		fi
+		log_mustnot eval "zpool status -v $TESTPOOL | grep '${j}_file'"
+	done
+done
+
+log_pass "Verified scrub, -E, and -S show expected status."

From 4ae8bf406b6036cb16035aee56aa993ef35d200d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 17 Jul 2025 12:50:54 -0400
Subject: [PATCH 70/72] Allow physical rewrite without logical

During regular block writes ZFS sets both logical and physical
birth times equal to the current TXG.  During dedup and block
cloning logical birth time is still set to the current TXG, but
physical may be copied from the original block that was used.
This represents the fact that logically user data has changed,
but physically it is the same old block.

But block rewrite introduces a new situation, when block is not
changed logically, but stored in a different place of the pool.
From ARC, scrub and some other perspectives this is a new block,
but for example for user applications or incremental replication
it is not.  A somewhat similar thing happens during the remap phase of
device removal, but in that case space blocks are still accounted
as allocated at their logical birth times.

This patch introduces a new "rewrite" flag in the block pointer
structure, allowing to differentiate physical rewrite (when the
block is actually reallocated at the physical birth time) from
the device removal case (when the logical birth time is used).

The new functionality is not used at this point, and the only
expected change is that the error log is now kept in terms of
physical birth times, rather than logical, since if a block with
logged error was somehow rewritten, then the previous error does
not matter any more.

This change also introduces a new TRAVERSE_LOGICAL flag to the
traverse code, allowing zfs send, redact and diff to work in
context of logical birth times, ignoring physical-only rewrites.
It also changes nothing at this point due to lack of those writes,
but they will come in a following patch.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17565
---
 cmd/zdb/zdb.c              | 16 ++++-----
 cmd/zdb/zdb_il.c           |  8 ++---
 include/sys/dmu_traverse.h |  7 ++++
 include/sys/spa.h          | 70 +++++++++++++++++++++++++++-----------
 lib/libzdb/libzdb.c        |  4 +--
 module/zfs/arc.c           | 10 +++---
 module/zfs/bpobj.c         |  4 +--
 module/zfs/dbuf.c          | 15 ++++----
 module/zfs/ddt.c           | 15 ++++----
 module/zfs/dmu.c           | 10 +++---
 module/zfs/dmu_diff.c      |  4 +--
 module/zfs/dmu_recv.c      |  4 +--
 module/zfs/dmu_redact.c    |  4 +--
 module/zfs/dmu_send.c      |  4 +--
 module/zfs/dmu_traverse.c  | 17 ++++++---
 module/zfs/dsl_bookmark.c  |  2 +-
 module/zfs/dsl_dataset.c   | 16 ++++-----
 module/zfs/dsl_deadlist.c  |  4 +--
 module/zfs/dsl_destroy.c   | 12 +++----
 module/zfs/dsl_pool.c      |  2 +-
 module/zfs/dsl_scan.c      | 27 +++++++++------
 module/zfs/metaslab.c      | 22 +++++++++---
 module/zfs/spa.c           |  4 +--
 module/zfs/spa_errlog.c    |  4 +--
 module/zfs/vdev_mirror.c   |  2 +-
 module/zfs/vdev_raidz.c    | 10 ++----
 module/zfs/zil.c           | 12 +++----
 module/zfs/zio.c           | 38 ++++++++++-----------
 module/zfs/zio_checksum.c  |  2 +-
 29 files changed, 205 insertions(+), 144 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 6439b1bc96c..bf44d9c322b 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -208,7 +208,7 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
 				sublivelist_verify_block_t svb = {
 				    .svb_dva = bp->blk_dva[i],
 				    .svb_allocated_txg =
-				    BP_GET_LOGICAL_BIRTH(bp)
+				    BP_GET_BIRTH(bp)
 				};
 
 				if (zfs_btree_find(&sv->sv_leftover, &svb,
@@ -2569,7 +2569,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
-		    (u_longlong_t)BP_GET_BIRTH(bp));
+		    (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp));
 		if (bp_freed)
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
@@ -2619,7 +2619,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
 {
 	int err = 0;
 
-	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
+	if (BP_GET_BIRTH(bp) == 0)
 		return (0);
 
 	print_indirect(spa, bp, zb, dnp);
@@ -2807,7 +2807,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
+	if (BP_GET_BIRTH(bp) != 0) {
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("\t%s\n", blkbuf);
 	}
@@ -2848,7 +2848,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
+	ASSERT(BP_GET_BIRTH(bp) != 0);
 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
 	(void) printf("\t%s\n", blkbuf);
 	return (0);
@@ -5922,11 +5922,11 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		 * entry back to the block pointer before we claim it.
 		 */
 		if (v == DDT_PHYS_FLAT) {
-			ASSERT3U(BP_GET_BIRTH(bp), ==,
+			ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,
 			    ddt_phys_birth(dde->dde_phys, v));
 			tempbp = *bp;
 			ddt_bp_fill(dde->dde_phys, v, &tempbp,
-			    BP_GET_BIRTH(bp));
+			    BP_GET_PHYSICAL_BIRTH(bp));
 			bp = &tempbp;
 		}
 
@@ -6152,7 +6152,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
-	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
+	if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 6b90b08ca1b..62e290cd122 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -176,7 +176,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >=
 		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
@@ -189,7 +189,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
 			(void) printf("%s<hole>\n", tab_prefix);
 			return;
 		}
-		if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
+		if (BP_GET_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
 			(void) printf("%s<block already committed>\n",
 			    tab_prefix);
 			return;
@@ -240,7 +240,7 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >=
 		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
@@ -476,7 +476,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 
 	if (claim_txg != 0)
 		claim = "already claimed";
-	else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
+	else if (BP_GET_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
 		claim = "will claim";
 	else
 		claim = "won't claim";
diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h
index 3196b2addee..70cafa4c74f 100644
--- a/include/sys/dmu_traverse.h
+++ b/include/sys/dmu_traverse.h
@@ -59,6 +59,13 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
  */
 #define	TRAVERSE_NO_DECRYPT		(1<<5)
 
+/*
+ * Always use logical birth time for birth time comparisons.  This is useful
+ * for operations that care about user data changes rather than physical
+ * block rewrites (e.g., incremental replication).
+ */
+#define	TRAVERSE_LOGICAL		(1<<6)
+
 /* Special traverse error return value to indicate skipping of children */
 #define	TRAVERSE_VISIT_NO_CHILDREN	-1
 
diff --git a/include/sys/spa.h b/include/sys/spa.h
index e0eed831d30..db6de332ae6 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -140,7 +140,7 @@ typedef struct zio_cksum_salt {
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 7	|			padding					|
+ * 7	|R|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -175,6 +175,7 @@ typedef struct zio_cksum_salt {
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
+ * R		rewrite (reallocated/rewritten at phys birth TXG)
  * phys birth	txg when dva[0] was written; zero if same as logical birth txg
  *              note that typically all the dva's would be written in this
  *              txg, but they could be different if they were moved by
@@ -204,7 +205,7 @@ typedef struct zio_cksum_salt {
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 7	|			padding					|
+ * 7	|R|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -373,7 +374,8 @@ typedef enum bp_embedded_type {
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
-	uint64_t	blk_pad[2];	/* Extra space for the future	    */
+	uint64_t	blk_prop2;	/* additional properties	    */
+	uint64_t	blk_pad;	/* Extra space for the future	    */
 	uint64_t	blk_birth_word[2];
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
@@ -476,32 +478,51 @@ typedef struct blkptr {
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
+/*
+ * Block birth time macros for different use cases:
+ * - BP_GET_LOGICAL_BIRTH(): When the block was logically modified by user.
+ *   To be used with a focus on user data, like incremental replication.
+ * - BP_GET_PHYSICAL_BIRTH(): When the block was physically written to disks.
+ *   For regular writes is equal to logical birth.  For dedup and block cloning
+ *   can be smaller than logical birth.  For remapped and rewritten blocks can
+ *   be bigger. To be used with focus on physical disk content: ARC, DDT, scrub.
+ * - BP_GET_RAW_PHYSICAL_BIRTH(): Raw physical birth value.  Zero if equal
+ *   to logical birth.  Should only be used for BP copying and debugging.
+ * - BP_GET_BIRTH(): When the block was allocated, which is a physical birth
+ *   for rewritten blocks (rewrite flag set) or logical birth otherwise.
+ */
 #define	BP_GET_LOGICAL_BIRTH(bp)	(bp)->blk_birth_word[1]
 #define	BP_SET_LOGICAL_BIRTH(bp, x)	((bp)->blk_birth_word[1] = (x))
 
-#define	BP_GET_PHYSICAL_BIRTH(bp)	(bp)->blk_birth_word[0]
+#define	BP_GET_RAW_PHYSICAL_BIRTH(bp)	(bp)->blk_birth_word[0]
 #define	BP_SET_PHYSICAL_BIRTH(bp, x)	((bp)->blk_birth_word[0] = (x))
 
-#define	BP_GET_BIRTH(bp)					\
-	(BP_IS_EMBEDDED(bp) ? 0 : 				\
-	BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) :	\
+#define	BP_GET_PHYSICAL_BIRTH(bp)					\
+	(BP_IS_EMBEDDED(bp) ? 0 : 					\
+	BP_GET_RAW_PHYSICAL_BIRTH(bp) ? BP_GET_RAW_PHYSICAL_BIRTH(bp) :	\
 	BP_GET_LOGICAL_BIRTH(bp))
 
-#define	BP_SET_BIRTH(bp, logical, physical)	\
-{						\
-	ASSERT(!BP_IS_EMBEDDED(bp));		\
-	BP_SET_LOGICAL_BIRTH(bp, logical);	\
-	BP_SET_PHYSICAL_BIRTH(bp, 		\
-	    ((logical) == (physical) ? 0 : (physical))); \
+#define	BP_GET_BIRTH(bp)					\
+	((BP_IS_EMBEDDED(bp) || !BP_GET_REWRITE(bp)) ?		\
+	BP_GET_LOGICAL_BIRTH(bp) : BP_GET_PHYSICAL_BIRTH(bp))
+
+#define	BP_SET_BIRTH(bp, logical, physical)			\
+{								\
+	ASSERT(!BP_IS_EMBEDDED(bp));				\
+	BP_SET_LOGICAL_BIRTH(bp, logical);			\
+	BP_SET_PHYSICAL_BIRTH(bp, 				\
+	    ((logical) == (physical) ? 0 : (physical)));	\
 }
 
 #define	BP_GET_FILL(bp)				\
-	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
-	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
+	(BP_IS_EMBEDDED(bp) ? 1 : 			\
+	BP_IS_ENCRYPTED(bp) ? BF64_GET((bp)->blk_fill, 0, 32) : \
+	(bp)->blk_fill)
 
 #define	BP_SET_FILL(bp, fill)			\
 {						\
-	if (BP_IS_ENCRYPTED(bp))			\
+	ASSERT(!BP_IS_EMBEDDED(bp));		\
+	if (BP_IS_ENCRYPTED(bp))		\
 		BF64_SET((bp)->blk_fill, 0, 32, fill); \
 	else					\
 		(bp)->blk_fill = fill;		\
@@ -516,6 +537,15 @@ typedef struct blkptr {
 	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
 }
 
+#define	BP_GET_REWRITE(bp)			\
+	(BP_IS_EMBEDDED(bp) ? 0 : BF64_GET((bp)->blk_prop2, 63, 1))
+
+#define	BP_SET_REWRITE(bp, x)			\
+{						\
+	ASSERT(!BP_IS_EMBEDDED(bp));		\
+	BF64_SET((bp)->blk_prop2, 63, 1, x);	\
+}
+
 #define	BP_IS_METADATA(bp)	\
 	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 
@@ -545,7 +575,7 @@ typedef struct blkptr {
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
-	(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) &&	\
+	(BP_GET_PHYSICAL_BIRTH(bp1) == BP_GET_PHYSICAL_BIRTH(bp2) &&	\
 	BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
@@ -588,8 +618,8 @@ typedef struct blkptr {
 {						\
 	BP_ZERO_DVAS(bp);			\
 	(bp)->blk_prop = 0;			\
-	(bp)->blk_pad[0] = 0;			\
-	(bp)->blk_pad[1] = 0;			\
+	(bp)->blk_prop2 = 0;			\
+	(bp)->blk_pad = 0;			\
 	(bp)->blk_birth_word[0] = 0;		\
 	(bp)->blk_birth_word[1] = 0;		\
 	(bp)->blk_fill = 0;			\
@@ -696,7 +726,7 @@ typedef struct blkptr {
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),		\
-		    (u_longlong_t)BP_GET_BIRTH(bp),			\
+		    (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp),		\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
diff --git a/lib/libzdb/libzdb.c b/lib/libzdb/libzdb.c
index 12144dc65e7..cca1327b1b0 100644
--- a/lib/libzdb/libzdb.c
+++ b/lib/libzdb/libzdb.c
@@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg)
 	 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
 	 * it's possible the offsets are equal. In that case, sort by txg
 	 */
-	if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) {
+	if (BP_GET_BIRTH(l) < BP_GET_BIRTH(r)) {
 		return (-1);
-	} else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) {
+	} else if (BP_GET_BIRTH(l) > BP_GET_BIRTH(r)) {
 		return (+1);
 	}
 	return (0);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index a2cb3b8a53e..3483be64ec5 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1052,7 +1052,7 @@ static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t birth = BP_GET_BIRTH(bp);
+	uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *hdr;
@@ -5587,7 +5587,7 @@ arc_read_done(zio_t *zio)
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
-		ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
@@ -5690,7 +5690,7 @@ arc_read_done(zio_t *zio)
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
-				    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+				    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@@ -6109,7 +6109,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
-				hdr->b_birth = BP_GET_BIRTH(bp);
+				hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
@@ -6957,7 +6957,7 @@ arc_write_done(zio_t *zio)
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-			hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
+			hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(HDR_EMPTY(hdr));
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 8c19de93f12..0a8a077edf6 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -954,8 +954,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	(void) bp_freed, (void) tx;
 	struct space_range_arg *sra = arg;
 
-	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
-	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
+	if (BP_GET_BIRTH(bp) > sra->mintxg &&
+	    BP_GET_BIRTH(bp) <= sra->maxtxg) {
 		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 			sra->used += bp_get_dsize_sync(sra->spa, bp);
 		else
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index a4cc79c35c2..a96666a4675 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1243,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[2]));
 					ASSERT0(bp->blk_fill);
-					ASSERT0(bp->blk_pad[0]);
-					ASSERT0(bp->blk_pad[1]);
 					ASSERT(!BP_IS_EMBEDDED(bp));
 					ASSERT(BP_IS_HOLE(bp));
-					ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+					ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
 				}
 			}
 		}
@@ -1623,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
 	 */
 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
 		spa_log_error(db->db_objset->os_spa, &zb,
-		    BP_GET_LOGICAL_BIRTH(bp));
+		    BP_GET_PHYSICAL_BIRTH(bp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
@@ -4907,7 +4905,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
-	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
+	if (BP_GET_BIRTH(bp) != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -5194,7 +5192,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	drica.drica_os = dn->dn_objset;
-	drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
+	drica.drica_blk_birth = BP_GET_BIRTH(bp);
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
@@ -5209,8 +5207,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 		if (dn->dn_objset != spa_meta_objset(spa)) {
 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-			    BP_GET_LOGICAL_BIRTH(bp) >
-			    ds->ds_dir->dd_origin_txg) {
+			    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
 				ASSERT(spa_feature_is_enabled(spa,
@@ -5328,7 +5325,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
+	ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);
 	ASSERT(pio);
 
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 60cbb7755a7..e0b9fc3951f 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -724,10 +724,13 @@ ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
 		dvas[2] = bp->blk_dva[2];
 
 	if (ddt_phys_birth(ddp, v) == 0) {
-		if (v == DDT_PHYS_FLAT)
-			ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
-		else
-			ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
+		if (v == DDT_PHYS_FLAT) {
+			ddp->ddp_flat.ddp_phys_birth =
+			    BP_GET_PHYSICAL_BIRTH(bp);
+		} else {
+			ddp->ddp_trad[v].ddp_phys_birth =
+			    BP_GET_PHYSICAL_BIRTH(bp);
+		}
 	}
 }
 
@@ -891,14 +894,14 @@ ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
 
 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
-		    BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
+		    BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
 			return (DDT_PHYS_FLAT);
 		}
 	} else /* traditional phys */ {
 		for (int p = 0; p < DDT_PHYS_MAX; p++) {
 			if (DVA_EQUAL(BP_IDENTITY(bp),
 			    &ddp->ddp_trad[p].ddp_dva[0]) &&
-			    BP_GET_BIRTH(bp) ==
+			    BP_GET_PHYSICAL_BIRTH(bp) ==
 			    ddp->ddp_trad[p].ddp_phys_birth) {
 				return (p);
 			}
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 21c46532813..690227a3093 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1966,7 +1966,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
+			ASSERT(BP_GET_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
@@ -2655,11 +2655,12 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
-		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+		if (BP_GET_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
-		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+		if (BP_GET_PHYSICAL_BIRTH(bp) >
+		    spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
@@ -2731,7 +2732,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
 				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
-				    BP_GET_BIRTH(bp));
+				    BP_GET_PHYSICAL_BIRTH(bp));
+				BP_SET_REWRITE(&dl->dr_overridden_by, 0);
 			} else {
 				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
 				    dr->dr_txg);
diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c
index 86f751e886c..fb13b2f87f5 100644
--- a/module/zfs/dmu_diff.c
+++ b/module/zfs/dmu_diff.c
@@ -224,8 +224,8 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name,
 	 * call the ZFS_IOC_OBJ_TO_STATS ioctl.
 	 */
 	error = traverse_dataset(tosnap, fromtxg,
-	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT,
-	    diff_cb, &da);
+	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT |
+	    TRAVERSE_LOGICAL, diff_cb, &da);
 
 	if (error != 0) {
 		da.da_err = error;
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index afc9823decc..73227b58c14 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1403,7 +1403,7 @@ corrective_read_done(zio_t *zio)
 	/* Corruption corrected; update error log if needed */
 	if (zio->io_error == 0) {
 		spa_remove_error(data->spa, &data->zb,
-		    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+		    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 	}
 	kmem_free(data, sizeof (cr_cb_data_t));
 	abd_free(zio->io_abd);
@@ -1530,7 +1530,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 	}
 	rrd->abd = abd;
 
-	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_BIRTH(bp), bp,
 	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
 	    &zb);
 
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
index 65443d112f2..9226ac9e4b8 100644
--- a/module/zfs/dmu_redact.c
+++ b/module/zfs/dmu_redact.c
@@ -370,8 +370,8 @@ redact_traverse_thread(void *arg)
 #endif
 
 	err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
-	    &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
-	    redact_cb, rt_arg);
+	    &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_LOGICAL, redact_cb, rt_arg);
 
 	if (err != EINTR)
 		rt_arg->error_code = err;
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 4f27f3df0e5..deeba29e159 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1084,7 +1084,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	 */
 	if (sta->os->os_encrypted &&
 	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
-		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+		spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 		return (SET_ERROR(EIO));
 	}
 
@@ -1210,7 +1210,7 @@ send_traverse_thread(void *arg)
 
 	err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
 	    st_arg->fromtxg, &st_arg->resume,
-	    st_arg->flags, send_cb, st_arg);
+	    st_arg->flags | TRAVERSE_LOGICAL, send_cb, st_arg);
 
 	if (err != EINTR)
 		st_arg->error_code = err;
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index f534a7dd64e..dd1df170504 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -74,6 +74,15 @@ static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);
 
+/* Birth txg used for traversal filtering; TRAVERSE_LOGICAL selects logical. */
+static inline uint64_t
+get_birth_time(traverse_data_t *td, const blkptr_t *bp)
+{
+	/* Without the flag, fall back to the generic BP_GET_BIRTH() value. */
+	return ((td->td_flags & TRAVERSE_LOGICAL) != 0 ?
+	    BP_GET_LOGICAL_BIRTH(bp) : BP_GET_BIRTH(bp));
+}
+
 static int
 traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
@@ -85,7 +94,7 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 		return (0);
 
 	if (claim_txg == 0 &&
-	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
+	    get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa))
 		return (-1);
 
 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -110,7 +119,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		if (BP_IS_HOLE(bp))
 			return (0);
 
-		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+		if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -194,7 +203,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 	 */
 	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
 		return (B_FALSE);
-	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
+	if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg)
 		return (B_FALSE);
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return (B_FALSE);
@@ -265,7 +274,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
-	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
+	} else if (get_birth_time(td, bp) <= td->td_min_txg) {
 		return (0);
 	}
 
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index e301fe19f64..fdc8b7b198f 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -1523,7 +1523,7 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		 * If the block was live (referenced) at the time of this
 		 * bookmark, add its space to the bookmark's FBN.
 		 */
-		if (BP_GET_LOGICAL_BIRTH(bp) <=
+		if (BP_GET_BIRTH(bp) <=
 		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 			mutex_enter(&dbn->dbn_lock);
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 21c8a682199..b767c964141 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -159,7 +159,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		return;
 	}
 
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+	ASSERT3U(BP_GET_BIRTH(bp), >,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
@@ -194,7 +194,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -263,7 +263,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
+	ASSERT(BP_GET_BIRTH(bp) <= tx->tx_txg);
 
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -281,7 +281,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -289,7 +289,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		bplist_append(&ds->ds_dir->dd_pending_frees, bp);
 	}
 
-	if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+	if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
 		/*
@@ -346,14 +346,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (logical birth > prev prev snap txg) prev unique += bs */
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
-		    ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
+		    ds->ds_object && BP_GET_BIRTH(bp) >
 		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
+		if (BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
@@ -2944,7 +2944,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 	if (snap == NULL)
 		return (B_FALSE);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
+	birth = BP_GET_BIRTH(dsl_dataset_get_blkptr(ds));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
 		objset_t *os, *os_snap;
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 3113d932fb6..9ffc998ac17 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -484,7 +484,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
 
-	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
+	dle_tofind.dle_mintxg = BP_GET_BIRTH(bp);
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -493,7 +493,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
 
 	if (dle == NULL) {
 		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
-		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
+		    bp, (longlong_t)BP_GET_BIRTH(bp));
 		dle = avl_first(&dl->dl_tree);
 	}
 
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index f5ec93b2dc5..fff49c97f4d 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -133,11 +133,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	if (BP_GET_LOGICAL_BIRTH(bp) <=
+	if (BP_GET_BIRTH(bp) <=
 	    dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
-		    BP_GET_LOGICAL_BIRTH(bp) >
+		    BP_GET_BIRTH(bp) >
 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
 			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
@@ -315,8 +315,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
-	    tx->tx_txg);
+	ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
 
@@ -730,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+		ASSERT3U(BP_GET_BIRTH(bp), >,
 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
@@ -1020,8 +1019,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 	ASSERT(ds->ds_prev == NULL ||
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
-	    tx->tx_txg);
+	ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index f1088d87208..4f1f66b835f 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -1056,7 +1056,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 		 * will be wrong.
 		 */
 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-		ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
+		ASSERT0(BP_GET_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		/* The origin doesn't get attached to itself */
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 1b2cd3e361d..5052992d775 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -454,7 +454,7 @@ static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	sio->sio_blk_prop = bp->blk_prop;
-	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+	sio->sio_phys_birth = BP_GET_RAW_PHYSICAL_BIRTH(bp);
 	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_cksum = bp->blk_cksum;
 	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
@@ -1768,7 +1768,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 
 	ASSERT(!BP_IS_REDACTED(bp));
 	if (BP_IS_HOLE(bp) ||
-	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+	    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
@@ -1778,7 +1778,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	 * scrub there's nothing to do to it).
 	 */
 	if (claim_txg == 0 &&
-	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
+	    BP_GET_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
 		return (0);
 
 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1804,7 +1804,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 
 		ASSERT(!BP_IS_REDACTED(bp));
 		if (BP_IS_HOLE(bp) ||
-		    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+		    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 			return (0);
 
 		/*
@@ -1812,7 +1812,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		 * already txg sync'ed (but this log block contains
 		 * other records that are not synced)
 		 */
-		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+		if (claim_txg == 0 || BP_GET_BIRTH(bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -1952,7 +1952,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 		return;
 
 	if (BP_IS_HOLE(bp) ||
-	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
+	    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
 		return;
@@ -2223,7 +2223,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 	if (dnp != NULL &&
 	    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+		spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -2319,7 +2319,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+		spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -2396,7 +2396,12 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
-	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
+	/*
+	 * Recurse into any block that was written, logically or physically,
+	 * after cur_min_txg.  Logical birth matters for the traversal (finding
+	 * what changed), while physical birth matters for the actual scan.
+	 */
+	if (BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
@@ -2422,7 +2427,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+	if (BP_GET_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
@@ -4806,7 +4811,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_GET_BIRTH(bp);
+	uint64_t phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	boolean_t needs_io = B_FALSE;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 102a43e1166..0e5f09b2724 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5603,7 +5603,21 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
-	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+
+	/*
+	 * For rewritten blocks, use the old physical birth as the new logical
+	 * birth (representing when the space was allocated) and the removal
+	 * time as the new physical birth (representing when it was actually
+	 * written).
+	 */
+	if (BP_GET_REWRITE(bp)) {
+		uint64_t old_physical_birth = BP_GET_PHYSICAL_BIRTH(bp);
+		ASSERT3U(old_physical_birth, <, physical_birth);
+		BP_SET_BIRTH(bp, old_physical_birth, physical_birth);
+		BP_SET_REWRITE(bp, 0);
+	} else {
+		BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+	}
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5972,7 +5986,7 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	int error = 0;
 
 	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
-	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+	ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
@@ -6034,7 +6048,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
+	ASSERT(!now || BP_GET_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
@@ -6052,7 +6066,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
-	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
+	if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index bbf474ed631..5ecb175fbd6 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2862,8 +2862,8 @@ spa_claim_notify(zio_t *zio)
 		return;
 
 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
-	if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
-		spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
+	if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp))
+		spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp);
 	mutex_exit(&spa->spa_props_lock);
 }
 
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 3e08f261fda..7252fd534bd 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -253,7 +253,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
 	if (error == 0 && BP_IS_HOLE(&bp))
 		error = SET_ERROR(ENOENT);
 
-	*birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
+	*birth_txg = BP_GET_PHYSICAL_BIRTH(&bp);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
@@ -885,7 +885,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
 		if (error == EACCES)
 			error = 0;
 		else if (!error)
-			zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
+			zep.zb_birth = BP_GET_PHYSICAL_BIRTH(&bp);
 
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 2b78340cf70..18efdaac006 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -532,7 +532,7 @@ vdev_mirror_child_select(zio_t *zio)
 	uint64_t txg = zio->io_txg;
 	int c, lowest_load;
 
-	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
+	ASSERT(zio->io_bp == NULL || BP_GET_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
 	lowest_load = INT_MAX;
 	mm->mm_preferred_cnt = 0;
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index a5fa9a60493..210cdcab1ec 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2206,11 +2206,7 @@ vdev_raidz_close(vdev_t *vd)
 
 /*
  * Return the logical width to use, given the txg in which the allocation
- * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
- * BP was allocated.  Remapped BP's (that were relocated due to device
- * removal, see remap_blkptr_cb()), will have a more recent physical birth
- * which reflects when the BP was relocated, but we can ignore these because
- * they can't be on RAIDZ (device removal doesn't support RAIDZ).
+ * happened.
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
@@ -2343,7 +2339,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
-	    BP_GET_BIRTH(zio->io_bp));
+	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
@@ -2566,7 +2562,7 @@ vdev_raidz_io_start(zio_t *zio)
 	raidz_map_t *rm;
 
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
-	    BP_GET_BIRTH(zio->io_bp));
+	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 3aa188a9581..6e4f8425740 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -589,7 +589,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
-	if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
+	if (BP_GET_BIRTH(bp) >= first_txg)
 		return (-1);
 
 	if (zil_bp_tree_add(zilog, bp) != 0)
@@ -615,7 +615,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
-	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
+	if (BP_IS_HOLE(bp) || BP_GET_BIRTH(bp) < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
@@ -640,7 +640,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
-	if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
+	if (BP_GET_BIRTH(&lr->lr_blkptr) >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
@@ -687,7 +687,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
 		 * just in case lets be safe and just stop here now instead of
 		 * corrupting the pool.
 		 */
-		if (BP_GET_BIRTH(bp) >= first_txg)
+		if (BP_GET_PHYSICAL_BIRTH(bp) >= first_txg)
 			return (SET_ERROR(ENOENT));
 
 		/*
@@ -742,7 +742,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
-	if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+	if (BP_GET_BIRTH(bp) >= claim_txg &&
 	    zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 	}
@@ -1997,7 +1997,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 		    &slog);
 	}
 	if (error == 0) {
-		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
+		ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0fde2d6f7c1..41e0dc0004d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -692,7 +692,7 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
-			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+			    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
@@ -1104,7 +1104,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
 	    "DVA[1]=%#llx/%#llx "
 	    "DVA[2]=%#llx/%#llx "
 	    "prop=%#llx "
-	    "pad=%#llx,%#llx "
+	    "prop2=%#llx "
+	    "pad=%#llx "
 	    "phys_birth=%#llx "
 	    "birth=%#llx "
 	    "fill=%#llx "
@@ -1117,9 +1118,9 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
 	    (long long)bp->blk_dva[2].dva_word[0],
 	    (long long)bp->blk_dva[2].dva_word[1],
 	    (long long)bp->blk_prop,
-	    (long long)bp->blk_pad[0],
-	    (long long)bp->blk_pad[1],
-	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
+	    (long long)bp->blk_prop2,
+	    (long long)bp->blk_pad,
+	    (long long)BP_GET_RAW_PHYSICAL_BIRTH(bp),
 	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
@@ -1334,7 +1335,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 {
 	zio_t *zio;
 
-	zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
+	zio = zio_create(pio, spa, BP_GET_PHYSICAL_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1854,7 +1855,7 @@ zio_write_bp_init(zio_t *zio)
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
-		ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
+		ASSERT(BP_GET_BIRTH(bp) != zio->io_txg);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1942,7 +1943,7 @@ zio_write_compress(zio_t *zio)
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
-	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
+	if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
@@ -2079,7 +2080,7 @@ zio_write_compress(zio_t *zio)
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
+	if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
@@ -3894,7 +3895,7 @@ zio_ddt_write(zio_t *zio)
 			 * block and leave.
 			 */
 			if (have_dvas == 0) {
-				ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
+				ASSERT(BP_GET_BIRTH(bp) == txg);
 				ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 				ddt_phys_extend(ddp, v, bp);
 				ddt_phys_addref(ddp, v);
@@ -4224,8 +4225,10 @@ zio_dva_allocate(zio_t *zio)
 		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
 		memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
 		    3 * sizeof (dva_t));
-		BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
-		    BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+		BP_SET_LOGICAL_BIRTH(zio->io_bp,
+		    BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig));
+		BP_SET_PHYSICAL_BIRTH(zio->io_bp,
+		    BP_GET_RAW_PHYSICAL_BIRTH(&zio->io_bp_orig));
 		return (zio);
 	}
 
@@ -4385,12 +4388,11 @@ zio_dva_claim(zio_t *zio)
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
-	ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
+	ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp)) {
-		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
-		    B_TRUE);
+		metaslab_free(zio->io_spa, bp, BP_GET_BIRTH(bp), B_TRUE);
 	}
 
 	if (gn != NULL) {
@@ -5268,7 +5270,7 @@ zio_ready(zio_t *zio)
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+		ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
 		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
@@ -5423,8 +5425,6 @@ zio_done(zio_t *zio)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
-		ASSERT(zio->io_bp->blk_pad[0] == 0);
-		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
@@ -5539,7 +5539,7 @@ zio_done(zio_t *zio)
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
-			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+			    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 4cb9da0db0b..63d0c6dadd4 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -279,7 +279,7 @@ static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t txg = BP_GET_BIRTH(bp);
+	uint64_t txg = BP_GET_PHYSICAL_BIRTH(bp);
 
 	ASSERT(BP_IS_GANG(bp));
 

From 60f714e6e2cc349e120e2cb6cd29e95718106735 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Wed, 23 Jul 2025 15:51:00 -0400
Subject: [PATCH 71/72] Implement physical rewrites

Based on the previous commit, this implements the `zfs rewrite -P`
flag, which makes ZFS keep blocks' logical birth times while rewriting
files.  This should exclude the rewritten blocks from incremental
sends, snapshot diffs, etc.  At the same time, snapshot space usage
will reflect the additional space used by the newly allocated blocks.

Since this begins to use the new "rewrite" flag in the block pointers,
this commit introduces a new read-compatible per-dataset feature,
physical_rewrite.  It must be enabled for the command to not fail;
it is activated on first use and deactivated on deletion of the
last affected dataset.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:  Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17565
---
 cmd/zfs/zfs_main.c                            |   7 +-
 include/sys/dbuf.h                            |   1 +
 include/sys/dmu.h                             |   1 +
 include/sys/fs/zfs.h                          |   3 +
 include/sys/zio.h                             |   1 +
 include/zfeature_common.h                     |   1 +
 lib/libzfs/libzfs.abi                         |  11 +-
 man/man7/zpool-features.7                     |  17 +++
 man/man8/zfs-rewrite.8                        |  28 +++--
 module/zcommon/zfeature_common.c              |  12 +++
 module/zfs/dbuf.c                             |  57 ++++++++++
 module/zfs/dmu.c                              |   1 +
 module/zfs/zfs_vnops.c                        |  16 ++-
 module/zfs/zio.c                              |  26 +++++
 tests/runfiles/common.run                     |   2 +-
 tests/runfiles/sanity.run                     |   2 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../zfs_rewrite/zfs_rewrite_physical.ksh      | 100 ++++++++++++++++++
 .../cli_root/zpool_get/zpool_get.cfg          |   1 +
 19 files changed, 270 insertions(+), 18 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 81727224b04..533b355fa85 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -440,7 +440,7 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tredact <snapshot> <bookmark> "
 		    "<redaction_snapshot> ...\n"));
 	case HELP_REWRITE:
-		return (gettext("\trewrite [-rvx] [-o <offset>] [-l <length>] "
+		return (gettext("\trewrite [-Prvx] [-o <offset>] [-l <length>] "
 		    "<directory|file ...>\n"));
 	case HELP_JAIL:
 		return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
@@ -9177,8 +9177,11 @@ zfs_do_rewrite(int argc, char **argv)
 	zfs_rewrite_args_t args;
 	memset(&args, 0, sizeof (args));
 
-	while ((c = getopt(argc, argv, "l:o:rvx")) != -1) {
+	while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) {
 		switch (c) {
+		case 'P':
+			args.flags |= ZFS_REWRITE_PHYSICAL;
+			break;
 		case 'l':
 			args.len = strtoll(optarg, NULL, 0);
 			break;
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 756459b2fbb..baf3b150833 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -164,6 +164,7 @@ typedef struct dbuf_dirty_record {
 			boolean_t dr_nopwrite;
 			boolean_t dr_brtwrite;
 			boolean_t dr_diowrite;
+			boolean_t dr_rewrite;
 			boolean_t dr_has_raw_params;
 
 			/* Override and raw params are mutually exclusive. */
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 7dc6daaf06e..7c2024a16d8 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -825,6 +825,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
+void dmu_buf_will_rewrite(dmu_buf_t *db, dmu_tx_t *tx);
 boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c8deb5be419..fc359c10365 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1627,6 +1627,9 @@ typedef struct zfs_rewrite_args {
 	uint64_t	arg;
 } zfs_rewrite_args_t;
 
+/* zfs_rewrite_args flags */
+#define	ZFS_REWRITE_PHYSICAL	0x1	/* Preserve logical birth time. */
+
 #define	ZFS_IOC_REWRITE		_IOW(0x83, 3, zfs_rewrite_args_t)
 
 /*
diff --git a/include/sys/zio.h b/include/sys/zio.h
index b139c9de485..a3368034695 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -374,6 +374,7 @@ typedef struct zio_prop {
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	boolean_t		zp_direct_write;
+	boolean_t		zp_rewrite;
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 4877df4b114..56382ca85b5 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -89,6 +89,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_LARGE_MICROZAP,
 	SPA_FEATURE_DYNAMIC_GANG_HEADER,
 	SPA_FEATURE_BLOCK_CLONING_ENDIAN,
+	SPA_FEATURE_PHYSICAL_REWRITE,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 37d22402e77..ba161d1ef10 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -639,7 +639,7 @@
     <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2576' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2632' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -6399,7 +6399,8 @@
       <enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/>
       <enumerator name='SPA_FEATURE_DYNAMIC_GANG_HEADER' value='44'/>
       <enumerator name='SPA_FEATURE_BLOCK_CLONING_ENDIAN' value='45'/>
-      <enumerator name='SPA_FEATURES' value='46'/>
+      <enumerator name='SPA_FEATURE_PHYSICAL_REWRITE' value='46'/>
+      <enumerator name='SPA_FEATURES' value='47'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@@ -9614,8 +9615,8 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='20608' id='b9408bab'>
-      <subrange length='46' type-id='7359adad' id='8b86bc1b'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='21056' id='fd43354e'>
+      <subrange length='47' type-id='7359adad' id='8f8900fe'/>
     </array-type-def>
     <enum-decl name='zfeature_flags' id='6db816a4'>
       <underlying-type type-id='9cac1fee'/>
@@ -9693,7 +9694,7 @@
     <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
     <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
     <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
-    <var-decl name='spa_feature_table' type-id='b9408bab' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
+    <var-decl name='spa_feature_table' type-id='fd43354e' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
     <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
     <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 66aa100b714..10dfd1f9293 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -853,6 +853,23 @@ when the
 command is used on a top-level vdev, and will never return to being
 .Sy enabled .
 .
+.feature com.truenas physical_rewrite yes extensible_dataset
+This feature enables physical block rewriting that preserves logical birth
+times, avoiding unnecessary inclusion of rewritten blocks in incremental
+.Nm zfs Cm send
+streams.
+When enabled, the
+.Nm zfs Cm rewrite Fl P
+command can be used.
+.Pp
+This feature becomes
+.Sy active
+the first time
+.Nm zfs Cm rewrite Fl P
+is used on any dataset, and will return to being
+.Sy enabled
+once all datasets that have ever used physical rewrite are destroyed.
+.
 .feature org.zfsonlinux project_quota yes extensible_dataset
 This feature allows administrators to account the spaces and objects usage
 information against the project identifier
diff --git a/man/man8/zfs-rewrite.8 b/man/man8/zfs-rewrite.8
index 423d6d439e2..a3a037f3794 100644
--- a/man/man8/zfs-rewrite.8
+++ b/man/man8/zfs-rewrite.8
@@ -31,7 +31,7 @@
 .Sh SYNOPSIS
 .Nm zfs
 .Cm rewrite
-.Oo Fl rvx Ns Oc
+.Oo Fl Prvx Ns Oc
 .Op Fl l Ar length
 .Op Fl o Ar offset
 .Ar file Ns | Ns Ar directory Ns …
@@ -43,6 +43,15 @@ as is without modification at a new location and possibly with new
 properties, such as checksum, compression, dedup, copies, etc,
 as if they were atomically read and written back.
 .Bl -tag -width "-r"
+.It Fl P
+Perform physical rewrite, preserving logical birth time of blocks.
+By default, rewrite updates logical birth times, making blocks appear
+as modified in snapshots and incremental send streams.
+Physical rewrite preserves logical birth times, avoiding unnecessary
+inclusion in incremental streams.
+Physical rewrite requires the
+.Sy physical_rewrite
+feature to be enabled on the pool.
 .It Fl l Ar length
 Rewrite at most this number of bytes.
 .It Fl o Ar offset
@@ -60,17 +69,22 @@ same as some property changes may increase pool space usage.
 Holes that were never written or were previously zero-compressed are
 not rewritten and will remain holes even if compression is disabled.
 .Pp
-Rewritten blocks will be seen as modified in next snapshot and as such
-included into the incremental
-.Nm zfs Cm send
-stream.
-.Pp
 If a
 .Fl l
 or
 .Fl o
 value request a rewrite to regions past the end of the file, then those
 regions are silently ignored, and no error is reported.
+.Pp
+By default, rewritten blocks update their logical birth time,
+meaning they will be included in incremental
+.Nm zfs Cm send
+streams as modified data.
+When the
+.Fl P
+flag is used, rewritten blocks preserve their logical birth time, since
+there are no user data changes.
 .
 .Sh SEE ALSO
-.Xr zfsprops 7
+.Xr zfsprops 7 ,
+.Xr zpool-features 7
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 0b37530b0e1..6ba9892eeb6 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -798,6 +798,18 @@ zpool_feature_init(void)
 	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE,
 	    ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
 
+	{
+		static const spa_feature_t physical_rewrite_deps[] = {
+			SPA_FEATURE_EXTENSIBLE_DATASET,
+			SPA_FEATURE_NONE
+		};
+		zfeature_register(SPA_FEATURE_PHYSICAL_REWRITE,
+		    "com.truenas:physical_rewrite", "physical_rewrite",
+		    "Support for preserving logical birth time during rewrite.",
+		    ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
+		    ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures);
+	}
+
 	zfs_mod_list_supported_free(sfeatures);
 }
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index a96666a4675..432c99cec96 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2160,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
 			ASSERT(arc_released(db->db_buf));
 			arc_buf_thaw(db->db_buf);
 		}
+
+		/*
+		 * Clear the rewrite flag since this is now a logical
+		 * modification.
+		 */
+		dr->dt.dl.dr_rewrite = B_FALSE;
 	}
 }
 
@@ -2707,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
 }
 
+void
+dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+	/*
+	 * If the dbuf already has a dirty record for this txg, it will be
+	 * written anyway, so return without marking it for rewrite.
+	 */
+	mutex_enter(&db->db_mtx);
+	if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * No dirty record for this txg was found, so dirty the dbuf and then
+	 * flag its level-0 dirty record for rewrite (preserve logical birth).
+	 */
+	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
+
+	mutex_enter(&db->db_mtx);
+	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+	if (dr != NULL && db->db_level == 0)
+		dr->dt.dl.dr_rewrite = B_TRUE;
+	mutex_exit(&db->db_mtx);
+}
+
 boolean_t
 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
@@ -5338,6 +5376,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 
+	/*
+	 * Set rewrite properties for zfs_rewrite() operations.
+	 */
+	if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
+		zp.zp_rewrite = B_TRUE;
+
+		/*
+		 * Mark physical rewrite feature for activation.
+		 * This will be activated automatically during dataset sync.
+		 */
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		if (!dsl_dataset_feature_is_active(ds,
+		    SPA_FEATURE_PHYSICAL_REWRITE)) {
+			ds->ds_feature_activation[
+			    SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
+		}
+	}
+
 	/*
 	 * We copy the blkptr now (rather than when we instantiate the dirty
 	 * record), because its value can change between open context and
@@ -5408,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_rewrite);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
 EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 690227a3093..296e58ef9cd 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2508,6 +2508,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
+	zp->zp_rewrite = B_FALSE;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 8ad992f5b62..74aa91a4f2e 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -49,6 +49,7 @@
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_crypt.h>
+#include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
@@ -1101,13 +1102,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
 {
 	int error;
 
-	if (flags != 0 || arg != 0)
+	if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0)
 		return (SET_ERROR(EINVAL));
 
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
+	/* Check if physical rewrite is allowed */
+	spa_t *spa = zfsvfs->z_os->os_spa;
+	if ((flags & ZFS_REWRITE_PHYSICAL) &&
+	    !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) {
+		zfs_exit(zfsvfs, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
 	if (zfs_is_readonly(zfsvfs)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EROFS));
@@ -1195,7 +1204,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
 			if (dmu_buf_is_dirty(dbp[i], tx))
 				continue;
 			nw += dbp[i]->db_size;
-			dmu_buf_will_dirty(dbp[i], tx);
+			if (flags & ZFS_REWRITE_PHYSICAL)
+				dmu_buf_will_rewrite(dbp[i], tx);
+			else
+				dmu_buf_will_dirty(dbp[i], tx);
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 41e0dc0004d..218aec6093e 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3923,6 +3923,23 @@ zio_ddt_write(zio_t *zio)
 		 * then we can just use them as-is.
 		 */
 		if (have_dvas >= need_dvas) {
+			/*
+			 * For rewrite operations, try preserving the original
+			 * logical birth time.  If the result matches the
+			 * original BP, this becomes a NOP.
+			 */
+			if (zp->zp_rewrite) {
+				uint64_t orig_logical_birth =
+				    BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+				ddt_bp_fill(ddp, v, bp, orig_logical_birth);
+				if (BP_EQUAL(bp, &zio->io_bp_orig)) {
+					/* We can skip accounting. */
+					zio->io_flags |= ZIO_FLAG_NOPWRITE;
+					ddt_exit(ddt);
+					return (zio);
+				}
+			}
+
 			ddt_bp_fill(ddp, v, bp, txg);
 			ddt_phys_addref(ddp, v);
 			ddt_exit(ddt);
@@ -4355,6 +4372,15 @@ zio_dva_allocate(zio_t *zio)
 			    error);
 		}
 		zio->io_error = error;
+	} else if (zio->io_prop.zp_rewrite) {
+		/*
+		 * For rewrite operations, preserve the logical birth time
+		 * but set the physical birth time to the current txg.
+		 */
+		uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+		ASSERT3U(logical_birth, <=, zio->io_txg);
+		BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg);
+		BP_SET_REWRITE(zio->io_bp, 1);
 	}
 
 	return (zio);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index deca3c05b07..9fad8946f4f 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -308,7 +308,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rewrite]
-tests = ['zfs_rewrite']
+tests = ['zfs_rewrite', 'zfs_rewrite_physical']
 tags = ['functional', 'cli_root', 'zfs_rewrite']
 
 [tests/functional/cli_root/zfs_rollback]
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index 732f252b52d..7767c0c2d53 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -195,7 +195,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rewrite]
-tests = ['zfs_rewrite']
+tests = ['zfs_rewrite', 'zfs_rewrite_physical']
 tags = ['functional', 'cli_root', 'zfs_rewrite']
 
 [tests/functional/cli_root/zfs_rollback]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 5ab28b2d6c3..c2542287c1d 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -869,6 +869,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zfs_rewrite/cleanup.ksh \
 	functional/cli_root/zfs_rewrite/setup.ksh \
 	functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \
+	functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \
 	functional/cli_root/zfs_rollback/cleanup.ksh \
 	functional/cli_root/zfs_rollback/setup.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh
new file mode 100755
index 00000000000..142e44f5351
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh
@@ -0,0 +1,100 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, iXsystems, Inc.
+#
+
+# DESCRIPTION:
+#	Verify zfs rewrite -P flag correctly preserves logical birth times.
+#
+# STRATEGY:
+#	1. Create a test file and sync it.
+#	2. Create a snapshot to capture the original birth time.
+#	3. Test default rewrite behavior (updates logical birth time).
+#	4. Test -P flag behavior (preserves logical birth time).
+#	5. Verify incremental send behavior difference.
+
+. $STF_SUITE/include/libtest.shlib
+
+typeset tmp=$(mktemp)
+typeset send_default=$(mktemp)
+typeset send_physical=$(mktemp)
+
+function cleanup
+{
+	rm -rf $tmp $send_default $send_physical $TESTDIR/*
+	zfs destroy -R $TESTPOOL/$TESTFS@snap1 2>/dev/null || true
+	zfs destroy -R $TESTPOOL/$TESTFS@snap2 2>/dev/null || true
+	zfs destroy -R $TESTPOOL/$TESTFS@snap3 2>/dev/null || true
+}
+
+log_assert "zfs rewrite -P flag correctly preserves logical birth times"
+
+log_onexit cleanup
+
+log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
+
+# Create test file and initial snapshot
+log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=128k count=4
+log_must sync_pool $TESTPOOL
+typeset orig_hash=$(xxh128digest $TESTDIR/testfile)
+log_must zfs snapshot $TESTPOOL/$TESTFS@snap1
+
+# Test default rewrite behavior (updates logical birth time)
+log_must zfs rewrite $TESTDIR/testfile
+log_must sync_pool $TESTPOOL
+typeset default_hash=$(xxh128digest $TESTDIR/testfile)
+log_must [ "$orig_hash" = "$default_hash" ]
+log_must zfs snapshot $TESTPOOL/$TESTFS@snap2
+
+# Test incremental send size - should be large with updated birth time
+log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap2 > $send_default"
+typeset default_size=$(wc -c < $send_default)
+log_note "Default rewrite incremental send size: $default_size bytes"
+
+# Reset the file to original state
+log_must zfs rollback -r $TESTPOOL/$TESTFS@snap1
+
+# Test -P flag behavior (preserves logical birth time)
+log_must zfs rewrite -P $TESTDIR/testfile
+log_must sync_pool $TESTPOOL
+typeset physical_hash=$(xxh128digest $TESTDIR/testfile)
+log_must [ "$orig_hash" = "$physical_hash" ]
+log_must zfs snapshot $TESTPOOL/$TESTFS@snap3
+
+# Test incremental send size - should be minimal with preserved birth time
+log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap3 > $send_physical"
+typeset physical_size=$(wc -c < $send_physical)
+log_note "Physical rewrite incremental send size: $physical_size bytes"
+
+# Verify that -P flag produces smaller incremental send
+if [[ $physical_size -lt $default_size ]]; then
+	log_note "SUCCESS: -P flag produces smaller incremental send" \
+	    "($physical_size < $default_size)"
+else
+	log_fail "FAIL: -P flag should produce smaller incremental send" \
+	    "($physical_size >= $default_size)"
+fi
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 3389dcf72f8..bdf5fdf85cf 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -92,6 +92,7 @@ typeset -a properties=(
     "feature@draid"
     "feature@redaction_list_spill"
     "feature@dynamic_gang_header"
+    "feature@physical_rewrite"
 )
 
 if is_linux || is_freebsd; then

From 8302b6e32bb153a6f402b106484d5736f082abcc Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 6 Aug 2025 13:45:45 -0400
Subject: [PATCH 72/72] Some documentation polishing for log vdevs

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17592
---
 man/man4/zfs.4           | 11 ++++++-----
 man/man7/zfsprops.7      |  7 ++++---
 man/man7/zpoolconcepts.7 |  9 ++-------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index fa37c7cdb9e..4a5f9fd93f4 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1723,9 +1723,10 @@ storing all written data into ZIL to not depend on regular vdev latency.
 .
 .It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int
 When enabled, and written blocks go to normal vdevs, treat present special
-vdevs as SLOGs, storing all synchronously written data into ZIL directly.
-Disabling this forces the indirect writes to preserve special vdev write
-throughput and endurance, likely at the cost of normal vdev latency.
+vdevs as SLOGs.
+Blocks that go to the special vdevs are still written indirectly, as with
+.Sy logbias Ns = Ns Sy throughput .
+This parameter is ignored if an SLOG is present.
 .
 .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64
 Pattern written to vdev free space by
@@ -2486,8 +2487,8 @@ code for this record type.
 The tunable has no effect if the feature is disabled.
 .
 .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint
-Usually, one metaslab from each normal-class vdev is dedicated for use by
-the ZIL to log synchronous writes.
+Usually, one metaslab from each normal and special class vdev is dedicated
+for use by the ZIL to log synchronous writes.
 However, if there are fewer than
 .Sy zfs_embedded_slog_min_ms
 metaslabs in the vdev, this functionality is disabled.
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index 829eb420527..ac3152cb5d5 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -1867,7 +1867,8 @@ property is updated with
 , the property is set to desired value, but the operation to share, reshare
 or unshare the the dataset is not performed.
 .It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput
-Provide a hint to ZFS about handling of synchronous requests in this dataset.
+Provide a hint to ZFS about handling of synchronous write requests in this
+dataset.
 If
 .Sy logbias
 is set to
@@ -1875,12 +1876,12 @@ is set to
 .Pq the default ,
 ZFS will use pool log devices
 .Pq if configured
-to handle the requests at low latency.
+to handle the write requests at low latency.
 If
 .Sy logbias
 is set to
 .Sy throughput ,
-ZFS will not use configured pool log devices.
+ZFS will not use configured pool log devices to store written data.
 ZFS will instead optimize synchronous operations for global pool throughput and
 efficient use of resources.
 .It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7
index 60f16269a0a..dafe3bffc45 100644
--- a/man/man7/zpoolconcepts.7
+++ b/man/man7/zpoolconcepts.7
@@ -390,11 +390,6 @@ Multiple log devices can also be specified, and they can be mirrored.
 See the
 .Sx EXAMPLES
 section for an example of mirroring multiple log devices.
-.Pp
-Log devices can be added, replaced, attached, detached, and removed.
-In addition, log devices are imported and exported as part of the pool
-that contains them.
-Mirrored devices can be removed by specifying the top-level mirror vdev.
 .
 .Ss Cache Devices
 Devices can be added to a storage pool as
@@ -486,8 +481,8 @@ current state of the pool won't be scanned during a scrub.
 .
 .Ss Special Allocation Class
 Allocations in the special class are dedicated to specific block types.
-By default, this includes all metadata, the indirect blocks of user data, and
-any deduplication tables.
+By default, this includes all metadata, the indirect blocks of user data,
+intent log (in the absence of a separate log device), and deduplication tables.
 The class can also be provisioned to accept small file blocks or zvol blocks
 on a per dataset granularity.
 .Pp